# rapid7-insightvm-data-exporter/insightVM-exporter.py

#!/usr/bin/env python

import requests
from urllib3.exceptions import InsecureRequestWarning
import json
import pandas as pd
import argparse
import unicodedata

# Root of the InsightVM REST API. Every endpoint path used in this script is
# appended to it; replace the <YOUR_INSIGHTVM_HOST> placeholder before running.
base_url: str = "https://<YOUR_INSIGHTVM_HOST>:3780/api/3/"

# Feature switches read by the export functions.
tag_assets: bool = True  # lookup the tags for each asset
enrich_asset_vulns: bool = True  # lookup the vulns for each asset

# Module-level data tables. The export functions rebind assets_df and
# vulns_df through `global` once their data has been collected.
assets_df: pd.DataFrame = pd.DataFrame()
vulns_df: pd.DataFrame = pd.DataFrame()
vuln_extra_df: pd.DataFrame = pd.DataFrame()


class bcolors:
    """
    ANSI escape sequences for colored and styled console output.

    Typical use is ``bcolors.<STYLE> + text + bcolors.ENDC``; several styles
    can be concatenated in front of the text.
    """
    # reset sequence that ends any styling started before it
    ENDC = "\x1b[0m"

    # text attributes
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"

    # foreground colors, named after their role in the output
    FAIL = "\x1b[91m"
    OKGREEN = "\x1b[92m"
    WARNING = "\x1b[93m"
    OKBLUE = "\x1b[94m"
    HEADER = "\x1b[95m"


def _create_api_session(username: str, password: str):
    """
    Helper function to create a session with the Rapid7 InsightVM API

    The session carries the credentials as its ``auth`` tuple, so they are
    sent with every request made through it. Certificate verification is
    turned off on the session (see the security note below).

    :param username: user name stored in the session's auth tuple
    :param password: password stored in the session's auth tuple
    :return: the configured requests.Session
    """
    # Verification is disabled below, so silence the warning urllib3 would
    # otherwise print for every single request.
    requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

    try:
        session = requests.Session()
        session.auth = (username, password)
        # NOTE(security): verify=False accepts ANY certificate, which is what
        # makes self-signed console certificates work but also removes the
        # protection against man-in-the-middle attacks. Point session.verify
        # at a CA bundle instead when one is available.
        session.verify = False
    except Exception as e:
        print("Error: " + str(e))
        # SystemExit instead of exit(): exit() is a helper injected by the
        # site module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)

    return session


def _initiate_pagination(session: requests.Session, api_endpoint: str):
    """
    Helper function to initiate pagination for the Rapid7 InsightVM API

    Requests ``api_endpoint`` with a page size of 200 and reads the total
    number of pages from the ``page.totalPages`` field of the JSON response.
    Every failure (non-200 status, unparsable body, missing paging data) is
    printed and ends the script with exit status 1.

    :param session: session used to send the request
    :param api_endpoint: endpoint path relative to ``base_url``, e.g. 'assets'
    :return: tuple ``(page_limit, response, api_url)``: the total number of
             pages, the response of this first request, and the requested URL
             including its ``size`` query parameter
    """
    # Set the URL for the endpoint
    api_url = base_url + api_endpoint + '?size=200'

    response = session.get(api_url)

    # Stop here on a failed request: there is no page count to hand back, and
    # falling through to the return would fail with an UnboundLocalError on
    # page_limit instead of reporting the real problem.
    if response.status_code != 200:
        print('Error: ' + str(response.status_code))
        raise SystemExit(1)

    try:
        setup_data = json.loads(response.text)
        # Explicit check rather than `assert`, which is stripped by python -O.
        if 'page' not in setup_data:
            print("Error: The API endpoint does not support pagination")
            raise SystemExit(1)
        page_limit = setup_data["page"]["totalPages"]
    except Exception as e:
        # SystemExit is not an Exception subclass, so the exit above passes
        # through this handler untouched.
        print("Error: " + str(e))
        raise SystemExit(1)

    return page_limit, response, api_url


def _lookup_asset_tags(session: requests.Session, asset_id):
    """
    Helper function to fetch the tag names of a single asset
    :param session: session used to send the request
    :param asset_id: id of the asset whose tags are requested
    :return: list of tag names; ``['untagged']`` when the asset has no tags,
             an empty list when the lookup returned a non-200 status
    """
    tags_url = base_url + 'assets/{id}/tags'.format(id=asset_id)
    tag_response = session.get(tags_url)

    if tag_response.status_code != 200:
        # Report the failure but keep the export going; the empty list keeps
        # the tags column a string for every asset.
        print("Error: " + str(tag_response.status_code))
        return []

    try:
        tag_data = json.loads(tag_response.text)
        if 'resources' not in tag_data:
            print("resources or name key not found in response")
            raise SystemExit(1)
        tag_names = [str(tag['name']) for tag in tag_data['resources'] if 'name' in tag]
    except Exception as e:
        print("Error: " + str(e))
        raise SystemExit(1)

    return tag_names or ["untagged"]


def create_rapid7_asset_list(session: requests.Session):
    """
    Function to create a list of assets from the Rapid7 InsightVM API

    Walks all pages of the ``assets`` endpoint, optionally adds a ``tags``
    column (the string form of the list of tag names) when ``tag_assets`` is
    set, stores the combined table in the global ``assets_df`` and writes it
    to ``asset_list.csv`` and ``asset_list.json``.

    :param session: session used for all API requests
    :return: None
    """
    global assets_df

    print("Retrieving asset data from Rapid7 InsightVM")

    # The initial request already returned page 0, so it is reused below
    # instead of being requested a second time.
    page_limit, response, asset_url = _initiate_pagination(session, 'assets')

    # one DataFrame per retrieved page
    asset_data = []

    print("Retrieving asset list...")

    # max(..., 1): the first response is processed even when the API reports
    # zero pages, which yields an empty table instead of an error.
    for page_number in range(max(page_limit, 1)):
        if page_number > 0:
            # The API selects a page through the zero-based `page` parameter;
            # requests appends it to the `size` already present in asset_url.
            response = session.get(asset_url, params={'page': page_number})

        if response.status_code != 200:
            # Leave the loop: retrying the same page forever helps nobody.
            print('An error occurred:')
            print(response.status_code)
            break

        try:
            data = json.loads(response.text)
            if 'resources' not in data:
                print("resources key not found in response")
                raise SystemExit(1)
            df = pd.json_normalize(data["resources"])
        except Exception as e:
            print("Error: " + str(e))
            raise SystemExit(1)

        # An empty page has no 'id' column and therefore nothing to tag.
        if tag_assets and 'id' in df.columns:
            if 'hostName' in df.columns:
                host_names = df['hostName'].tolist()
            else:
                host_names = [None] * len(df)

            page_tags = []
            for asset_id, host_name in zip(df['id'].tolist(), host_names):
                print("\t\tLooking up tags for host {hostname}".format(hostname=host_name))
                # stored as the string form of the list of tag names
                page_tags.append(str(_lookup_asset_tags(session, asset_id)))
            df['tags'] = page_tags

            print("\tAsset tags retrieved")

        asset_data.append(df)

    print("Building asset list...")
    # pd.concat raises on an empty list, so fall back to an empty table.
    assets_df = pd.concat(asset_data, ignore_index=True) if asset_data else pd.DataFrame()

    print("Writing asset list to file...")
    name = "asset_list"
    assets_df.to_csv(name + '.csv', index=False)
    assets_df.to_json(name + '.json', orient='records', lines=True)


def _enrich_vuln_data(session: requests.Session, vuln_id: str):
    """
    Helper function to enrich the vulnerability data with additional information

    Requests ``vulnerabilities/<vuln_id>`` and flattens the JSON answer with
    ``pandas.json_normalize``.

    :param session: session used to send the request
    :param vuln_id: id of the vulnerability to look up
    :return: DataFrame built from the response, or None when the API answered
             with a status other than 200 (the status is printed)
    """
    # https://help.rapid7.com/insightvm/en-us/api/index.html#operation/getVulnerability

    current_vuln_url = base_url + "vulnerabilities/{id}".format(id=vuln_id)
    response = session.get(current_vuln_url)

    if response.status_code != 200:
        print('An error occurred:')
        print(response.status_code)
        # Explicit so callers can see that a failed lookup yields None.
        return None

    try:
        data = json.loads(response.text)
        # Explicit check rather than `assert`, which is stripped by python -O.
        if 'cvss' not in data:
            print("cvss key not found in response")
            raise SystemExit(1)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the exit above passes
        # through this handler untouched.
        print("Error: " + str(e))
        raise SystemExit(1)

    return pd.json_normalize(data)


def create_rapid7_vuln_asset_list(session: requests.Session):
    """
    Function to create a list of vulnerabilities from the Rapid7 InsightVM API

    For every asset in the global ``assets_df`` all pages of
    ``assets/<id>/vulnerabilities`` are retrieved. When ``enrich_asset_vulns``
    is set, each vulnerability row is extended with the details returned by
    ``_enrich_vuln_data`` and with the asset's tags. The combined table is
    stored in the global ``vulns_df`` and written to ``vulns_list.csv`` and
    ``vulns_list.json``.

    :param session: session used for all API requests
    :return: None
    """
    # https://help.rapid7.com/insightvm/en-us/api/index.html#operation/getAssetVulnerabilities

    global vulns_df

    # one DataFrame per retrieved page; combined into vulns_df at the end
    vuln_data = []

    # detail columns that duplicate the asset-level columns or carry html
    dropped_columns = ['id', 'links', 'added', 'modified', 'published', 'description.html']

    # The same vulnerability usually shows up on many assets; remember the
    # looked-up details so each id is requested only once.
    details_by_vuln_id = {}

    # create the lists of asset ids, names and tags for the requests
    try:
        asset_ids = assets_df['id'].tolist()
        # Tolerate missing optional columns: 'tags' is only created when
        # asset tagging is enabled.
        if 'hostName' in assets_df.columns:
            asset_names = assets_df['hostName'].tolist()
        else:
            asset_names = [None] * len(asset_ids)
        if 'tags' in assets_df.columns:
            asset_tags = assets_df['tags'].tolist()
        else:
            asset_tags = [None] * len(asset_ids)
    except Exception as e:
        print("Error: " + str(e))
        raise SystemExit(1)

    if not asset_ids:
        print("Error: No assets found")
        raise SystemExit(1)

    # loop over the assets and maintain an index i
    for i, current_asset in enumerate(asset_ids):
        current_hostname = asset_names[i]
        current_asset_tags = asset_tags[i]
        endpoint = "assets/{id}/vulnerabilities".format(id=current_asset)

        # The initial request already returned page 0; it is reused below.
        page_limit, response, current_asset_url = _initiate_pagination(session, endpoint)

        for page_number in range(max(page_limit, 1)):
            if page_number > 0:
                # The API selects a page through the zero-based `page`
                # parameter; requests appends it to the `size` already
                # present in current_asset_url.
                response = session.get(current_asset_url, params={'page': page_number})

            if response.status_code != 200:
                # Give up on this asset's remaining pages instead of retrying
                # the failed request forever.
                print('An error occurred:')
                print(response.status_code)
                break

            print("Asset {} of {}".format(i + 1, len(asset_ids)))

            try:
                data = json.loads(response.text)
                if 'resources' not in data:
                    print("resources key not found in response")
                    raise SystemExit(1)
                df = pd.json_normalize(data["resources"])
            except Exception as e:
                print("Error: " + str(e))
                raise SystemExit(1)

            # An asset without findings is a normal result, not a reason to
            # abort the whole export.
            if 'id' not in df.columns:
                print("\tNo vulnerabilities found for asset: {}".format(current_hostname))
                continue

            df = df.assign(assetHostname=current_hostname)

            if enrich_asset_vulns:
                # format() instead of '+': hostname and tags may be missing
                # values rather than strings.
                print("\tEnriching vulnerability data for asset: {}".format(current_hostname))
                print("\tTags: {}".format(current_asset_tags))
                print("\t\t... processing page {} of {}".format(page_number + 1, page_limit))

                # Collect one detail row per vulnerability, in the same order
                # as the rows of df, so they can be joined column-wise.
                detail_rows = []
                for vuln_id in df["id"].tolist():
                    if vuln_id not in details_by_vuln_id:
                        details = _enrich_vuln_data(session, vuln_id)
                        if details is None:
                            # failed lookup: an empty one-row frame keeps the
                            # detail rows aligned with the rows of df
                            details = pd.DataFrame(index=[0])
                        # we cannot have duplicate column names
                        # we also drop html content in the tables
                        details_by_vuln_id[vuln_id] = details.drop(
                            columns=dropped_columns, errors='ignore')
                    detail_rows.append(details_by_vuln_id[vuln_id])

                vuln_details = pd.concat(detail_rows, ignore_index=True)
                vuln_details['tags'] = current_asset_tags
                df = pd.concat([df.reset_index(drop=True), vuln_details], axis=1)

            vuln_data.append(df)

        print("\tData retrieved from asset {}\n".format(current_hostname))

    print("Building vulnerability list...")
    # pd.concat raises on an empty list, so fall back to an empty table.
    vulns_df = pd.concat(vuln_data, ignore_index=True) if vuln_data else pd.DataFrame()

    print("Saving vulnerability data to file...")
    name = "vulns_list"
    vulns_df.to_csv(name + ".csv", index=False)
    vulns_df.to_json(name + ".json", orient='records', lines=True)


if __name__ == '__main__':
    # Script entry point: read the credentials from the command line, export
    # the asset list and then the vulnerabilities of every asset.

    # create the args parser
    parser = argparse.ArgumentParser()
    parser.add_argument("--username", required=True)
    # NOTE(review): a password given on the command line can be visible to
    # other local users (process list, shell history).
    parser.add_argument("--password", required=True)
    args = parser.parse_args()

    print(bcolors.BOLD + bcolors.OKBLUE + "! Starting Rapid7 InsightVM API script !" + bcolors.ENDC)
    print(bcolors.WARNING + "Currently only tested for less than 200 assets" + bcolors.ENDC)
    print()

    try:
        # Bring both values into Unicode normalization form NFKD. This only
        # decomposes characters; it does NOT convert them to ASCII.
        username = unicodedata.normalize("NFKD", args.username)
        password = unicodedata.normalize("NFKD", args.password)
    except UnicodeError:
        # handle UnicodeError exception
        print("Invalid Unicode string in username or password")
        raise SystemExit(1)
    except Exception as e:
        print("Error: " + str(e))
        raise SystemExit(1)

    # Bound before the try block so the cleanup below never sees an undefined
    # name when creating the session itself terminates the script.
    session = None
    try:
        session = _create_api_session(username, password)
        del password
        create_rapid7_asset_list(session)
        create_rapid7_vuln_asset_list(session)
    except Exception as e:
        print("Error: " + str(e))
        # report the failure to the calling shell like every other error path
        raise SystemExit(1)
    finally:
        # close the session
        if session is not None:
            session.close()