diff --git a/.flake8 b/.flake8 index 92ff826..2a02950 100644 --- a/.flake8 +++ b/.flake8 @@ -22,4 +22,9 @@ select = C,D,E,F,W,B,B950 # operators. It no longer agrees with PEP8. See, for example, here: # https://github.com/ambv/black/issues/21. Guido agrees here: # https://github.com/python/peps/commit/c59c4376ad233a62ca4b3a6060c81368bd21e85b. -ignore = E501,W503 +# +# Also ignore flake8's error about whitespaces before a ':'. (E203) +# No longer complies with PEP8 See example, here: +# https://github.com/PyCQA/pycodestyle/issues/373 +# and here: https://github.com/psf/black/issues/315 +ignore = E203,E501,W503 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9ea780d..37cf8bb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -111,14 +111,10 @@ jobs: os: - ubuntu-latest python-version: - - "3.7" - "3.8" - "3.9" - "3.10" - "3.11" - include: - - os: ubuntu-20.04 - python-version: "3.6" steps: - uses: actions/checkout@v3 - id: setup-python @@ -207,14 +203,10 @@ jobs: os: - ubuntu-latest python-version: - - "3.7" - "3.8" - "3.9" - "3.10" - "3.11" - include: - - os: ubuntu-20.04 - python-version: "3.6" steps: - uses: actions/checkout@v3 - id: setup-python @@ -260,14 +252,10 @@ jobs: os: - ubuntu-latest python-version: - - "3.7" - "3.8" - "3.9" - "3.10" - "3.11" - include: - - os: ubuntu-20.04 - python-version: "3.6" steps: - uses: actions/checkout@v3 - id: setup-python diff --git a/.gitignore b/.gitignore index 242b4aa..d1b854a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,13 @@ # Files already tracked by Git are not affected. # See: https://git-scm.com/docs/gitignore +## macOS ## +.DS_Store + +## Project Specific ## +pe_reports_logging.log +src/pe_source/data/dnstwist_output.txt + ## Python ## __pycache__ .coverage @@ -10,3 +17,6 @@ __pycache__ .python-version *.egg-info dist + +## IDE ## +.vscode diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 033389b..36a91aa 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -116,6 +116,19 @@ repos: hooks: - id: mypy additional_dependencies: + - boto3-stubs + - celery-types + - pandas-stubs + - types-chevron + - types-colorama + - types-docopt + - types-Flask-Migrate + - types-psycopg2 + - types-Pygments + - types-PyYAML + - types-python-dateutil==2.8.19 + - types-requests + - types-retry - types-setuptools - repo: https://github.com/asottile/pyupgrade rev: v3.3.1 @@ -124,7 +137,7 @@ repos: # Ansible hooks - repo: https://github.com/ansible-community/ansible-lint - rev: v5.4.0 + rev: v6.17.2 hooks: - id: ansible-lint # files: molecule/default/playbook.yml diff --git a/README.md b/README.md index 0456754..26e7709 100644 --- a/README.md +++ b/README.md @@ -5,20 +5,65 @@ [![Coverage Status](https://coveralls.io/repos/github/cisagov/pe-source/badge.svg?branch=develop)](https://coveralls.io/github/cisagov/pe-source?branch=develop) [![Known Vulnerabilities](https://snyk.io/test/github/cisagov/pe-source/develop/badge.svg)](https://snyk.io/test/github/cisagov/pe-source) -This is a generic skeleton project that can be used to quickly get a -new [cisagov](https://github.com/cisagov) Python library GitHub -project started. This skeleton project contains [licensing -information](LICENSE), as well as -[pre-commit hooks](https://pre-commit.com) and -[GitHub Actions](https://github.com/features/actions) configurations -appropriate for a Python library project. 
- -## New Repositories from a Skeleton ## - -Please see our [Project Setup guide](https://github.com/cisagov/development-guide/tree/develop/project_setup) -for step-by-step instructions on how to start a new repository from -a skeleton. This will save you time and effort when configuring a -new repository! +This package is used to gather and store data for the CISA +[Posture & Exposure Reports](https://github.com/cisagov/pe-reports). + +Data of interest include *Exposed Credentials, Domain Masquerading, Malware, +Inferred Vulnerabilities, and the Dark Web*. The data collected for the reports +is gathered on the 1st and 15th of each month. + +## Requirements ## + +- [Python Environment](CONTRIBUTING.md#creating-the-python-virtual-environment) + +## Installation ## + +- `git clone https://github.com/cisagov/pe-source.git` +- Add database/API credentials to `src/pe_source/data/pe_db/database.ini` +- `pip install -e .` + +## Run P&E Source ## + +```console +Usage: + pe-source DATA_SOURCE [--log-level=LEVEL] [--orgs=ORG_LIST] [--cybersix-methods=METHODS] [--soc_med_included] + +Arguments: + DATA_SOURCE Source to collect data from. Valid values are "cybersixgill", + "dnstwist", "hibp", "intelx", and "shodan". + +Options: + -h --help Show this message. + -v --version Show version information. + -l --log-level=LEVEL If specified, then the log level will be set to + the specified value. Valid values are "debug", "info", + "warning", "error", and "critical". [default: info] + -o --orgs=ORG_LIST A comma-separated list of orgs to collect data for. + If not specified, data will be collected for all + orgs in the pe database. Orgs in the list must match the + IDs in the cyhy-db. E.g. DHS,DHS_ICE,DOC + [default: all] + -c --cybersix-methods=METHODS A comma-separated list of Cybersixgill methods to run. + If not specified, all will run. Valid values are "alerts", + "credentials", "mentions", "topCVEs". E.g. alerts,mentions. + [default: all] + -s --soc_med_included Include social media posts from cybersixgill in data collection. + +``` + +## Examples ## + +Run shodan on DHS and DOT: + +```console +pe-source shodan --orgs=DHS,DOT +``` + +Run Cybersixgill mentions on DHS and include social media data: + +```console +pe-source cybersixgill --cybersix-methods=mentions --orgs=DHS --soc_med_included +``` ## Contributing ## diff --git a/setup.py b/setup.py index ad05d0d..8a41b3c 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ """ -This is the setup module for the example project. +This is the setup module for the pe-source project. Based on: @@ -42,10 +42,10 @@ def get_version(version_file): setup( - name="example", + name="pe_source", # Versions should comply with PEP440 - version=get_version("src/example/_version.py"), - description="Example Python library", + version=get_version("src/pe_source/_version.py"), + description="Posture and Exposure Source Library", long_description=readme(), long_description_content_type="text/markdown", # Landing page for CISA's cybersecurity mission @@ -74,9 +74,6 @@ def get_version(version_file): # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. 
"Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -85,13 +82,33 @@ def get_version(version_file): ], python_requires=">=3.6", # What does your project relate to? - keywords="skeleton", + keywords="posture exposure source", packages=find_packages(where="src"), package_dir={"": "src"}, - package_data={"example": ["data/*.txt"]}, + package_data={ + "pe_source": [ + "data/*", + "data/dnsmonitor/*", + "data/pe_db/*", + "data/shodan/*", + "data/sixgill/*", + ], + }, py_modules=[splitext(basename(path))[0] for path in glob("src/*.py")], include_package_data=True, - install_requires=["docopt", "schema", "setuptools >= 24.2.0"], + install_requires=[ + "click", + "docopt", + "dnstwist", + "dshield", + "dnspython == 2.2.1", + "importlib_resources == 5.4.0", + "pandas == 1.5.1", + "psycopg2-binary == 2.9.3", + "retry == 0.9.2", + "schema == 0.7.5", + "shodan == 1.27.0", + ], extras_require={ "test": [ "coverage", @@ -107,6 +124,10 @@ def get_version(version_file): "pytest", ] }, - # Conveniently allows one to run the CLI tool as `example` - entry_points={"console_scripts": ["example = example.example:main"]}, + # Conveniently allows one to run the CLI tool as `pe-source' + entry_points={ + "console_scripts": [ + "pe-source = pe_source.pe_source:main", + ] + }, ) diff --git a/src/example/__init__.py b/src/example/__init__.py deleted file mode 100644 index 98b5e04..0000000 --- a/src/example/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -"""The example library.""" -# We disable a Flake8 check for "Module imported but unused (F401)" here because -# although this import is not directly used, it populates the value -# package_name.__version__, which is used to get version information about this -# Python package. -from ._version import __version__ # noqa: F401 -from .example import example_div - -__all__ = ["example_div"] diff --git a/src/example/data/secret.txt b/src/example/data/secret.txt deleted file mode 100644 index c40a49b..0000000 --- a/src/example/data/secret.txt +++ /dev/null @@ -1 +0,0 @@ -Three may keep a secret, if two of them are dead. diff --git a/src/example/example.py b/src/example/example.py deleted file mode 100644 index d3eda19..0000000 --- a/src/example/example.py +++ /dev/null @@ -1,103 +0,0 @@ -"""example is an example Python library and tool. - -Divide one integer by another and log the result. Also log some information -from an environment variable and a package resource. - -EXIT STATUS - This utility exits with one of the following values: - 0 Calculation completed successfully. - >0 An error occurred. - -Usage: - example [--log-level=LEVEL] - example (-h | --help) - -Options: - -h --help Show this message. - --log-level=LEVEL If specified, then the log level will be set to - the specified value. Valid values are "debug", "info", - "warning", "error", and "critical". [default: info] -""" - -# Standard Python Libraries -import logging -import os -import sys -from typing import Any, Dict - -# Third-Party Libraries -import docopt -import pkg_resources -from schema import And, Schema, SchemaError, Use - -from ._version import __version__ - -DEFAULT_ECHO_MESSAGE: str = "Hello World from the example default!" 
- - -def example_div(dividend: int, divisor: int) -> float: - """Print some logging messages.""" - logging.debug("This is a debug message") - logging.info("This is an info message") - logging.warning("This is a warning message") - logging.error("This is an error message") - logging.critical("This is a critical message") - return dividend / divisor - - -def main() -> None: - """Set up logging and call the example function.""" - args: Dict[str, str] = docopt.docopt(__doc__, version=__version__) - # Validate and convert arguments as needed - schema: Schema = Schema( - { - "--log-level": And( - str, - Use(str.lower), - lambda n: n in ("debug", "info", "warning", "error", "critical"), - error="Possible values for --log-level are " - + "debug, info, warning, error, and critical.", - ), - "": Use(int, error=" must be an integer."), - "": And( - Use(int), - lambda n: n != 0, - error=" must be an integer that is not 0.", - ), - str: object, # Don't care about other keys, if any - } - ) - - try: - validated_args: Dict[str, Any] = schema.validate(args) - except SchemaError as err: - # Exit because one or more of the arguments were invalid - print(err, file=sys.stderr) - sys.exit(1) - - # Assign validated arguments to variables - dividend: int = validated_args[""] - divisor: int = validated_args[""] - log_level: str = validated_args["--log-level"] - - # Set up logging - logging.basicConfig( - format="%(asctime)-15s %(levelname)s %(message)s", level=log_level.upper() - ) - - logging.info("%d / %d == %f", dividend, divisor, example_div(dividend, divisor)) - - # Access some data from an environment variable - message: str = os.getenv("ECHO_MESSAGE", DEFAULT_ECHO_MESSAGE) - logging.info('ECHO_MESSAGE="%s"', message) - - # Access some data from our package data (see the setup.py) - secret_message: str = ( - pkg_resources.resource_string("example", "data/secret.txt") - .decode("utf-8") - .strip() - ) - logging.info('Secret="%s"', secret_message) - - # Stop logging and clean up - logging.shutdown() diff --git a/src/pe_source/__init__.py b/src/pe_source/__init__.py new file mode 100644 index 0000000..b20f85d --- /dev/null +++ b/src/pe_source/__init__.py @@ -0,0 +1,28 @@ +"""The pe_source library.""" +# Standard Python Libraries +import logging + +# We disable a Flake8 check for "Module imported but unused (F401)" here because +# although this import is not directly used, it populates the value +# package_name.__version__, which is used to get version information about this +# Python package. 
+from ._version import __version__ # noqa: F401 + +__all__ = ["cybersixgill", "shodan"] +CENTRAL_LOGGING_FILE = "pe_reports_logging.log" +DEBUG = False + +# Setup Logging +"""Set up logging.""" +if DEBUG is True: + level = "DEBUG" +else: + level = "INFO" + +logging.basicConfig( + filename=CENTRAL_LOGGING_FILE, + filemode="a", + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%m/%d/%Y %I:%M:%S", + level=level, +) diff --git a/src/example/__main__.py b/src/pe_source/__main__.py similarity index 71% rename from src/example/__main__.py rename to src/pe_source/__main__.py index 11a3238..4dc4ea1 100644 --- a/src/example/__main__.py +++ b/src/pe_source/__main__.py @@ -1,5 +1,5 @@ """Code to run if this package is used as a Python module.""" -from .example import main +from .pe_source import main main() diff --git a/src/example/_version.py b/src/pe_source/_version.py similarity index 70% rename from src/example/_version.py rename to src/pe_source/_version.py index 6baaa6f..f5df3e9 100644 --- a/src/example/_version.py +++ b/src/pe_source/_version.py @@ -1,2 +1,2 @@ """This file defines the version of this module.""" -__version__ = "0.2.0" +__version__ = "1.0.1" diff --git a/src/pe_source/cybersixgill.py b/src/pe_source/cybersixgill.py new file mode 100644 index 0000000..bae366c --- /dev/null +++ b/src/pe_source/cybersixgill.py @@ -0,0 +1,405 @@ +"""Collect Cybersixgill data.""" + +# Standard Python Libraries +from datetime import date, datetime, timedelta +import logging +import sys +import traceback + +from .data.pe_db.db_query_source import ( + get_breaches, + get_data_source_uid, + get_orgs, + insert_sixgill_alerts, + insert_sixgill_breaches, + insert_sixgill_credentials, + insert_sixgill_mentions, + insert_sixgill_topCVEs, +) +from .data.sixgill.api import get_sixgill_organizations +from .data.sixgill.source import ( + alerts, + alias_organization, + all_assets_list, + creds, + cve_summary, + get_alerts_content, + mentions, + root_domains, + top_cves, +) + +# Set todays date and the start_date 30 days prior +TODAY = date.today() +DAYS_BACK = timedelta(days=30) +MENTIONS_DAYS_BACK = timedelta(days=16) +MENTIONS_START_DATE = str(TODAY - MENTIONS_DAYS_BACK) +END_DATE = str(TODAY) +DATE_SPAN = f"[{MENTIONS_START_DATE} TO {END_DATE}]" + +# Set dates to YYYY-MM-DD H:M:S format +NOW = datetime.now() +START_DATE_TIME = (NOW - DAYS_BACK).strftime("%Y-%m-%d %H:%M:%S") +END_DATE_TIME = NOW.strftime("%Y-%m-%d %H:%M:%S") + +LOGGER = logging.getLogger(__name__) + + +class Cybersixgill: + """Fetch Cybersixgill data.""" + + def __init__(self, orgs_list, method_list, soc_med_included): + """Initialize Cybersixgill class.""" + self.orgs_list = orgs_list + self.method_list = method_list + self.soc_med_included = soc_med_included + + def run_cybersixgill(self): + """Run Cybersixgill api calls.""" + orgs_list = self.orgs_list + method_list = self.method_list + soc_med_included = self.soc_med_included + + # Get org info from PE database + pe_orgs = get_orgs() + # Get Cybersixgill org info + sixgill_orgs = get_sixgill_organizations() + failed = [] + count = 0 + + # Get data source uid + source_uid = get_data_source_uid("Cybersixgill") + + # Run top CVEs. 
Same for all orgs + if "topCVEs" in method_list: + if self.get_topCVEs(source_uid) == 1: + failed.append("Top CVEs") + + LOGGER.info(",".join([org["cyhy_db_name"] for org in pe_orgs])) + for pe_org in pe_orgs: + org_id = pe_org["cyhy_db_name"] + pe_org_uid = pe_org["org_uid"] + # Only run on specified orgs + if org_id in orgs_list or orgs_list == "all": + count += 1 + # Get sixgill_org_id associated with the PE org + try: + # Cybersixgill lists their ID as the 6th entry in the org list + sixgill_org_id = sixgill_orgs[org_id][5] + except KeyError as err: + LOGGER.error("PE org is not listed in Cybersixgill.") + print(err, file=sys.stderr) + failed.append("%s not in sixgill" % org_id) + continue + + # Run alerts + if "alerts" in method_list: + if ( + self.get_alerts( + org_id, + sixgill_org_id, + pe_org_uid, + source_uid, + soc_med_included, + ) + == 1 + ): + failed.append("%s alerts" % org_id) + # Run mentions + if "mentions" in method_list: + if ( + self.get_mentions( + org_id, + sixgill_org_id, + pe_org_uid, + source_uid, + soc_med_included, + ) + == 1 + ): + failed.append("%s mentions" % org_id) + # Run credentials + if "credentials" in method_list: + if ( + self.get_credentials( + org_id, sixgill_org_id, pe_org_uid, source_uid + ) + == 1 + ): + failed.append("%s credentials" % org_id) + if len(failed) > 0: + LOGGER.error("Failures: %s", failed) + + def get_alerts( + self, org_id, sixgill_org_id, pe_org_uid, source_uid, soc_med_included + ): + """Get alerts.""" + LOGGER.info("Fetching alert data for %s.", org_id) + soc_med_platforms = [ + "discord", + "forum_discord", + "icq", + "ICQ", + "jabber", + "linkedin", + "Linkedin", + "mastodon", + "parler", + "Parler", + "raddle", + "reddit", + "Reddit", + "telegram", + "twitter", + "Twitter", + ] + + # Fetch alert data with sixgill_org_id + try: + alerts_df = alerts(sixgill_org_id) + if not soc_med_included: + alerts_df = alerts_df[~alerts_df["site"].isin(soc_med_platforms)] + # Add pe_org_id + alerts_df["organizations_uid"] = pe_org_uid + # Add data source uid + alerts_df["data_source_uid"] = source_uid + # Rename columns + alerts_df = alerts_df.rename(columns={"id": "sixgill_id"}) + except Exception as e: + LOGGER.error("Failed fetching alert data for %s", org_id) + LOGGER.error(e) + print(traceback.format_exc()) + return 1 + + # Get Alert content + try: + LOGGER.info("Fetching alert content data for %s.", org_id) + # Fetch organization assets + org_assets_dict = all_assets_list(sixgill_org_id) + for alert_index, alert_row in alerts_df.iterrows(): + try: + alert_id = alert_row["sixgill_id"] + + content_snip, asset_mentioned, asset_type = get_alerts_content( + sixgill_org_id, alert_id, org_assets_dict + ) + + alerts_df.at[alert_index, "content_snip"] = content_snip + alerts_df.at[alert_index, "asset_mentioned"] = asset_mentioned + alerts_df.at[alert_index, "asset_type"] = asset_type + except Exception as e: + LOGGER.error( + "Failed fetching a specific alert content for %s", org_id + ) + LOGGER.error(e) + print(traceback.format_exc()) + alerts_df.at[alert_index, "content_snip"] = "" + alerts_df.at[alert_index, "asset_mentioned"] = "" + alerts_df.at[alert_index, "asset_type"] = "" + + except Exception as e: + LOGGER.error("Failed fetching alert content for %s", org_id) + LOGGER.error(e) + LOGGER.error(traceback.format_exc()) + return 1 + + # Insert alert data into the PE database + try: + insert_sixgill_alerts(alerts_df) + except Exception as e: + LOGGER.error("Failed inserting alert data for %s", org_id) + LOGGER.error(e) + return 1 + return 
0 + + def get_mentions( + self, org_id, sixgill_org_id, pe_org_uid, source_uid, soc_med_included + ): + """Get mentions.""" + LOGGER.info("Fetching mention data for %s.", org_id) + + # Fetch org aliases from Cybersixgill + try: + aliases = alias_organization(sixgill_org_id) + except Exception as e: + LOGGER.error("Failed fetching aliases for %s", org_id) + print(traceback.format_exc()) + LOGGER.error(e) + return 1 + + # Fetch mention data + try: + try: + mentions_df = mentions(DATE_SPAN, aliases, soc_med_included) + except UnboundLocalError: + return 1 + mentions_df = mentions_df.rename(columns={"id": "sixgill_mention_id"}) + mentions_df["organizations_uid"] = pe_org_uid + # Add data source uid + mentions_df["data_source_uid"] = source_uid + except Exception as e: + LOGGER.error("Failed fetching mentions for %s", org_id) + print(traceback.format_exc()) + LOGGER.error(e) + return 1 + + # Insert mention data into the PE database + try: + insert_sixgill_mentions(mentions_df) + except Exception as e: + LOGGER.error("Failed inserting mentions for %s", org_id) + print(traceback.format_exc()) + LOGGER.error(e) + return 1 + return 0 + + def get_credentials(self, org_id, sixgill_org_id, pe_org_uid, source_uid): + """Get credentials.""" + LOGGER.info("Fetching credential data for %s.", org_id) + + # Fetch org root domains from Cybersixgill + try: + roots = root_domains(sixgill_org_id) + LOGGER.info(f"Got roots:{roots}") + except Exception as e: + LOGGER.error("Failed fetching root domains for %s", org_id) + LOGGER.error(e) + return 1 + + # Fetch credential data + try: + creds_df = creds(roots, START_DATE_TIME, END_DATE_TIME) + LOGGER.info("Found %s credentials.", len(creds_df.index)) + creds_df["organizations_uid"] = pe_org_uid + # Add data source uid + creds_df["data_source_uid"] = source_uid + except Exception as e: + LOGGER.error("Failed fetching credentials for %s", org_id) + LOGGER.error(e) + return 1 + + if creds_df.empty: + LOGGER.error("No credentials for %s", org_id) + return 1 + + # Change empty and ambiguous breach names + try: + creds_df.loc[ + creds_df["breach_name"] == "", "breach_name" + ] = "Cybersixgill_" + creds_df["breach_id"].astype(str) + + creds_df.loc[ + creds_df["breach_name"] == "Automatic leaked credentials detection", + "breach_name", + ] = "Cybersixgill_" + creds_df["breach_id"].astype(str) + creds_breach_df = creds_df[ + [ + "breach_name", + "description", + "breach_date", + "password", + "data_source_uid", + ] + ].reset_index() + + # Create password_included column + creds_breach_df["password_included"] = creds_breach_df["password"] != "" + + # Group breaches and count the number of credentials + count_creds = creds_breach_df.groupby( + [ + "breach_name", + "description", + "breach_date", + "password_included", + "data_source_uid", + ] + ).size() + creds_breach_df = count_creds.to_frame( + name="exposed_cred_count" + ).reset_index() + creds_breach_df["modified_date"] = creds_breach_df["breach_date"] + creds_breach_df.drop_duplicates( + subset=["breach_name"], keep="first", inplace=True + ) + creds_breach_df.drop(columns=["exposed_cred_count"], inplace=True) + except Exception as e: + LOGGER.error("Probably no credential breaches for %s", org_id) + LOGGER.error(e) + return 1 + + # Insert breach data into the PE database + try: + insert_sixgill_breaches(creds_breach_df) + except Exception as e: + LOGGER.error("Failed inserting breaches for %s", org_id) + LOGGER.error(e) + return 1 + + # Get breach uids and match to credentials + breach_dict = dict(get_breaches()) + for
cred_index, cred_row in creds_df.iterrows(): + breach_uid = breach_dict[cred_row["breach_name"]] + creds_df.at[cred_index, "credential_breaches_uid"] = breach_uid + + # Insert credential data into the PE database + creds_df = creds_df.rename( + columns={"domain": "sub_domain", "breach_date": "modified_date"} + ) + creds_df = creds_df[ + [ + "modified_date", + "sub_domain", + "email", + "hash_type", + "name", + "login_id", + "password", + "phone", + "breach_name", + "organizations_uid", + "data_source_uid", + "credential_breaches_uid", + ] + ] + try: + insert_sixgill_credentials(creds_df) + except Exception as e: + LOGGER.error("Failed inserting credentials for %s", org_id) + LOGGER.error(e) + return 1 + return 0 + + def get_topCVEs(self, source_uid): + """Get top CVEs.""" + LOGGER.info("Fetching top CVE data.") + + # Fetch top CVE data + try: + top_cve_df = top_cves() + top_cve_df["date"] = END_DATE + top_cve_df["nvd_base_score"] = top_cve_df["nvd_base_score"].astype("str") + # Add data source uid + top_cve_df["data_source_uid"] = source_uid + # Get CVE summary from circl.lu + top_cve_df["summary"] = "" + for cve_index, cve_row in top_cve_df.iterrows(): + try: + resp = cve_summary(cve_row["cve_id"]) + summary = resp["summary"] + except Exception: + summary = "" + top_cve_df.at[cve_index, "summary"] = summary + except Exception as e: + LOGGER.error("Failed fetching top CVEs.") + LOGGER.error(e) + return 1 + + # Insert credential data into the PE database + try: + insert_sixgill_topCVEs(top_cve_df) + except Exception as e: + LOGGER.error("Failed inserting top CVEs.") + LOGGER.error(e) + return 1 + return 0 diff --git a/src/pe_source/data/common_tlds.dict b/src/pe_source/data/common_tlds.dict new file mode 100644 index 0000000..098de06 --- /dev/null +++ b/src/pe_source/data/common_tlds.dict @@ -0,0 +1,369 @@ +ac +academy +ad +ae +aero +af +africa +ag +agency +ai +al +am +amsterdam +ao +app +ar +art +as +asia +at +au +audio +az +ba +bank +bar +bd +be +beer +bet +bf +bg +bh +bi +bid +bike +bio +biz +blog +blue +bm +bn +bo +br +bt +buzz +bw +by +bz +bzh +ca +cafe +cam +camp +capital +care +careers +cash +cat +cc +cd +center +cf +ch +chat +church +ci +city +cl +click +cloud +club +cm +cn +co +codes +coffee +com +community +company +cool +coop +cr +cu +cx +cy +cz +date +de +deals +design +digital +direct +directory +dk +do +dz +earth +ec +edu +education +ee +eg +email +energy +es +et +eu +eus +events +exchange +expert +farm +fi +film +fit +fj +fm +fo +foundation +fr +fund +fyi +ga +gal +gallery +games +gdn +ge +gg +gh +gi +gl +global +gov +gq +gr +gratis +group +gs +gt +guide +guru +help +hk +hn +host +house +hr +ht +hu +icu +id +ie +il +im +in +info +ink +int +international +io +iq +ir +is +it +je +jm +jo +jobs +jp +ke +kg +kh +kim +kr +kw +ky +kz +la +land +law +lb +li +life +link +live +lk +loan +lol +love +lt +ltd +lu +lv +ly +ma +market +marketing +mc +md +me +media +men +mg +mil +mk +ml +mm +mn +mo +mobi +moe +money +moscow +movie +mr +ms +mt +mu +mv +mw +mx +my +mz +na +nc +net +network +news +ng +ni +ninja +nl +no +np +nu +nyc +nz +om +one +online +ooo +org +ovh +pa +party +pe +pf +pg +ph +photo +photography +photos +pics +pink +pk +pl +plus +pm +porn +pr +press +pro +ps +pt +pub +pw +py +qa +re +red +report +restaurant +reviews +ro +rs +ru +run +rw +sa +sc +school +science +scot +sd +se +services +sexy +sg +sh +shop +show +si +site +sk +sm +sn +so +social +software +solutions +space +st +store +stream +studio +style +su +support +sv +swiss +sy +systems +tc +team +tech +technology +th 
+tips +tj +tk +tl +tm +tn +to +tools +top +tr +trade +training +travel +tt +tube +tv +tw +tz +ua +ug +uk +us +uy +uz +vc +ve +ventures +video +vip +vn +wang +watch +webcam +website +wien +wiki +win +work +works +world +ws +wtf +xyz +za +zm +zone +zw +рус +рф +укр diff --git a/src/pe_source/data/database.ini b/src/pe_source/data/database.ini new file mode 100644 index 0000000..62dbc40 --- /dev/null +++ b/src/pe_source/data/database.ini @@ -0,0 +1,43 @@ +[cyhy_mongo] +host= +database= +user= +password= +port= + +[dnsmonitor] +client_id= +client_secret= + +[intelx] +api_key= + +[pe_db_password_key] +key= + +[postgres] +host= +database= +user= +password= +port= + +[shodan] +key1= +key2= +key3= +key4= + +[sixgill] +client_id= +client_secret= + +[staging] +host= +database= +user= +password= +port= + +[whoisxml] +key= diff --git a/src/pe_source/data/dnsmonitor/root_domains_dnsmonitor.csv b/src/pe_source/data/dnsmonitor/root_domains_dnsmonitor.csv new file mode 100644 index 0000000..da57512 --- /dev/null +++ b/src/pe_source/data/dnsmonitor/root_domains_dnsmonitor.csv @@ -0,0 +1,716 @@ +domain_name,org +18f.gov,General Services Administration +2020census.gov,Census +400yaahc.gov,General Services Administration +911.gov,Department of Transportation +abandonedmines.gov,Department of the Interior +accessibility.gov,General Services Administration +acf.gov,Department of Health and Human Services +acl.gov,Department of Health and Human Services +acquisition.gov,General Services Administration +acwi.gov,Department of the Interior +ada.gov,Department of Justice +afadvantage.gov,General Services Administration +afterschool.gov,Department of Health and Human Services +ag.gov,Department of Agriculture +aging.gov,Department of Health and Human Services +agingstats.gov,Department of Health and Human Services +ahcpr.gov,Department of Health and Human Services +ahrq.gov,Department of Health and Human Services +aids.gov,Department of Health and Human Services +airnow.gov,Environmental Protection Agency +alaskacenters.gov,Department of the Interior +alzeimers.gov,National Institutes of Health +alzheimers.gov,Department of Health and Human Services +amberalert.gov,Department of Justice +america.gov,Department of State +americathebeautifulquarters.gov,Department of Treasury +anstaskforce.gov,Department of the Interior +aoa.gov,Department of Health and Human Services +ap.gov,Department of Commerce +apprenticeship.gov,Department of Labor +apprenticeships.gov,Department of Labor +arctic.gov,National Science Foundation +arm.gov,Department of Energy +asap.gov,Department of Treasury +atf.gov,Department of Justice +atfonline.gov,Department of Justice +aviationweather.gov,Department of Commerce +ayudaconmibanco.gov,Department of Treasury +bankanswers.gov,Department of Treasury +bankcustomer.gov,Department of Treasury +bankcustomerassistance.gov,Department of Treasury +bankhelp.gov,Department of Treasury +banknet.gov,Department of Treasury +bats.gov,Department of Justice +bcfp.gov,Consumer Financial Protection Bureau +bea.gov,Department of Commerce +benefits.gov,Department of Labor +bep.gov,Department of Treasury +betobaccofree.gov,Department of Health and Human Services +bfem.gov,Department of Treasury +bia.gov,Department of the Interior +bioethics.gov,Department of Health and Human Services +biomassboard.gov,Department of Energy +biometrics.gov,Department of Homeland Security +biopreferred.gov,Department of Agriculture +bja.gov,Department of Justice +bjs.gov,Department of Justice +blm.gov,Department of the Interior 
+bls.gov,Department of Labor +bls.gov,U.S. Bureau of Labor Statistics +bnl.gov,Department of Energy +boem.gov,Department of the Interior +bondpro.gov,Department of Treasury +bosque.gov,Department of Agriculture +bpa.gov,Department of Energy +brainhealth.gov,Department of Health and Human Services +brainhealth.gov,National Institutes of Health +bsee.gov,Department of the Interior +btfa.gov,Department of the Interior +budgetlob.gov,Department of Education +build.gov,Executive Office of the President +buildbackbetter.gov,Executive Office of the President +buildingamerica.gov,Department of Energy +business.gov,Small Business Administration +businessusa.gov,General Services Administration +buyaccessible.gov,General Services Administration +buyamerican.gov,General Services Administration +buyusa.gov,Department of Commerce +campusdrugprevention.gov,Department of Justice +cancer.gov,Department of Health and Human Services +cao.gov,General Services Administration +casl.gov,Department of Energy +cbca.gov,General Services Administration +cbp.gov,Department of Homeland Security +ccac.gov,Department of Treasury +cdc.gov,Department of Health and Human Services +cdcpartners.gov,National Institutes of Health +cdfifund.gov,Department of Treasury +cendi.gov,Department of Energy +census.gov,Census +cfo.gov,General Services Administration +cfpa.gov,Consumer Financial Protection Bureau +cfpb.gov,Consumer Financial Protection Bureau +challenge.gov,General Services Administration +chcoc.gov,Office of Personnel Management +childcare.gov,Department of Health and Human Services +childreninadversity.gov,USAID +childstats.gov,Department of Education +childtaxcredit.gov,Executive Office of the President +childwelfare.gov,Department of Health and Human Services +choosemyplate.gov,Department of Agriculture +cio.gov,General Services Administration +cisa.gov,Department of Homeland Security +cisa.gov,CISA +citizenscience.gov,General Services Administration +cjis.gov,Department of Justice +climate.gov,Department of Commerce +clinicaltrial.gov,National Institutes of Health +clinicaltrials.gov,Department of Health and Human Services +clinicaltrials.gov,National Institutes of Health +cloud.gov,General Services Administration +cms.gov,Department of Health and Human Services +code.gov,General Services Administration +collegedrinkingprevention.gov,Department of Health and Human Services +collegenavigator.gov,Department of Education +commerce.gov,Department of Commerce +complaintreferralexpress.gov,Department of Treasury +comptrollerofthecurrency.gov,Department of Treasury +computersforlearning.gov,General Services Administration +consumeraction.gov,General Services Administration +consumerbureau.gov,Consumer Financial Protection Bureau +consumerfinance.gov,Consumer Financial Protection Bureau +consumerfinancial.gov,Consumer Financial Protection Bureau +consumerfinancialbureau.gov,Consumer Financial Protection Bureau +consumerfinancialprotectionbureau.gov,Consumer Financial Protection Bureau +consumerprotection.gov,Consumer Financial Protection Bureau +"consumerprotectionbureau.gov,mimm.gov",Consumer Financial Protection Bureau +contractdirectory.gov,General Services Administration +coralreef.gov,Department of the Interior +cpars.gov,General Services Administration +cpnireporting.gov,Department of Homeland Security +crimesolutions.gov,Department of Justice +crimevictims.gov,Department of Justice +crisisnextdoor.gov,Executive Office of the President +cuidadodesalud.gov,Department of Health and Human Services +cupcao.gov,Department 
of the Interior +cwc.gov,Department of State +cyber.gov,Department of Homeland Security +cybercareers.gov,Office of Personnel Management +cybercrime.gov,Department of Justice +cybersecurity.gov,Department of Homeland Security +data.gov,General Services Administration +dea.gov,Department of Justice +deaecom.gov,Department of Justice +dfafacts.gov,USAID +dhhs.gov,Department of Health and Human Services +dhs.gov,Department of Homeland Security +diabetescommittee.gov,Department of Health and Human Services +dietaryguidelines.gov,Department of Agriculture +digital.gov,General Services Administration +digitaldashboard.gov,General Services Administration +digitalgov.gov,General Services Administration +directoasucuenta.gov,Department of Treasury +disability.gov,Department of Labor +disasterassistance.gov,Federal Emergency Management Agency +disasterhousing.gov,Department of Housing and Urban Development +distracteddriving.gov,Department of Transportation +distraction.gov,Department of Transportation +dnsops.gov,Department of Commerce +doc.gov,Department of Commerce +docline.gov,Department of Health and Human Services +docline.gov,National Institutes of Health +doeal.gov,Department of Energy +doi.gov,Department of the Interior +doioig.gov,Department of the Interior +dol.gov,Department of Labor +doleta.gov,Department of Labor +donaciondeorganos.gov,Department of Health and Human Services +dot.gov,Department of Transportation +dotgov.gov,General Services Administration +dotideahub.gov,Department of Transportation +drought.gov,Department of Commerce +drugabuse.gov,Department of Health and Human Services +drugabuse.gov,National Institutes of Health +dsac.gov,Department of Justice +e-verify.gov,Department of Homeland Security +eac.gov,U.S. Election Assistance Commission +eaglecash.gov,Department of Treasury +earmarks.gov,Executive Office of the President +earthquake.gov,Department of the Interior +ecpic.gov,General Services Administration +ed.gov,Department of Education +eda.gov,Department of Commerce +edison.gov,Department of Health and Human Services +edison.gov,National Institutes of Health +eftps.gov,Department of Treasury +eia.gov,Department of Energy +eldercare.gov,Department of Health and Human Services +elderjustice.gov,Department of Justice +employeeexpress.gov,Office of Personnel Management +employer.gov,Department of Labor +empowhr.gov,Department of Agriculture +ems.gov,Department of Transportation +endingthedocumentgame.gov,Department of Health and Human Services +endingthedocumentgame.gov,National Institutes of Health +energy.gov,Department of Energy +energycodes.gov,Department of Energy +energystar.gov,Environmental Protection Agency +eop.gov,Executive Office of the President +epa.gov,Environmental Protection Agency +epic.gov,Department of Justice +esc.gov,Department of Transportation +esrs.gov,General Services Administration +eta-find.gov,Department of Treasury +evergladesrestoration.gov,Department of the Interior +everify.gov,Department of Homeland Security +everykidoutdoors.gov,Department of the Interior +everytrycounts.gov,Department of Health and Human Services +evus.gov,Department of Homeland Security +execsec.gov,Department of Agriculture +export.gov,Department of Commerce +eyenote.gov,Department of Treasury +faasafety.gov,Department of Transportation +faca.gov,General Services Administration +facadatabase.gov,General Services Administration +fafsa.gov,Department of Education +fai.gov,General Services Administration +fan.gov,Department of State +fapiis.gov,General Services 
Administration +fara.gov,Department of Justice +farmers.gov,Department of Agriculture +fatherhood.gov,Department of Health and Human Services +fbf.gov,General Services Administration +fbi.gov,Department of Justice +fbijobs.gov,Department of Justice +fbo.gov,General Services Administration +fda.gov,Food and Drug Administration +fdms.gov,General Services Administration +feb.gov,Office of Personnel Management +fedbizopps.gov,General Services Administration +fedcenter.gov,Environmental Protection Agency +federalinvestments.gov,Department of Treasury +federaljobs.gov,Office of Personnel Management +fedidcard.gov,General Services Administration +fedinfo.gov,General Services Administration +fedinvest.gov,Department of Treasury +fedjobs.gov,Office of Personnel Management +fedramp.gov,General Services Administration +fedrooms.gov,General Services Administration +fedshirevets.gov,Office of Personnel Management +feedthefuture.gov,USAID +fegli.gov,Office of Personnel Management +fema.gov,Federal Emergency Management Agency +ffb.gov,Department of Treasury +fgdc.gov,Department of the Interior +fha.gov,Department of Housing and Urban Development +financialresearch.gov,Department of Treasury +financialstability.gov,Department of Treasury +fincen.gov,Department of Treasury +findtreatment.gov,Department of Health and Human Services +firecode.gov,Department of the Interior +fireleadership.gov,Department of the Interior +firescience.gov,Department of the Interior +firstfreedom.gov,Department of Justice +firstgov.gov,General Services Administration +firstnet.gov,Department of Commerce +firstrespondertraining.gov,Department of Homeland Security +fishwatch.gov,Department of Commerce +fitness.gov,Department of Health and Human Services +fleta.gov,Department of Homeland Security +fletc.gov,Federal Law Enforcement Training Center +flightschoolcandidates.gov,Department of Homeland Security +flightschoolcandidates.gov,Transportation Security Administration +floodsmart.gov,Federal Emergency Management Agency +flyhealthy.gov,Department of Transportation +fmi.gov,General Services Administration +fnal.gov,Department of Energy +foia.gov,Department of Justice +foiaonline.gov,Environmental Protection Agency +foodsafety.gov,Department of Health and Human Services +foreignassistance.gov,Department of State +forestsandrangelands.gov,Department of Agriculture +forfeiture.gov,Department of Justice +forms.gov,General Services Administration +fpc.gov,General Services Administration +fpds.gov,General Services Administration +freshempire.gov,Department of Health and Human Services +frpg.gov,General Services Administration +frtr.gov,Environmental Protection Agency +fsd.gov,General Services Administration +fsoc.gov,Department of Treasury +fsrs.gov,General Services Administration +fueleconomy.gov,Department of Energy +fws.gov,Department of the Interior +g5.gov,Department of Education +gcmrc.gov,Department of the Interior +genome.gov,Department of Health and Human Services +genome.gov,National Institutes of Health +geomac.gov,Department of the Interior +geoplatform.gov,Department of the Interior +getsmartaboutdrugs.gov,Department of Justice +ginniemae.gov,Department of Housing and Urban Development +girlshealth.gov,Department of Health and Human Services +glnpo.gov,Environmental Protection Agency +globalentry.gov,Department of Homeland Security +globalhealth.gov,Department of Health and Human Services +globe.gov,National Aeronautics and Space Administration +gobiernousa.gov,General Services Administration +godirect.gov,Department of 
Treasury +goes-r.gov,Department of Commerce +governmentjobs.gov,Office of Personnel Management +govloans.gov,Department of Labor +gps.gov,Department of Commerce +grants.gov,Department of Health and Human Services +grantsolutions.gov,Department of Health and Human Services +gsa.gov,General Services Administration +gsaadvantage.gov,General Services Administration +gsaauctions.gov,General Services Administration +gsaig.gov,General Services Administration +gsaxcess.gov,General Services Administration +guideline.gov,Department of Health and Human Services +guidelines.gov,Department of Health and Human Services +hanford.gov,Department of Energy +hc.gov,Department of Health and Human Services +health.gov,Department of Health and Human Services +healthcare.gov,Department of Health and Human Services +healthdata.gov,Department of Health and Human Services +healthfinder.gov,Department of Health and Human Services +healthindicators.gov,Department of Health and Human Services +healthit.gov,Department of Health and Human Services +healthypeople.gov,Department of Health and Human Services +hearttruth.gov,Department of Health and Human Services +hearttruth.gov,National Institutes of Health +helpwithmybank.gov,Department of Treasury +helpwithmycheckingaccount.gov,Department of Treasury +helpwithmycreditcard.gov,Department of Treasury +helpwithmycreditcardbank.gov,Department of Treasury +helpwithmymortgage.gov,Department of Treasury +helpwithmymortgagebank.gov,Department of Treasury +hhs.gov,Department of Health and Human Services +hhsoig.gov,Department of Health and Human Services +hhsops.gov,Department of Health and Human Services +hirevets.gov,Department of Labor +hiv.gov,Department of Health and Human Services +homeenergyscore.gov,Department of Energy +homelandsecurity.gov,Department of Homeland Security +homesales.gov,Department of Housing and Urban Development +hrsa.gov,Department of Health and Human Services +hud.gov,Department of Housing and Urban Development +hudoig.gov,Department of Housing and Urban Development +huduser.gov,Department of Housing and Urban Development +hurricanes.gov,Department of Commerce +hydrogen.gov,Department of Energy +iawg.gov,Department of State +ibwc.gov,Department of State +ic3.gov,Department of Justice +icams-portal.gov,Department of Commerce +ice.gov,Department of Homeland Security +ice.gov,U.S. 
Immigration and Customs Enforcement +idealab.gov,Department of Health and Human Services +identitysandbox.gov,General Services Administration +idmanagement.gov,General Services Administration +iedison.gov,Department of Health and Human Services +iedison.gov,National Institutes of Health +ihs.gov,Department of Health and Human Services +indianaffairs.gov,Department of the Interior +info.gov,General Services Administration +inl.gov,Department of Energy +innovation.gov,General Services Administration +insurekidsnow.gov,Department of Health and Human Services +interior.gov,Department of the Interior +interpol.gov,Department of Justice +invasivespecies.gov,Department of the Interior +invasivespeciesinfo.gov,Department of Agriculture +ipp.gov,Department of Treasury +iprcenter.gov,Department of Justice +irs.gov,Department of Treasury +irsauctions.gov,Department of Treasury +irssales.gov,Department of Treasury +irsvideos.gov,Department of Treasury +isotope.gov,Department of Energy +isotopes.gov,Department of Energy +itap.gov,Department of Agriculture +its.gov,Department of Treasury +jem.gov,Department of the Interior +jobcorps.gov,Department of Labor +justice.gov,Department of Justice +justthinktwice.gov,Department of Justice +juvenilecouncil.gov,Department of Justice +kids.gov,General Services Administration +lacoast.gov,Department of the Interior +landfire.gov,Department of the Interior +landimaging.gov,Department of the Interior +lanl.gov,Department of Energy +lbl.gov,Department of Energy +lcacommons.gov,Department of Agriculture +lcrmscp.gov,Department of the Interior +learnatf.gov,Department of Justice +learndoj.gov,Department of Justice +lep.gov,Department of Justice +listo.gov,Department of Homeland Security +llnl.gov,Department of Energy +lmvsci.gov,Department of the Interior +locatorplus.gov,Department of Health and Human Services +locatorplus.gov,National Institutes of Health +login.gov,General Services Administration +longtermcare.gov,Department of Health and Human Services +luca-appeals.gov,Census +makinghomeaffordable.gov,Department of Treasury +malwareinvestigator.gov,Department of Justice +manufacturing.gov,Department of Commerce +marinecadastre.gov,Department of Commerce +max.gov,Executive Office of the President +mbda.gov,Department of Commerce +medalofvalor.gov,Department of Justice +medicaid.gov,Department of Health and Human Services +medicalcountermeasures.gov,Department of Health and Human Services +medicare.gov,Department of Health and Human Services +medlineplus.gov,Department of Health and Human Services +medlineplus.gov,National Institutes of Health +mentalhealth.gov,Department of Health and Human Services +mesh.gov,Department of Health and Human Services +mesh.gov,National Institutes of Health +mgi.gov,Department of Commerce +mha.gov,Department of Treasury +mitigationcommission.gov,Department of the Interior +moneyfactory.gov,Department of Treasury +moneyfactorystore.gov,Department of Treasury +mrlc.gov,Department of the Interior +msb.gov,Department of Treasury +msha.gov,Department of Labor +mtbs.gov,Department of Agriculture +my2020census.gov,Census +myira.gov,Department of Treasury +mymedicare.gov,Department of Health and Human Services +mymoney.gov,Department of Treasury +myplate.gov,Department of Agriculture +myra.gov,Department of Treasury +nafri.gov,Department of Agriculture +nagb.gov,Department of Education +namus.gov,Department of Justice +nasa.gov,National Aeronautics and Space Administration +nationalbank.gov,Department of Treasury 
+nationalbankhelp.gov,Department of Treasury +nationalbanknet.gov,Department of Treasury +nationalgangcenter.gov,Department of Justice +nationalhousing.gov,Department of Housing and Urban Development +nationalhousinglocator.gov,Department of Housing and Urban Development +nationalmap.gov,Department of the Interior +nationsreportcard.gov,Department of Education +navycash.gov,Department of Treasury +nbib.gov,Office of Personnel Management +nccs.gov,Department of Energy +ncifcrf.gov,Department of Health and Human Services +ncifcrf.gov,National Institutes of Health +ncirc.gov,Department of Justice +ncjrs.gov,Department of Justice +ncrc.gov,Department of Energy +neglecteddiseases.gov,USAID +nehrp.gov,Department of Commerce +nemi.gov,Department of the Interior +nepa.gov,Executive Office of the President +nersc.gov,Department of Energy +neup.gov,Department of Energy +nfpors.gov,Department of the Interior +nhl.gov,Department of Housing and Urban Development +nhtsa.gov,Department of Transportation +nic.gov,General Services Administration +nicic.gov,Department of Justice +nicsezcheckfbi.gov,Department of Justice +niem.gov,Department of Homeland Security +nifc.gov,Department of the Interior +nih.gov,Department of Health and Human Services +nih.gov,National Institutes of Health +nij.gov,Department of Justice +nist.gov,Department of Commerce +nist.gov,National Institute of Standards and Technology +nlm.gov,Department of Health and Human Services +nlm.gov,National Institutes of Health +nls.gov,Department of Housing and Urban Development +nmvtis.gov,Department of Justice +nnlm.gov,Department of Health and Human Services +nnlm.gov,National Institutes of Health +noaa.gov,National Oceanic and Atmospheric Administration +nps.gov,U.S. National Park Service +nrc-gateway.gov,Nuclear Regulatory Commission +nrc.gov,Nuclear Regulatory Commission +nrel.gov,Department of Energy +nsf.gov,National Science Foundation +nsopr.gov,Department of Justice +nsopw.gov,Department of Justice +ntia.gov,Department of Commerce +ntis.gov,Department of Commerce +nuclear.gov,Department of Energy +nutrition.gov,Department of Agriculture +nvtc.gov,Department of Justice +nwbc.gov,Small Business Administration +occ.gov,Department of Treasury +occhelps.gov,Department of Treasury +ojjdp.gov,Department of Justice +ojp.gov,Department of Justice +omb.gov,Executive Office of the President +ondcp.gov,Executive Office of the President +onhir.gov,Department of the Interior +onrr.gov,Department of the Interior +opioids.gov,Department of Health and Human Services +opm.gov,Office of Personnel Management +opportunityzones.gov,Department of Housing and Urban Development +orau.gov,Department of Energy +organdonor.gov,Department of Health and Human Services +ornl.gov,Department of Energy +osac.gov,Department of State +osha.gov,Department of Labor +osmre.gov,Department of the Interior +osti.gov,Department of Energy +ostp.gov,Executive Office of the President +ots.gov,Department of Treasury +ovc.gov,Department of Justice +ovcttac.gov,Department of Justice +pandemicflu.gov,Department of Health and Human Services +papahanaumokuakea.gov,Department of Commerce +pay.gov,Department of Treasury +paymentaccuracy.gov,General Services Administration +pbrb.gov,General Services Administration +pci.gov,Executive Office of the President +pepfar.gov,Department of State +performance.gov,General Services Administration +phe.gov,Department of Health and Human Services +pic.gov,General Services Administration +piedrasblancas.gov,Department of the Interior +pif.gov,General 
Services Administration +pitc.gov,Executive Office of the President +plainlanguage.gov,General Services Administration +pmf.gov,Office of Personnel Management +pmi.gov,USAID +pnl.gov,Department of Energy +pnnl.gov,Department of Energy +pppl.gov,Department of Energy +pregunteleakaren.gov,Department of Agriculture +presidentialinnovationfellows.gov,General Services Administration +privacyshield.gov,Department of Commerce +projectsafechildhood.gov,Department of Justice +projectsafeneighborhoods.gov,Department of Justice +psc.gov,Department of Health and Human Services +pscr.gov,Department of Commerce +psob.gov,Department of Justice +pubmed.gov,Department of Health and Human Services +pubmed.gov,National Institutes of Health +qatesttwai.gov,Department of Treasury +quantum.gov,National Science Foundation +rcfl.gov,Department of Justice +reach.gov,Department of Veterans Affairs +ready.gov,Department of Homeland Security +readybusiness.gov,Federal Emergency Management Agency +realestatesales.gov,General Services Administration +realpropertyprofile.gov,General Services Administration +rec.gov,Department of Agriculture +recoverymonth.gov,Department of Health and Human Services +recreation.gov,Department of Agriculture +reginfo.gov,General Services Administration +regulations.gov,General Services Administration +relocatefeds.gov,Environmental Protection Agency +reo.gov,Department of Agriculture +reportband.gov,Department of the Interior +reporting.gov,General Services Administration +research.gov,National Science Foundation +rivers.gov,Department of the Interior +rocis.gov,General Services Administration +sac.gov,National Science Foundation +safecar.gov,Department of Transportation +safecom.gov,Department of the Interior +safeocs.gov,Department of Transportation +safercar.gov,Department of Transportation +safertruck.gov,Department of Transportation +safetyact.gov,Science and Technology Directorate +salmonrecovery.gov,Department of Energy +sam.gov,General Services Administration +samhsa.gov,Department of Health and Human Services +sandia.gov,Department of Energy +savingsbond.gov,Department of Treasury +savingsbonds.gov,Department of Treasury +savingsbondwizard.gov,Department of Treasury +sba.gov,Small Business Administration +sbir.gov,Small Business Administration +sbst.gov,General Services Administration +schoolsafety.gov,Department of Homeland Security +scidac.gov,Department of Energy +science.gov,Department of Energy +sciencebase.gov,Department of the Interior +scijinks.gov,National Aeronautics and Space Administration +scra.gov,Department of Justice +search.gov,General Services Administration +secretservice.gov,U.S. 
Secret Service +section508.gov,General Services Administration +segurosocial.gov,Social Security Administration +selectagents.gov,Department of Health and Human Services +selectusa.gov,Department of Commerce +servicemembers.gov,Department of Justice +sftool.gov,General Services Administration +sharetheroadsafely.gov,Department of Transportation +sierrawild.gov,Department of the Interior +sigpr.gov,Department of Treasury +sigtarp.gov,Department of Treasury +simplereport.gov,Department of Health and Human Services +slgs.gov,Department of Treasury +smart.gov,Department of Justice +smartgrid.gov,Department of Energy +smokefree.gov,Department of Health and Human Services +smokefree.gov,National Institutes of Health +sns.gov,Department of Energy +socialsecurity.gov,Social Security Administration +spaceweather.gov,Department of Commerce +spectrum.gov,Department of Commerce +ssa.gov,Social Security Administration +sss.gov,Selective Service System +standards.gov,Department of Commerce +state.gov,Department of State +stopalcoholabuse.gov,Department of Health and Human Services +stopbullying.gov,Department of Health and Human Services +stopfakes.gov,Department of Commerce +stopfraud.gov,Department of Justice +studentaid.gov,Department of Education +studentloans.gov,Department of Education +surgeongeneral.gov,Department of Health and Human Services +sustainability.gov,Environmental Protection Agency +sworm.gov,Department of Commerce +swpa.gov,Department of Energy +taaps.gov,Department of Treasury +tasefiling.gov,Department of Commerce +tax.gov,Department of Treasury +tcis.gov,Department of Treasury +telework.gov,Office of Personnel Management +therealcost.gov,Department of Health and Human Services +thisfreelife.gov,Department of Health and Human Services +tigta.gov,Department of Treasury +time.gov,Department of Commerce +tobacco.gov,Department of Health and Human Services +tox21.gov,Department of Health and Human Services +trade.gov,Department of Commerce +trafficsafetymarketing.gov,Department of Transportation +trainingproviderresults.gov,Department of Labor +transportation.gov,Department of Transportation +treas.gov,Department of Treasury +treaslockbox.gov,Department of Treasury +treasury.fed.us,Department of Treasury +treasury.gov,Department of Treasury +treasuryauctions.gov,Treasury Auction +treasurydirect.gov,Department of Treasury +treasuryecm.gov,Department of Treasury +treasuryhunt.gov,Department of Treasury +treasuryscams.gov,Department of Treasury +tribaljusticeandsafety.gov,Department of Justice +tsa.gov,Department of Homeland Security +tsa.gov,Transportation Security Administration +tsunami.gov,Department of Commerce +ttbonline.gov,Department of Treasury +twai.gov,Department of Treasury +unicor.gov,Department of Justice +unionreports.gov,Department of Labor +unlocktalent.gov,Office of Personnel Management +unrpnet.gov,Department of Energy +urbanwaters.gov,Environmental Protection Agency +us-cert.gov,Department of Homeland Security +us.gov,General Services Administration +usa.gov,General Services Administration +usability.gov,General Services Administration +usagov.gov,General Services Administration +usaid.gov,USAID +usajobs.gov,Office of Personnel Management +usalearning.gov,Office of Personnel Management +usap.gov,National Science Foundation +usaseanconnect.gov,Department of State +usaspending.gov,General Services Administration +usaspending.gov,Department of Treasury +usastaffing.gov,Office of Personnel Management +usbr.gov,Department of the Interior +uscis.gov,US Citizenship and 
Immigration Services +usconsulate.gov,Department of State +usda.gov,Department of Agriculture +usdebitcard.gov,Department of Treasury +usdigitalservice.gov,Executive Office of the President +usdoj.gov,Department of Justice +usds.gov,Executive Office of the President +usembassy.gov,Department of State +userra.gov,Department of Justice +usgeo.gov,National Aeronautics and Space Administration +usgs.gov,Department of the Interior +usicecenter.gov,Department of Commerce +usmarshals.gov,Department of Justice +usmint.gov,Department of Treasury +usmission.gov,Department of State +usphs.gov,Department of Health and Human Services +uspto.gov,U.S. Patent and Trademark Office +ussm.gov,General Services Administration +usss.gov,U.S. Secret Service +ustda.gov,U.S. Trade and Development Agency +ustr.gov,Executive Office of the President +ustreas.gov,Department of Treasury +utahfireinfo.gov,Department of the Interior +va.gov,Department of Veterans Affairs +vaccines.gov,Department of Health and Human Services +vcf.gov,Department of Justice +vehiclehistory.gov,Department of Justice +veterans.gov,Department of Labor +vets.gov,Department of Veterans Affairs +volcano.gov,Department of the Interior +vote.gov,General Services Administration +wapa.gov,Department of Energy +watermonitor.gov,Department of the Interior +wdol.gov,General Services Administration +weather.gov,Department of Commerce +wh.gov,Executive Office of the President +whaging.gov,Department of Health and Human Services +whistleblowers.gov,Department of Labor +whitehouse.gov,Executive Office of the President +whitehouseconferenceonaging.gov,Department of Health and Human Services +whitehousedrugpolicy.gov,Executive Office of the President +wizard.gov,Department of Treasury +wlci.gov,Department of the Interior +womenshealth.gov,Department of Health and Human Services +worker.gov,Department of Labor +wrp.gov,Department of Labor +xd.gov,Census +youth.gov,Department of Health and Human Services +youthrules.gov,Department of Labor \ No newline at end of file diff --git a/src/pe_source/data/dnsmonitor/source.py b/src/pe_source/data/dnsmonitor/source.py new file mode 100644 index 0000000..e87b051 --- /dev/null +++ b/src/pe_source/data/dnsmonitor/source.py @@ -0,0 +1,80 @@ +"""DNSMonitor API calls and DNS lookups.""" +# Standard Python Libraries +import ipaddress +import socket + +# Third-Party Libraries +import dns.resolver +import pandas as pd +import requests + + +def get_monitored_domains(token): + """Get the domains being monitored.""" + org_names_df = pd.read_csv( + "src/pe_source/data/dnsmonitor/root_domains_dnsmonitor.csv" + ) + url = "https://dns.argosecure.com/dhs/api/GetDomains" + payload = {} + headers = {} + headers["authorization"] = f"Bearer {token}" + response = requests.request("GET", url, headers=headers, data=payload).json() + domain_df = pd.DataFrame(response) + + # Merge dataframes to get domain IDs for each organization + merged_df = domain_df.merge( + org_names_df, left_on="domainName", right_on="domain_name", how="left" + ) + merged_df["org"].fillna("NA", inplace=True) + merged_df.drop(columns=["domain_name"], inplace=True) + + return merged_df + + +def get_domain_alerts(token, domain_ids, from_date, to_date): + """Get domain alerts.""" + url = "https://dns.argosecure.com/dhs/api/GetAlerts" + payload = ( + '{\r\n "domainIds": %s,\r\n "fromDate": "%s",\r\n "toDate": "%s",\r\n "alertType": null,\r\n "showBufferPeriod": false\r\n}' + % (domain_ids, from_date, to_date) + ) + headers = {} + headers["authorization"] = f"Bearer {token}" + 
headers["Content-Type"] = "application/json" + response = requests.request("GET", url, headers=headers, data=payload).json() + return pd.DataFrame(response) + + +def get_dns_records(dom_perm): + """Get DNS records.""" + # NS + try: + ns_list = [] + dom_ns = dns.resolver.resolve(dom_perm, "NS") + for data in dom_ns: + ns_list.append(str(data.target)) + except Exception: + ns_list = [] + # MX + try: + mx_list = [] + dom_mx = dns.resolver.resolve(dom_perm, "MX") + for data in dom_mx: + mx_list.append(str(data.exchange)) + except Exception: + mx_list = [] + + # A + try: + ip_str = str(socket.gethostbyname(dom_perm)) + if ipaddress.ip_address(ip_str).version == 6: + ipv6 = ip_str + ipv4 = "" + else: + ipv4 = ip_str + ipv6 = "" + except Exception: + ipv4 = "" + ipv6 = "" + + return str(mx_list), str(ns_list), ipv4, ipv6 diff --git a/src/pe_source/data/helpers/redact_pii.py b/src/pe_source/data/helpers/redact_pii.py new file mode 100644 index 0000000..19c0eee --- /dev/null +++ b/src/pe_source/data/helpers/redact_pii.py @@ -0,0 +1,562 @@ +"""Functions to redact PII from a dataframe.""" + +# Standard Python Libraries +import re + +# Third-Party Libraries +from presidio_analyzer import AnalyzerEngine +from presidio_anonymizer import AnonymizerEngine +import scrubadub +import scrubadub.detectors.date_of_birth + +# List of unique regexes to identify each state's Drivers License format in a larger string +CA = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{7}(?=$|\s)"] +CO = [r"(?:(?<=\s)|(?<=^))\d{2}-\d{3}-\d{4}(?=$|\s)"] +FL = [ + r"(?:(?<=\s)|(?<=^))[a-zA-Z] \d{3} \d{3} \d{3} \d{3}(?=$|\s)", + r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{3}-\d{3}-\d{2}-\d{3}-\d(?=$|\s)", + r"(?:(?<=\s)|(?<=^))[a-zA-Z]-\d{3}-\d{3}-\d{3}-\d{3}(?=$|\s)", +] +IA = [r"(?:(?<=\s)|(?<=^))\d{3}[a-zA-Z]{2}\d{4}(?=$|\s)"] +ID = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]{2}\d{6}[a-zA-Z](?=$|\s)"] +IL = [ + r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{3}-\d{4}-\d{4}(?=$|\s)", + r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{11}(?=$|\s)", +] +IN = [r"(?:(?<=\s)|(?<=^))\d{4}-\d{2}-\d{4}(?=$|\s)"] +KS = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{2}-\d{2}-\d{4}(?=$|\s)"] +KY = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{2}-\d{3}-\d{3}(?=$|\s)"] +MD = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]-\d{3}-\d{3}-\d{3}-\d{3}(?=$|\s)"] +MI = [r"(?:(?<=\s)|(?<=^))[a-zA-Z] \d{3} \d{3} \d{3} \d{3}(?=$|\s)"] +ND = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]{3}-\d{2}-\d{4}(?=$|\s)"] +NH = [ + r"(?:(?<=\s)|(?<=^))([0][1-9]|[1][0-2])[a-zA-Z]{3}\d{2}(0[1-9]|[1-2][0-9]|3[0-1])\d(?=$|\s)" +] +NJ = [ + r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{4}-\d{5}-\d{5}(?=$|\s)", + r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{14}(?=$|\s)", +] +NY = [r"(?:(?<=\s)|(?<=^))\d{3} \d{3} \d{3}(?=$|\s)"] +OH = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]{3}-\d{2}-\d{4}(?=$|\s)"] +PA = [r"(?:(?<=\s)|(?<=^))\d{2} \d{3} \d{3}(?=$|\s)"] +VA = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{2}-\d{2}-\d{4}(?=$|\s)"] +VT = [r"(?:(?<=\s)|(?<=^))\d{7}[a-zA-Z](?=$|\s)"] +WA = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]{3}\*\*[a-zA-Z]{2}\d{3}[a-zA-Z]\d(?=$|\s)"] +WI = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{3}-\d{4}-\d{4}-\d{2}(?=$|\s)"] +WV = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{6}(?=$|\s)"] +WY = [r"(?:(?<=\s)|(?<=^))\d{6}-\d{3}(?=$|\s)"] + +# List of regexes that are shared by multiple states, these are separated to +# show the end user the redacted value could be from any of the included states +HI_NE_VA = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{8}(?=$|\s)"] +MN_FL_MD_MI = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{12}(?=$|\s)"] +MO_OK = [r"(?:(?<=\s)|(?<=^))[a-zA-Z]\d{9}(?=$|\s)"] + +# Build detectors to find Drivers License ID + + +class CA_DLFilth(scrubadub.filth.Filth): + """Create filth class 
for CA drivers licenses.""" + + type = "CA_drivers_license" + + +class CA_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify CA drivers licenses.""" + + name = "CA_drivers_license" + regex = re.compile("|".join(CA), re.IGNORECASE) + filth_cls = CA_DLFilth + + +class CO_DLFilth(scrubadub.filth.Filth): + """Create filth class for CO drivers licenses.""" + + type = "CO_drivers_license" + + +class CO_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify CO drivers licenses.""" + + name = "CO_drivers_license" + regex = re.compile("|".join(CO), re.IGNORECASE) + filth_cls = CO_DLFilth + + +class FL_DLFilth(scrubadub.filth.Filth): + """Create filth class for FL drivers licenses.""" + + type = "FL_drivers_license" + + +class FL_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify FL drivers licenses.""" + + name = "FL_drivers_license" + regex = re.compile("|".join(FL), re.IGNORECASE) + filth_cls = FL_DLFilth + + +class HI_NE_VA_DLFilth(scrubadub.filth.Filth): + """Create filth class for HI, NE, and VA drivers licenses.""" + + type = "HI_NE_VA_drivers_license" + + +class HI_NE_VA_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify HI, NE, and VA drivers licenses.""" + + name = "HI_NE_VA_drivers_license" + regex = re.compile("|".join(HI_NE_VA), re.IGNORECASE) + filth_cls = HI_NE_VA_DLFilth + + +class IA_DLFilth(scrubadub.filth.Filth): + """Create filth class for IA drivers licenses.""" + + type = "IA_drivers_license" + + +class IA_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify IA drivers licenses.""" + + name = "IA_drivers_license" + regex = re.compile("|".join(IA), re.IGNORECASE) + filth_cls = IA_DLFilth + + +class ID_DLFilth(scrubadub.filth.Filth): + """Create filth class for ID drivers licenses.""" + + type = "ID_drivers_license" + + +class ID_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify ID drivers licenses.""" + + name = "ID_drivers_license" + regex = re.compile("|".join(ID), re.IGNORECASE) + filth_cls = ID_DLFilth + + +class IL_DLFilth(scrubadub.filth.Filth): + """Create filth class for IL drivers licenses.""" + + type = "IL_drivers_license" + + +class IL_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify IL drivers licenses.""" + + name = "IL_drivers_license" + regex = re.compile("|".join(IL), re.IGNORECASE) + filth_cls = IL_DLFilth + + +class IN_DLFilth(scrubadub.filth.Filth): + """Create filth class for IN drivers licenses.""" + + type = "IN_drivers_license" + + +class IN_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify IN drivers licenses.""" + + name = "IN_drivers_license" + regex = re.compile("|".join(IN), re.IGNORECASE) + filth_cls = IN_DLFilth + + +class KS_DLFilth(scrubadub.filth.Filth): + """Create filth class for KS drivers licenses.""" + + type = "KS_drivers_license" + + +class KS_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify KS drivers licenses.""" + + name = "KS_drivers_license" + regex = re.compile("|".join(KS), re.IGNORECASE) + filth_cls = KS_DLFilth + + +class KY_DLFilth(scrubadub.filth.Filth): + """Create filth class for KY drivers licenses.""" + + type = "KY_drivers_license" + + +class KY_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify KY drivers licenses.""" + + name = "KY_drivers_license" + regex = 
re.compile("|".join(KY), re.IGNORECASE) + filth_cls = KY_DLFilth + + +class MD_DLFilth(scrubadub.filth.Filth): + """Create filth class for MD drivers licenses.""" + + type = "MD_drivers_license" + + +class MD_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify MD drivers licenses.""" + + name = "MD_drivers_license" + regex = re.compile("|".join(MD), re.IGNORECASE) + filth_cls = MD_DLFilth + + +class MI_DLFilth(scrubadub.filth.Filth): + """Create filth class for MI drivers licenses.""" + + type = "MI_drivers_license" + + +class MI_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify MI drivers licenses.""" + + name = "MI_drivers_license" + regex = re.compile("|".join(MI), re.IGNORECASE) + filth_cls = MI_DLFilth + + +class MN_FL_MD_MI_DLFilth(scrubadub.filth.Filth): + """Create filth class for MN, FL, MD, and MI drivers licenses.""" + + type = "MN_FL_MD_MI_drivers_license" + + +class MN_FL_MD_MI_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify MN, FL, MD, and MI drivers licenses.""" + + name = "MN_FL_MD_MI_drivers_license" + regex = re.compile("|".join(MN_FL_MD_MI), re.IGNORECASE) + filth_cls = MN_FL_MD_MI_DLFilth + + +class MO_OK_DLFilth(scrubadub.filth.Filth): + """Create filth class for MO and OK drivers licenses.""" + + type = "MO_OK_drivers_license" + + +class MO_OK_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify MO and OK drivers licenses.""" + + name = "MO_OK_drivers_license" + regex = re.compile("|".join(MO_OK), re.IGNORECASE) + filth_cls = MO_OK_DLFilth + + +class ND_DLFilth(scrubadub.filth.Filth): + """Create filth class for ND drivers licenses.""" + + type = "ND_drivers_license" + + +class ND_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify ND drivers licenses.""" + + name = "ND_drivers_license" + regex = re.compile("|".join(ND), re.IGNORECASE) + filth_cls = ND_DLFilth + + +class NH_DLFilth(scrubadub.filth.Filth): + """Create filth class for NH drivers licenses.""" + + type = "NH_drivers_license" + + +class NH_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify NH drivers licenses.""" + + name = "NH_drivers_license" + regex = re.compile("|".join(NH), re.IGNORECASE) + filth_cls = NH_DLFilth + + +class NJ_DLFilth(scrubadub.filth.Filth): + """Create filth class for NJ drivers licenses.""" + + type = "NJ_drivers_license" + + +class NJ_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify NJ drivers licenses.""" + + name = "NJ_drivers_license" + regex = re.compile("|".join(NJ), re.IGNORECASE) + filth_cls = NJ_DLFilth + + +class NY_DLFilth(scrubadub.filth.Filth): + """Create filth class for NY drivers licenses.""" + + type = "NY_drivers_license" + + +class NY_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify NY drivers licenses.""" + + name = "NY_drivers_license" + regex = re.compile("|".join(NY), re.IGNORECASE) + filth_cls = NY_DLFilth + + +class OH_DLFilth(scrubadub.filth.Filth): + """Create filth class for OH drivers licenses.""" + + type = "OH_drivers_license" + + +class OH_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify OH drivers licenses.""" + + name = "OH_drivers_license" + regex = re.compile("|".join(OH), re.IGNORECASE) + filth_cls = OH_DLFilth + + +class PA_DLFilth(scrubadub.filth.Filth): + """Create filth class for PA drivers licenses.""" + + type = 
"PA_drivers_license" + + +class PA_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify PA drivers licenses.""" + + name = "PA_drivers_license" + regex = re.compile("|".join(PA), re.IGNORECASE) + filth_cls = PA_DLFilth + + +class VA_DLFilth(scrubadub.filth.Filth): + """Create filth class for VA drivers licenses.""" + + type = "VA_drivers_license" + + +class VA_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify VA drivers licenses.""" + + name = "VA_drivers_license" + regex = re.compile("|".join(VA), re.IGNORECASE) + filth_cls = VA_DLFilth + + +class VT_DLFilth(scrubadub.filth.Filth): + """Create filth class for VT drivers licenses.""" + + type = "VT_drivers_license" + + +class VT_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify VT drivers licenses.""" + + name = "VT_drivers_license" + regex = re.compile("|".join(VT), re.IGNORECASE) + filth_cls = VT_DLFilth + + +class WA_DLFilth(scrubadub.filth.Filth): + """Create filth class for WA drivers licenses.""" + + type = "WA_drivers_license" + + +class WA_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify WA drivers licenses.""" + + name = "WA_drivers_license" + regex = re.compile("|".join(WA), re.IGNORECASE) + filth_cls = WA_DLFilth + + +class WI_DLFilth(scrubadub.filth.Filth): + """Create filth class for WI drivers licenses.""" + + type = "WI_drivers_license" + + +class WI_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify WI drivers licenses.""" + + name = "WI_drivers_license" + regex = re.compile("|".join(WI), re.IGNORECASE) + filth_cls = WI_DLFilth + + +class WV_DLFilth(scrubadub.filth.Filth): + """Create filth class for WV drivers licenses.""" + + type = "WV_drivers_license" + + +class WV_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify WV drivers licenses.""" + + name = "WV_drivers_license" + regex = re.compile("|".join(WV), re.IGNORECASE) + filth_cls = WV_DLFilth + + +class WY_DLFilth(scrubadub.filth.Filth): + """Create filth class for WY drivers licenses.""" + + type = "WY_drivers_license" + + +class WY_DLDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify WY drivers licenses.""" + + name = "WY_drivers_license" + regex = re.compile("|".join(WY), re.IGNORECASE) + filth_cls = WY_DLFilth + + +# Build a detector to find Social security numbers with no spaces +class SSNFilth(scrubadub.filth.Filth): + """Create filth class for Social Security numbers.""" + + type = "no_space_social_security_number" + + +class SSNDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify Social Security numbers.""" + + name = "no_space_ssn" + regex = re.compile( + r"(?:(?<=\s)|(?<=^))(social security number|Social Security No|Social Security #|social|ssn)\W*(?!219099999|078051120)(?!666|000|9\d{2})\d{3}(?!00)\d{2}(?!0{4})\d{4}(?=$|\s)", + re.IGNORECASE, + ) + filth_cls = SSNFilth + + +# Build a detector that finds passport numbers based off of previous context +class PassportFilth(scrubadub.filth.Filth): + """Create filth class for passport numbers.""" + + type = "passport" + + +class PassportDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify passport numbers.""" + + name = "passport" + regex = re.compile( + r"(Passport Number|Passport No|Passport #|Passport#|PassportID|Passportno|passportnumber)\W*\d{9}", + re.IGNORECASE, + ) + filth_cls = PassportFilth + + +# 
Build a detector that identifies Alien Registration Numbers +class AlienRegistrationFilth(scrubadub.filth.Filth): + """Create filth class for Alien Registration Numbers.""" + + type = "alien registration" + + +class AlienRegistrationDetector(scrubadub.detectors.RegexDetector): + """Create detector class to identify Alien Registration Numbers.""" + + name = "alien registration" + regex = re.compile( + r"^(([A-Za-z]{3}[0-9]{10})|([A-Za-z]{3}(\s)([0-9]{2}(\s)[0-9]{3}(\s)[0-9]{5})))$", + re.IGNORECASE, + ) + filth_cls = AlienRegistrationFilth + + +# Create various regex identifiers +email = r"\b([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\"([]!#-[^-~ \t]|(\\[\t -~]))+\")@([-!#-'*+/-9=?A-Z^-~]+(\.[-!#-'*+/-9=?A-Z^-~]+)*|\[[\t -Z^-~]*])\b" +all_cards = r"\b((4\d{3}|5[1-5]\d{2}|2\d{3}|3[47]\d{1,2})[\s\-]?\d{4,6}[\s\-]?\d{4,6}?([\s\-]\d{3,4})?(\d{3})?)\b" +US_phones = r"((\+|\b)[1l][\-\. ])?\(?\b[\dOlZSB]{3,5}([\-\. ]|\) ?)[\dOlZSB]{3}[\-\. ][\dOlZSB]{4}\b" +US_street_address = r"\d{1,8}\b[\s\S]{10,100}?\b(AK|AL|AR|AZ|CA|CO|CT|DC|DE|FL|GA|HI|IA|ID|IL|IN|KS|KY|LA|MA|MD|ME|MI|MN|MO|MS|MT|NC|ND|NE|NH|NJ|NM|NV|NY|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VA|VT|WA|WI|WV|WY)\b\s\d{5}" + + +def redact_pii(df, column_list=[]): + """Run through provided columns and redact PII.""" + if column_list: + for column in column_list: + df = scrub(df, column) + df[column] = df[column].replace( + regex={ + all_cards: "{{CREDIT_CARD}}", + US_street_address: "{{ADDRESS}}", + email: "{{EMAIL}}", + } + ) + else: + for column in df.columns: + df = scrub(df, column) + df = df.replace( + regex={ + all_cards: "{{CREDIT_CARD}}", + US_street_address: "{{ADDRESS}}", + email: "{{EMAIL}}", + } + ) + return df + + +def scrub(df, column): + """Add different scrubber classes and run column through scrubadub.""" + scrubber = scrubadub.Scrubber() + scrubber.remove_detector("url") + scrubber.remove_detector("twitter") + scrubber.remove_detector("email") + scrubber.add_detector(SSNDetector) + scrubber.add_detector(PassportDetector) + scrubber.add_detector(AlienRegistrationDetector) + scrubber.add_detector(FL_DLDetector) + scrubber.add_detector(HI_NE_VA_DLDetector) + scrubber.add_detector(IL_DLDetector) + scrubber.add_detector(MN_FL_MD_MI_DLDetector) + scrubber.add_detector(MO_OK_DLDetector) + scrubber.add_detector(MD_DLDetector) + scrubber.add_detector(CA_DLDetector) + scrubber.add_detector(CO_DLDetector) + scrubber.add_detector(ID_DLDetector) + scrubber.add_detector(NJ_DLDetector) + scrubber.add_detector(NY_DLDetector) + scrubber.add_detector(ND_DLDetector) + scrubber.add_detector(OH_DLDetector) + scrubber.add_detector(PA_DLDetector) + scrubber.add_detector(VT_DLDetector) + scrubber.add_detector(VA_DLDetector) + scrubber.add_detector(WA_DLDetector) + scrubber.add_detector(WV_DLDetector) + scrubber.add_detector(WI_DLDetector) + scrubber.add_detector(WY_DLDetector) + scrubber.add_detector(NH_DLDetector) + scrubber.add_detector(IN_DLDetector) + scrubber.add_detector(IA_DLDetector) + scrubber.add_detector(KS_DLDetector) + scrubber.add_detector(KY_DLDetector) + scrubber.add_detector(MI_DLDetector) + df[column] = df[column].apply(lambda x: scrubber.clean(x)) + + analyzer = AnalyzerEngine() + anonymizer = AnonymizerEngine() + entities = [ + "CREDIT_CARD", + "EMAIL_ADDRESS", + "IP_ADDRESS", + "PHONE_NUMBER", + "US_DRIVER_LICENSE", + "US_SSN", + ] + + df[column] = df[column].apply( + lambda x: anonymizer.anonymize( + text=x, + analyzer_results=analyzer.analyze(text=x, entities=entities, language="en"), + ).text + ) + return df diff --git 
a/src/pe_source/data/pe_db/config.py b/src/pe_source/data/pe_db/config.py new file mode 100644 index 0000000..79fed38 --- /dev/null +++ b/src/pe_source/data/pe_db/config.py @@ -0,0 +1,137 @@ +"""Get PE Source API credentials.""" + +# Standard Python Libraries +from configparser import ConfigParser +import logging +import os + +# Third-Party Libraries +from importlib_resources import files +import requests +import shodan + +# Configuration +REPORT_DB_CONFIG = files("pe_source").joinpath("data/database.ini") + + +# Setup logging to central file +# To avoid a circular reference error which occurs when calling app.config["LOGGER"] +# we are directly calling the logger here +LOGGER = logging.getLogger(__name__) + + +def config(filename=REPORT_DB_CONFIG, section="postgres"): + """Parse Postgres configuration details from database configuration file.""" + parser = ConfigParser() + parser.read(filename, encoding="utf-8") + db = dict() + if parser.has_section(section): + for key, value in parser.items(section): + db[key] = value + else: + raise Exception(f"Section {section} not found in {filename}") + return db + + +def shodan_api_init(): + """Connect to Shodan API.""" + section = "shodan" + api_list = [] + if os.path.isfile(REPORT_DB_CONFIG): + parser = ConfigParser() + parser.read(REPORT_DB_CONFIG, encoding="utf-8") + if parser.has_section(section): + params = parser.items(section) + else: + raise Exception( + "Section {} not found in the {} file".format(section, REPORT_DB_CONFIG) + ) + else: + raise Exception( + "Database.ini file not found at this path: {}".format(REPORT_DB_CONFIG) + ) + + for key in params: + try: + api = shodan.Shodan(key[1]) + # Test api key + api.info() + except Exception: + LOGGER.error("Invalid Shodan API key: {}".format(key)) + continue + api_list.append(api) + LOGGER.info("Number of valid Shodan API keys: {}".format(len(api_list))) + return api_list + + +def cybersix_token(): + """Retrieve bearer token from Cybersixgill client.""" + section = "sixgill" + if os.path.isfile(REPORT_DB_CONFIG): + parser = ConfigParser() + parser.read(REPORT_DB_CONFIG, encoding="utf-8") + if parser.has_section(section): + params = parser.items(section) + _id, _secret = params[0], params[1] + client_id = _id[1] + client_secret = _secret[1] + else: + raise Exception( + "Section {} not found in the {} file".format(section, REPORT_DB_CONFIG) + ) + else: + raise Exception( + "Database.ini file not found at this path: {}".format(REPORT_DB_CONFIG) + ) + url = "https://api.cybersixgill.com/auth/token/" + headers = { + "Content-Type": "application/x-www-form-urlencoded", + "Cache-Control": "no-cache", + } + payload = { + "grant_type": "client_credentials", + "client_id": client_id, + "client_secret": client_secret, + } + resp = requests.post(url, headers=headers, data=payload).json() + return resp["access_token"] + + +def get_params(section): + """Get data source parameters.""" + if os.path.isfile(REPORT_DB_CONFIG): + parser = ConfigParser() + parser.read(REPORT_DB_CONFIG, encoding="utf-8") + if parser.has_section(section): + params = parser.items(section) + else: + raise Exception( + "Section {} not found in the {} file".format(section, REPORT_DB_CONFIG) + ) + else: + raise Exception( + "Database.ini file not found at this path: {}".format(REPORT_DB_CONFIG) + ) + return params + + +def dnsmonitor_token(): + """Retreive the DNSMonitor bearer token.""" + section = "dnsmonitor" + params = get_params(section) + client_id, client_secret = params[0][1], params[1][1] + scope = "DNSMonitorAPI" + url = 
"https://argosecure.com/dhs/connect/token" + + payload = { + "client_id": client_id, + "client_secret": client_secret, + "grant_type": "client_credentials", + "scope": scope, + } + headers = {} + files = [] + response = requests.request( + "POST", url, headers=headers, data=payload, files=files + ).json() + return response["access_token"] diff --git a/src/pe_source/data/pe_db/db_query_source.py b/src/pe_source/data/pe_db/db_query_source.py new file mode 100644 index 0000000..a68743f --- /dev/null +++ b/src/pe_source/data/pe_db/db_query_source.py @@ -0,0 +1,591 @@ +#!/usr/bin/env python +"""Query the PE PostgreSQL database.""" + +# Standard Python Libraries +from datetime import datetime +import logging +import socket +import sys + +# Third-Party Libraries +import pandas as pd +import psycopg2 +from psycopg2 import OperationalError +import psycopg2.extras as extras + +# cisagov Libraries +from pe_source.data.pe_db.config import config + +# Setup logging to central file +LOGGER = logging.getLogger(__name__) + +CONN_PARAMS_DIC = config() + + +def show_psycopg2_exception(err): + """Handle errors for PostgreSQL issues.""" + err_type, err_obj, traceback = sys.exc_info() + LOGGER.error( + "Database connection error: %s on line number: %s", err, traceback.tb_lineno + ) + + +def connect(): + """Connect to PostgreSQL database.""" + try: + conn = psycopg2.connect(**CONN_PARAMS_DIC) + except OperationalError as err: + show_psycopg2_exception(err) + conn = None + return conn + + +def close(conn): + """Close connection to PostgreSQL.""" + conn.close() + + +def get_orgs(): + """Query organizations that receive reports and demo organizations.""" + conn = connect() + try: + cur = conn.cursor() + sql = """SELECT * FROM organizations where report_on or demo""" + cur.execute(sql) + pe_orgs = cur.fetchall() + keys = ("org_uid", "org_name", "cyhy_db_name") + pe_orgs = [dict(zip(keys, values)) for values in pe_orgs] + cur.close() + return pe_orgs + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.error("There was a problem with your database query %s", error) + finally: + if conn is not None: + close(conn) + + +def get_ips(org_uid): + """Get IP data.""" + conn = connect() + sql1 = """SELECT i.ip_hash, i.ip, ct.network FROM ips i + JOIN cidrs ct on ct.cidr_uid = i.origin_cidr + JOIN organizations o on o.organizations_uid = ct.organizations_uid + where o.organizations_uid = %(org_uid)s + and i.origin_cidr is not null + and i.shodan_results is True;""" + df1 = pd.read_sql(sql1, conn, params={"org_uid": org_uid}) + ips1 = list(df1["ip"].values) + + sql2 = """select i.ip_hash, i.ip + from ips i + join ips_subs is2 ON i.ip_hash = is2.ip_hash + join sub_domains sd on sd.sub_domain_uid = is2.sub_domain_uid + join root_domains rd on rd.root_domain_uid = sd.root_domain_uid + JOIN organizations o on o.organizations_uid = rd.organizations_uid + where o.organizations_uid = %(org_uid)s + and i.shodan_results is True;""" + df2 = pd.read_sql(sql2, conn, params={"org_uid": org_uid}) + ips2 = list(df2["ip"].values) + + in_first = set(ips1) + in_second = set(ips2) + + in_second_but_not_in_first = in_second - in_first + + ips = ips1 + list(in_second_but_not_in_first) + conn.close() + + return ips + + +def get_data_source_uid(source): + """Get data source uid.""" + conn = connect() + cur = conn.cursor() + sql = """SELECT * FROM data_source WHERE name = '{}'""" + cur.execute(sql.format(source)) + source = cur.fetchone()[0] + cur.close() + cur = conn.cursor() + # Update last_run in data_source table + date = 
datetime.today().strftime("%Y-%m-%d") + sql = """update data_source set last_run = '{}' + where name = '{}';""" + cur.execute(sql.format(date, source)) + cur.close() + close(conn) + return source + + +def insert_sixgill_alerts(df): + """Insert sixgill alert data.""" + conn = connect() + columns_to_subset = [ + "alert_name", + "content", + "date", + "sixgill_id", + "read", + "severity", + "site", + "threat_level", + "threats", + "title", + "user_id", + "category", + "lang", + "organizations_uid", + "data_source_uid", + "content_snip", + "asset_mentioned", + "asset_type", + ] + try: + df = df.loc[:, df.columns.isin(columns_to_subset)] + except Exception as e: + LOGGER.error(e) + table = "alerts" + # Create a list of tuples from the dataframe values + tuples = [tuple(x) for x in df.to_numpy()] + # Comma-separated dataframe columns + cols = ",".join(list(df.columns)) + # SQL query to execute + query = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (sixgill_id) DO UPDATE SET + content = EXCLUDED.content, + content_snip = EXCLUDED.content_snip, + asset_mentioned = EXCLUDED.asset_mentioned, + asset_type = EXCLUDED.asset_type;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + query.format( + table, + cols, + ), + tuples, + ) + conn.commit() + LOGGER.info("Successfully inserted/updated alert data into PE database.") + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.error(error) + conn.rollback() + cursor.close() + + +def insert_sixgill_mentions(df): + """Insert sixgill mention data.""" + conn = connect() + columns_to_subset = [ + "organizations_uid", + "data_source_uid", + "category", + "collection_date", + "content", + "creator", + "date", + "sixgill_mention_id", + "lang", + "post_id", + "rep_grade", + "site", + "site_grade", + "sub_category", + "title", + "type", + "url", + "comments_count", + "tags", + ] + try: + df = df.loc[:, df.columns.isin(columns_to_subset)] + except Exception as e: + LOGGER.error(e) + + # Remove any "[\x00|NULL]" characters + df = df.apply( + lambda col: col.str.replace(r"[\x00|NULL]", "", regex=True) + if col.dtype == object + else col + ) + table = "mentions" + # Create a list of tuples from the dataframe values + tuples = [tuple(x) for x in df.to_numpy()] + # Comma-separated dataframe columns + cols = ",".join(list(df.columns)) + # SQL query to execute + query = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (sixgill_mention_id) DO NOTHING;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + query.format( + table, + cols, + ), + tuples, + ) + conn.commit() + LOGGER.info("Successfully inserted/updated mention data into PE database.") + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.error(error) + conn.rollback() + cursor.close() + + +def insert_sixgill_breaches(df): + """Insert sixgill breach data.""" + conn = connect() + table = "credential_breaches" + # Create a list of tuples from the dataframe values + tuples = [tuple(x) for x in df.to_numpy()] + # Comma-separated dataframe columns + cols = ",".join(list(df.columns)) + # SQL query to execute + query = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (breach_name) DO UPDATE SET + password_included = EXCLUDED.password_included;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + query.format( + table, + cols, + ), + tuples, + ) + conn.commit() + LOGGER.info("Successfully inserted/updated breaches into PE database.") + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.info(error) + conn.rollback() + cursor.close() + + +def 
get_breaches(): + """Get credential breaches.""" + conn = connect() + try: + cur = conn.cursor() + sql = """SELECT breach_name, credential_breaches_uid FROM credential_breaches""" + cur.execute(sql) + pe_orgs = cur.fetchall() + cur.close() + return pe_orgs + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.error("There was a problem with your database query %s", error) + finally: + if conn is not None: + close(conn) + + +def insert_sixgill_credentials(df): + """Insert sixgill credential data.""" + conn = connect() + table = "credential_exposures" + # Create a list of tuples from the dataframe values + tuples = [tuple(x) for x in df.to_numpy()] + # Comma-separated dataframe columns + cols = ",".join(list(df.columns)) + # SQL query to execute + query = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (breach_name, email) DO UPDATE SET + modified_date = EXCLUDED.modified_date;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + query.format( + table, + cols, + ), + tuples, + ) + conn.commit() + LOGGER.info( + "Successfully inserted/updated exposed credentials into PE database." + ) + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.info(error) + conn.rollback() + cursor.close() + + +def insert_sixgill_topCVEs(df): + """Insert sixgill top CVEs.""" + conn = connect() + table = "top_cves" + # Create a list of tuples from the dataframe values + tuples = [tuple(x) for x in df.to_numpy()] + # Comma-separated dataframe columns + cols = ",".join(list(df.columns)) + # SQL query to execute + query = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (cve_id, date) DO NOTHING;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + query.format( + table, + cols, + ), + tuples, + ) + conn.commit() + LOGGER.info("Successfully inserted/updated top cve data into PE database.") + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.info(error) + conn.rollback() + cursor.close() + + +def insert_shodan_data(dataframe, table, thread, org_name, failed): + """Insert Shodan data into database.""" + conn = connect() + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (organizations_uid, ip, port, protocol, timestamp) + DO NOTHING;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + sql.format( + table, + cols, + ), + tpls, + ) + conn.commit() + LOGGER.info( + "{} Data inserted using execute_values() successfully - {}".format( + thread, org_name + ) + ) + except Exception as e: + LOGGER.error("{} failed inserting into {}".format(org_name, table)) + LOGGER.error("{} {} - {}".format(thread, e, org_name)) + failed.append("{} failed inserting into {}".format(org_name, table)) + conn.rollback() + cursor.close() + return failed + + +def getSubdomain(domain): + """Get subdomain.""" + conn = connect() + try: + cur = conn.cursor() + sql = """select * from sub_domains sd + where sd.sub_domain = %s;""" + cur.execute(sql, [domain]) + sub = cur.fetchall() + cur.close() + return sub[0][0] + except (Exception, psycopg2.DatabaseError): + print("Adding domain to the sub-domain table") + finally: + if conn is not None: + close(conn) + + +def getRootdomain(domain): + """Get root domain.""" + conn = connect() + cur = conn.cursor() + sql = """SELECT * FROM root_domains rd + WHERE rd.root_domain = '{}'""" + cur.execute(sql.format(domain)) + root = cur.fetchone() + cur.close() + return root + + +def addRootdomain(root_domain, pe_org_uid, source_uid, org_name): + 
"""Add root domain.""" + conn = connect() + ip_address = str(socket.gethostbyname(root_domain)) + sql = """insert into root_domains(root_domain, organizations_uid, organization_name, data_source_uid, ip_address) + values ('{}', '{}', '{}', '{}', '{}');""" + cur = conn.cursor() + cur.execute(sql.format(root_domain, pe_org_uid, org_name, source_uid, ip_address)) + conn.commit() + cur.close() + + +def addSubdomain(conn, domain, pe_org_uid, root): + """Add a subdomain into the database.""" + conn = connect() + if root: + root_domain = domain + else: + root_domain = domain.split(".")[-2:] + root_domain = ".".join(root_domain) + cur = conn.cursor() + date = datetime.today().strftime("%Y-%m-%d") + cur.callproc( + "insert_sub_domain", + (False, date, domain, pe_org_uid, "findomain", root_domain, None), + ) + LOGGER.info("Success adding domain %s to subdomains table.", domain) + conn.commit() + close(conn) + + +def org_root_domains(conn, org_uid): + """Get root domains from database given the org_uid.""" + conn = connect() + try: + cur = conn.cursor() + sql = """select * from root_domains rd + where rd.organizations_uid = %s;""" + cur.execute(sql, [org_uid]) + roots = cur.fetchall() + keys = ( + "root_uid", + "org_uid", + "root_domain", + "ip_address", + "data_source_uid", + "enumerate_subs", + ) + roots = [dict(zip(keys, values)) for values in roots] + cur.close() + return roots + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.error("There was a problem with your database query %s", error) + finally: + if conn is not None: + close(conn) + + +def query_orgs_rev(): + """Query orgs in reverse.""" + conn = connect() + sql = "SELECT * FROM organizations WHERE report_on is True ORDER BY organizations_uid DESC;" + df = pd.read_sql_query(sql, conn) + return df + + +def insert_intelx_breaches(df): + """Insert IntelX breach data.""" + conn = connect() + table = "credential_breaches" + # Create a list of tuples from the dataframe values + tuples = [tuple(x) for x in df.to_numpy()] + # Comma-separated dataframe columns + cols = ",".join(list(df.columns)) + # SQL query to execute + query = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (breach_name) DO UPDATE SET + password_included = EXCLUDED.password_included;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + query.format( + table, + cols, + ), + tuples, + ) + conn.commit() + LOGGER.info("Successfully inserted/updated IntelX breaches into PE database.") + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.info(error) + conn.rollback() + cursor.close() + + +def get_intelx_breaches(source_uid): + """Get IntelX credential breaches.""" + conn = connect() + try: + cur = conn.cursor() + sql = """SELECT breach_name, credential_breaches_uid FROM credential_breaches where data_source_uid = %s""" + cur.execute(sql, [source_uid]) + all_breaches = cur.fetchall() + cur.close() + return all_breaches + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.error("There was a problem with your database query %s", error) + finally: + if conn is not None: + close(conn) + + +def insert_intelx_credentials(df): + """Insert IntelX credential data.""" + conn = connect() + table = "credential_exposures" + # Create a list of tuples from the dataframe values + tuples = [tuple(x) for x in df.to_numpy()] + # Comma-separated dataframe columns + cols = ",".join(list(df.columns)) + # SQL query to execute + query = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (breach_name, email) DO UPDATE SET + modified_date = 
EXCLUDED.modified_date;""" + cursor = conn.cursor() + try: + extras.execute_values( + cursor, + query.format( + table, + cols, + ), + tuples, + ) + conn.commit() + LOGGER.info( + "Successfully inserted/updated exposed IntelX credentials into PE database." + ) + except (Exception, psycopg2.DatabaseError) as error: + LOGGER.info(error) + conn.rollback() + cursor.close() + + +def execute_dnsmonitor_data(dataframe, table): + """Insert DNSMonitor data.""" + conn = connect() + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (domain_permutation, organizations_uid) + DO UPDATE SET ipv4 = EXCLUDED.ipv4, + ipv6 = EXCLUDED.ipv6, + date_observed = EXCLUDED.date_observed, + mail_server = EXCLUDED.mail_server, + name_server = EXCLUDED.name_server, + sub_domain_uid = EXCLUDED.sub_domain_uid, + data_source_uid = EXCLUDED.data_source_uid;""" + cursor = conn.cursor() + extras.execute_values( + cursor, + sql.format(table, cols), + tpls, + ) + conn.commit() + + +def execute_dnsmonitor_alert_data(dataframe, table): + """Insert DNSMonitor alerts.""" + conn = connect() + tpls = [tuple(x) for x in dataframe.to_numpy()] + cols = ",".join(list(dataframe.columns)) + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (alert_type, sub_domain_uid, date, new_value) + DO NOTHING;""" + cursor = conn.cursor() + extras.execute_values( + cursor, + sql.format(table, cols), + tpls, + ) + conn.commit() diff --git a/src/pe_source/data/shodan/shodan_search.py b/src/pe_source/data/shodan/shodan_search.py new file mode 100644 index 0000000..1ac2216 --- /dev/null +++ b/src/pe_source/data/shodan/shodan_search.py @@ -0,0 +1,421 @@ +"""Script for querying the Shodan API.""" + +# Standard Python Libraries +import datetime +import logging +import time + +# Third-Party Libraries +import pandas as pd +import requests +import shodan + +# cisagov Libraries +from pe_source.data.pe_db.db_query_source import ( + get_data_source_uid, + get_ips, + insert_shodan_data, +) + +LOGGER = logging.getLogger(__name__) +RETRY_COUNT = 7 + + +def run_shodan_thread(api, org_chunk, thread_name): + """Run a Shodan thread.""" + failed = [] + for org in org_chunk: + org_name = org["cyhy_db_name"] + org_uid = org["org_uid"] + LOGGER.info("{} Running IPs for {}".format(thread_name, org_name)) + start, end = get_dates() + try: + ips = get_ips(org_uid) + except Exception as e: + LOGGER.error("{} Failed fetching IPs for {}.".format(thread_name, org_name)) + LOGGER.error("{} {} - {}".format(thread_name, e, org_name)) + failed.append("{} fetching IPs".format(org_name)) + continue + + if len(ips) == 0: + LOGGER.error("{} No IPs for {}.".format(thread_name, org_name)) + failed.append("{} has 0 IPs".format(org_name)) + continue + + failed = search_shodan( + thread_name, ips, api, start, end, org_uid, org_name, failed + ) + + if len(failed) > 0: + LOGGER.critical("{} Failures: {}".format(thread_name, failed)) + + +def get_dates(): + """Get dates for the query.""" + now = datetime.datetime.now() + days_back = datetime.timedelta(days=30) + days_forward = datetime.timedelta(days=1) + start = now - days_back + end = now + days_forward + start_time = time_to_utc(start) + end_time = time_to_utc(end) + return start_time, end_time + + +def time_to_utc(in_time): + """Convert time to UTC.""" + # If time does not have timezone info, assume it is local + if in_time.tzinfo is None: + local_tz = datetime.datetime.now().astimezone().tzinfo + in_time = in_time.replace(tzinfo=local_tz) + 
utc_time = in_time.astimezone(datetime.timezone.utc) + return utc_time + + +def search_circl(cve): + """Fetch CVE info from Circl.""" + re = requests.get(f"https://cve.circl.lu/api/cve/{cve}") + return re + + +def search_shodan(thread_name, ips, api, start, end, org_uid, org_name, failed): + """Search IPs in the Shodan API.""" + # Initialize lists to store Shodan results + data = [] + risk_data = [] + vuln_data = [] + + # Build dictionaries for naming conventions and definitions + risky_ports, name_dict, risk_dict, av_dict, ac_dict, ci_dict = get_shodan_dicts() + + # Break up IPs into chunks of 100 + tot_ips = len(ips) + ip_chunks = [ips[i : i + 10] for i in range(0, tot_ips, 10)] + tot = len(ip_chunks) + LOGGER.info( + "{} Split {} IPs into {} chunks - {}".format( + thread_name, tot_ips, tot, org_name + ) + ) + + # Loop through chunks and query Shodan + for i, ip_chunk in enumerate(ip_chunks): + count = i + 1 + try_count = 1 + while try_count < RETRY_COUNT: + try: + results = api.host(ip_chunk) + for r in results: + for d in r["data"]: + # Convert Shodan date string to UTC datetime + shodan_datetime = datetime.datetime.strptime( + d["timestamp"], "%Y-%m-%dT%H:%M:%S.%f" + ) + shodan_utc = time_to_utc(shodan_datetime) + # Only include results in the timeframe + if shodan_utc > start and shodan_utc < end: + prod = d.get("product", None) + serv = d.get("http", {}).get("server") + asn = d.get("ASN", None) + vulns = d.get("vulns", None) + location = d.get("location", None) + if vulns is not None: + unverified = [] + for cve in list(vulns.keys()): + # Check if CVEs are verified + unverified, vuln_data = is_verified( + vulns, + cve, + av_dict, + ac_dict, + ci_dict, + vuln_data, + org_uid, + r, + d, + asn, + unverified, + ) + if len(unverified) > 0: + ftype = "Pontentially Vulnerable Product" + name = prod + risk = unverified + mitigation = "Verify asset is up to date, supported by the vendor, and configured securely" + risk_data.append( + { + "asn": asn, + "domains": r["domains"], + "hostnames": r["hostnames"], + "ip": r["ip_str"], + "isn": r["isp"], + "mitigation": mitigation, + "name": name, + "organization": r["org"], + "organizations_uid": org_uid, + "port": d["port"], + "potential_vulns": risk, + "product": prod, + "protocol": d["_shodan"]["module"], + "server": serv, + "tags": r["tags"], + "timestamp": d["timestamp"], + "type": ftype, + "is_verified": False, + } + ) + elif d["_shodan"]["module"] in risky_ports: + ftype = "Insecure Protocol" + name = name_dict[d["_shodan"]["module"]] + risk = [risk_dict[d["_shodan"]["module"]]] + mitigation = "Confirm open port has a required business use for internet exposure and ensure necessary safeguards are in place through TCP wrapping, TLS encryption, or authentication requirements" + risk_data.append( + { + "ac_description": None, + "ai_description": None, + "asn": asn, + "attack_complexity": None, + "attack_vector": None, + "av_description": None, + "availability_impact": None, + "ci_description": None, + "confidentiality_impact": None, + "cve": None, + "cvss": None, + "domains": r["domains"], + "hostnames": r["hostnames"], + "ii_Description": None, + "integrity_impact": None, + "ip": r["ip_str"], + "isn": r["isp"], + "mitigation": mitigation, + "name": name, + "organization": r["org"], + "organizations_uid": org_uid, + "port": d["port"], + "potential_vulns": risk, + "product": prod, + "protocol": d["_shodan"]["module"], + "server": serv, + "severity": None, + "summary": None, + "tags": r["tags"], + "timestamp": d["timestamp"], + "type": ftype, + 
"is_verified": False, + } + ) + + data.append( + { + "asn": asn, + "domains": r["domains"], + "hostnames": r["hostnames"], + "ip": r["ip_str"], + "isn": r["isp"], + "organization": r["org"], + "organizations_uid": org_uid, + "port": d["port"], + "product": prod, + "protocol": d["_shodan"]["module"], + "server": serv, + "tags": r["tags"], + "timestamp": d["timestamp"], + "country_code": location["country_code"], + "location": str(location), + } + ) + + time.sleep(1) + break + except shodan.APIError as e: + if try_count == 5: + LOGGER.error( + "{} Failed 5 times. Continuing to next chunk - {}".format( + thread_name, org_name + ) + ) + failed.append( + "{} chunk {} failed 5 times and skipped".format(org_name, count) + ) + break + LOGGER.error("{} {} - {}".format(thread_name, e, org_name)) + LOGGER.error( + "{} Try #{} failed. Calling the API again. - {}".format( + thread_name, try_count, org_name + ) + ) + try_count += 1 + # Most likely too many API calls per second so sleep + time.sleep(5) + except Exception as e: + LOGGER.error("{} {} - {}".format(thread_name, e, org_name)) + LOGGER.error( + "{} Not a shodan API error. Continuing to next chunk - {}".format( + thread_name, org_name + ) + ) + failed.append("{} chunk {} failed and skipped".format(org_name, count)) + break + + LOGGER.info("{} {}/{} complete - {}".format(thread_name, count, tot, org_name)) + + df = pd.DataFrame(data) + risk_df = pd.DataFrame(risk_data) + vuln_df = pd.DataFrame(vuln_data) + all_vuln_df = vuln_df.append(risk_df, ignore_index=True) + # Grab the data source uid and add to each dataframe + source_uid = get_data_source_uid("Shodan") + df["data_source_uid"] = source_uid + risk_df["data_source_uid"] = source_uid + vuln_df["data_source_uid"] = source_uid + all_vuln_df["data_source_uid"] = source_uid + + # Insert data into the PE database + failed = insert_shodan_data(df, "shodan_assets", thread_name, org_name, failed) + failed = insert_shodan_data( + all_vuln_df, "shodan_vulns", thread_name, org_name, failed + ) + + return failed + + +def is_verified( + vulns, cve, av_dict, ac_dict, ci_dict, vuln_data, org_uid, r, d, asn, unverified +): + """Check if a CVE is verified.""" + v = vulns[cve] + if v["verified"]: + re = search_circl(cve) + r_json = re.json() + if r_json is not None: + summary = r_json.get("summary") + product = r_json.get("vulnerable_product") + attack_vector = r_json.get("access", {}).get("vector") + av = av_dict.get(attack_vector) + attack_complexity = r_json.get("access", {}).get("complexity") + ac = ac_dict.get(attack_complexity) + conf_imp = r_json.get("impact", {}).get("confidentiality") + ci = ci_dict.get(conf_imp) + int_imp = r_json.get("impact", {}).get("integrity") + ii = ci_dict.get(int_imp) + avail_imp = r_json.get("impact", {}).get("availability") + ai = ci_dict.get(avail_imp) + cvss = r_json.get("cvss") + if cvss == 10: + severity = "Critical" + elif cvss >= 7: + severity = "High" + elif cvss >= 4: + severity = "Medium" + elif cvss > 0: + severity = "Low" + else: + severity = None + vuln_data.append( + { + "ac_description": ac or "", + "ai_description": ai or "", + "asn": asn, + "attack_complexity": attack_complexity or "", + "attack_vector": attack_vector or "", + "av_description": av or "", + "availability_impact": avail_imp or "", + "ci_description": ci or "", + "confidentiality_impact": conf_imp or "", + "cve": cve, + "cvss": cvss or None, + "domains": r["domains"], + "hostnames": r["hostnames"], + "ii_Description": ii or "", + "integrity_impact": int_imp or "", + "ip": r["ip_str"], + "isn": 
r["isp"], + "mitigation": None, + "name": None, + "organization": r["org"], + "organizations_uid": org_uid, + "port": d["port"], + "potential_vulns": None, + "product": product or "", + "protocol": d["_shodan"]["module"], + "server": None, + "severity": severity or "", + "summary": summary or "", + "tags": r["tags"], + "timestamp": d["timestamp"], + "type": None, + "is_verified": True, + } + ) + else: + unverified.append(cve) + + return unverified, vuln_data + + +def get_shodan_dicts(): + """Build Shodan dictionaries that hold definitions and naming conventions.""" + risky_ports = [ + "ftp", + "telnet", + "http", + "smtp", + "pop3", + "imap", + "netbios", + "snmp", + "ldap", + "smb", + "sip", + "rdp", + "vnc", + "kerberos", + ] + name_dict = { + "ftp": "File Transfer Protocol", + "telnet": "Telnet", + "http": "Hypertext Transfer Protocol", + "smtp": "Simple Mail Transfer Protocol", + "pop3": "Post Office Protocol 3", + "imap": "Internet Message Access Protocol", + "netbios": "Network Basic Input/Output System", + "snmp": "Simple Network Management Protocol", + "ldap": "Lightweight Directory Access Protocol", + "smb": "Server Message Block", + "sip": "Session Initiation Protocol", + "rdp": "Remote Desktop Protocol", + "kerberos": "Kerberos", + } + risk_dict = { + "ftp": "FTP", + "telnet": "Telnet", + "http": "HTTP", + "smtp": "SMTP", + "pop3": "POP3", + "imap": "IMAP", + "netbios": "NetBIOS", + "snmp": "SNMP", + "ldap": "LDAP", + "smb": "SMB", + "sip": "SIP", + "rdp": "RDP", + "vnc": "VNC", + "kerberos": "Kerberos", + } + # Create dictionaries for CVSSv2 vector definitions using https://nvd.nist.gov/vuln-metrics/cvss/v3-calculator + av_dict = { + "NETWORK": "A vulnerability exploitable with network access means the vulnerable software is bound to the network stack and the attacker does not require local network access or local access. Such a vulnerability is often termed “remotely exploitable”. An example of a network attack is an RPC buffer overflow.", + "ADJACENT_NETWORK": "A vulnerability exploitable with adjacent network access requires the attacker to have access to either the broadcast or collision domain of the vulnerable software. Examples of local networks include local IP subnet, Bluetooth, IEEE 802.11, and local Ethernet segment.", + "LOCAL": "A vulnerability exploitable with only local access requires the attacker to have either physical access to the vulnerable system or a local (shell) account. Examples of locally exploitable vulnerabilities are peripheral attacks such as Firewire/USB DMA attacks, and local privilege escalations (e.g., sudo).", + } + ac_dict = { + "LOW": "Specialized access conditions or extenuating circumstances do not exist. The following are examples: The affected product typically requires access to a wide range of systems and users, possibly anonymous and untrusted (e.g., Internet-facing web or mail server). The affected configuration is default or ubiquitous. The attack can be performed manually and requires little skill or additional information gathering. The 'race condition' is a lazy one (i.e., it is technically a race but easily winnable).", + "MEDIUM": "The access conditions are somewhat specialized; the following are examples: The attacking party is limited to a group of systems or users at some level of authorization, possibly untrusted. Some information must be gathered before a successful attack can be launched. 
The affected configuration is non-default, and is not commonly configured (e.g., a vulnerability present when a server performs user account authentication via a specific scheme, but not present for another authentication scheme). The attack requires a small amount of social engineering that might occasionally fool cautious users (e.g., phishing attacks that modify a web browser’s status bar to show a false link, having to be on someone’s “buddy” list before sending an IM exploit).", + "HIGH": "Specialized access conditions exist. For example, in most configurations, the attacking party must already have elevated privileges or spoof additional systems in addition to the attacking system (e.g., DNS hijacking). The attack depends on social engineering methods that would be easily detected by knowledgeable people. For example, the victim must perform several suspicious or atypical actions. The vulnerable configuration is seen very rarely in practice. If a race condition exists, the window is very narrow.", + } + ci_dict = { + "NONE": "There is no impact to the confidentiality of the system", + "PARTIAL": "There is considerable informational disclosure. Access to some system files is possible, but the attacker does not have control over what is obtained, or the scope of the loss is constrained. An example is a vulnerability that divulges only certain tables in a database.", + "COMPLETE": "There is total information disclosure, resulting in all system files being revealed. The attacker is able to read all of the system's data (memory, files, etc.).", + } + return risky_ports, name_dict, risk_dict, av_dict, ac_dict, ci_dict diff --git a/src/pe_source/data/sixgill/api.py b/src/pe_source/data/sixgill/api.py new file mode 100644 index 0000000..316f3cf --- /dev/null +++ b/src/pe_source/data/sixgill/api.py @@ -0,0 +1,212 @@ +"""Cybersixgill API calls.""" + +# Standard Python Libraries +import logging +import time + +# Third-Party Libraries +import pandas as pd +import requests +from retry import retry + +# cisagov Libraries +from pe_source.data.pe_db.config import cybersix_token + +LOGGER = logging.getLogger(__name__) + + +def get_sixgill_organizations(): + """Get the list of organizations.""" + url = "https://api.cybersixgill.com/multi-tenant/organization" + auth = cybersix_token() + headers = { + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + orgs = requests.get(url, headers=headers).json() + df_orgs = pd.DataFrame(orgs) + sixgill_dict = df_orgs.set_index("name").agg(list, axis=1).to_dict() + return sixgill_dict + + +def org_assets(org_id): + """Get organization assets.""" + url = f"https://api.cybersixgill.com/multi-tenant/organization/{org_id}/assets" + auth = cybersix_token() + headers = { + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + payload = {"organization_id": org_id} + count = 1 + while count < 7: + try: + resp = requests.get(url, headers=headers, params=payload).json() + break + except Exception: + time.sleep(5) + LOGGER.info("Error. 
Trying query post again...") + count += 1 + continue + resp = requests.get(url, headers=headers, params=payload).json() + return resp + + +def intel_post(auth, query, frm, scroll, result_size): + """Get intel items - advanced variation.""" + url = "https://api.cybersixgill.com/intel/intel_items" + headers = { + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + payload = { + "query": query, + "partial_content": False, + "results_size": result_size, + "scroll": scroll, + "from": frm, + "sort": "date", + "sort_type": "desc", + "highlight": False, + "recent_items": False, + "safe_content_size": True, + } + resp = requests.post(url, headers=headers, json=payload).json() + return resp + + +def alerts_list(auth, organization_id, fetch_size, offset): + """Get actionable alerts by ID using organization_id with optional filters.""" + url = "https://api.cybersixgill.com/alerts/actionable-alert" + headers = { + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + payload = { + "organization_id": organization_id, + "fetch_size": fetch_size, + "offset": offset, + } + resp = requests.get(url, headers=headers, params=payload) + return resp + + +def alerts_count(auth, organization_id): + """Get the total read and unread actionable alerts by organization.""" + url = "https://api.cybersixgill.com/alerts/actionable_alert/count" + headers = { + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + payload = {"organization_id": organization_id} + resp = requests.get(url, headers=headers, params=payload).json() + return resp + + +def alerts_content(auth, organization_id, alert_id): + """Get total alert content.""" + url = f"https://api.cybersixgill.com/alerts/actionable_alert_content/{alert_id}" + headers = { + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + payload = {"organization_id": organization_id, "limit": 10000} + content = requests.get(url, headers=headers, params=payload).json() + try: + content = content["content"]["items"][0] + if "_source" in content: + content = content["_source"]["content"] + elif "description" in content: + content = content["description"] + else: + content = "" + except Exception as e: + LOGGER.error("Failed getting content snip: %s", e) + content = "" + return content + + +def dve_top_cves(): + """Get data about a specific CVE.""" + url = "https://api.cybersixgill.com/dve_enrich/summary" + auth = cybersix_token() + headers = { + "Content-Type": "application/x-www-form-urlencoded", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + resp = requests.get(url, headers=headers).json() + sorted_values = sorted( + resp["values"], + key=lambda x: x["score"]["sixgill"]["current"] + if x["score"]["sixgill"]["current"] is not None + else float("-inf"), + reverse=True, + ) + top_10_cves = sorted_values[:10] + + # Printing the top 10 CVEs + clean_top_10_cves = [] + for cve in top_10_cves: + print(cve["id"], "- Current rating:", cve["score"]["sixgill"]["current"]) + print(cve) + clean_cve = { + "cve_id": cve["id"], + "dynamic_rating": cve["score"]["sixgill"]["current"], + "nvd_base_score": cve["score"]["nvd"]["score"], + } + clean_top_10_cves.append(clean_cve) + return clean_top_10_cves + + +def credential_auth(params): + """Get data about a specific CVE.""" + url = "https://api.cybersixgill.com/credentials/leaks" + auth = cybersix_token() 
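+ # A new bearer token is requested on every call to this function; callers that page through leak results re-authenticate on each request.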
+ headers = { + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + resp = requests.get(url, headers=headers, params=params).json() + return resp + + +@retry(tries=10, delay=1, logger=LOGGER) +def get_bulk_cve_resp(cve_list): + """ + Make API call to retrieve the corresponding info for a list of CVE names (10 max). + + Args: + cve_list: list of cve names (i.e. ['CVE-2022-123', 'CVE-2022-456'...]) + + Returns: + Raw API response for CVE list + + """ + c6g_url = "https://api.cybersixgill.com/dve_enrich/enrich" + auth = cybersix_token() + headers = { + "Content-Type": "application/json", + "Cache-Control": "no-cache", + "Authorization": "Bearer " + auth, + } + body = { + "filters": {"ids": cve_list}, + "results_size": len(cve_list), + "from_index": 0, + } + # Make API call for specified CVE list + try: + # Attempt API call + resp = requests.post(c6g_url, headers=headers, json=body).json() + # Return response + return resp + except Exception as e: + LOGGER.error("Error making bulk CVE API call: %s", e) diff --git a/src/pe_source/data/sixgill/source.py b/src/pe_source/data/sixgill/source.py new file mode 100644 index 0000000..cd2b823 --- /dev/null +++ b/src/pe_source/data/sixgill/source.py @@ -0,0 +1,308 @@ +"""Scripts for importing Sixgill data into PE Postgres database.""" + +# Standard Python Libraries +import logging +import time + +# Third-Party Libraries +import pandas as pd +import requests + +# cisagov Libraries +from pe_source.data.pe_db.config import cybersix_token + +from .api import ( + alerts_content, + alerts_count, + alerts_list, + credential_auth, + dve_top_cves, + get_bulk_cve_resp, + intel_post, + org_assets, +) + +LOGGER = logging.getLogger(__name__) + + +def alias_organization(org_id): + """List an organization's aliases.""" + assets = org_assets(org_id) + df_assets = pd.DataFrame(assets) + aliases = df_assets["organization_aliases"].loc["explicit":].tolist()[0] + return aliases + + +def all_assets_list(org_id): + """List an organization's aliases.""" + assets = org_assets(org_id) + df_assets = pd.DataFrame(assets) + aliases = df_assets["organization_aliases"].loc["explicit":].tolist()[0] + alias_dict = dict.fromkeys(aliases, "alias") + domain_names = df_assets["domain_names"].loc["explicit":].tolist()[0] + domain_dict = dict.fromkeys(domain_names, "domain") + ips = df_assets["ip_addresses"].loc["explicit":].tolist()[0] + ip_dict = dict.fromkeys(ips, "ip") + assets_dict = {**alias_dict, **domain_dict, **ip_dict} + return assets_dict + + +def root_domains(org_id): + """Get root domains.""" + assets = org_assets(org_id) + df_assets = pd.DataFrame(assets) + root_domains = df_assets["domain_names"].loc["explicit":].tolist()[0] + return root_domains + + +def mentions(date, aliases, soc_media_included=False): + """Pull dark web mentions data for an organization.""" + token = cybersix_token() + + # Build the query using the org's aliases + mentions = "" + for mention in aliases: + mentions += '"' + mention + '"' + "," + mentions = mentions[:-1] + if soc_media_included: + query = "date:" + date + " AND " + "(" + str(mentions) + ")" + else: + query = ( + "date:" + + date + + " AND " + + "(" + + str(mentions) + + """) + NOT site:(twitter, Twitter, reddit, Reddit, Parler, parler, + linkedin, Linkedin, discord, forum_discord, raddle, telegram, + jabber, ICQ, icq, mastodon)""" + ) + + # Get the total number of mentions + count = 1 + while count < 7: + try: + LOGGER.info("Total mentions try #%s", count) + resp = intel_post(token, 
query, frm=0, scroll=False, result_size=1) + break + except Exception: + LOGGER.info("Error. Trying to get mentions count again...") + count += 1 + continue + total_mentions = resp["total_intel_items"] + LOGGER.info("Total Mentions: %s", total_mentions) + + # Fetch mentions in segments + # Recommended segment is 50. The maximum is 400. + i = 0 + segment_size = 100 + smaller_segment_count = 1 + all_mentions = [] + while i < total_mentions: + # Try to get a mentions segment 3 times + try_count = 1 + while try_count < 4: + try: + # If segment size was decreased, only use for 10 iterations + if smaller_segment_count == 10: + LOGGER.info("Switching back to a segment size of 100.") + segment_size = 100 + smaller_segment_count = 1 + if segment_size <= 10: + smaller_segment_count += 1 + # API post + resp = intel_post( + token, query, frm=i, scroll=False, result_size=segment_size + ) + i += segment_size + LOGGER.info( + "Got %s-%s of %s...", + i - segment_size, + i, + total_mentions, + ) + intel_items = resp["intel_items"] + df_mentions = pd.DataFrame.from_dict(intel_items) + all_mentions.append(df_mentions) + df_all_mentions = pd.concat(all_mentions).reset_index(drop=True) + break + except Exception: + # Sleep for 2 seconds + time.sleep(2) + # If the API post failed 3 times + if try_count == 3: + # If a segment was already decreased to 1, skip the mention + if segment_size == 1: + LOGGER.critical("Failed 3 times fetching 1 post. Skipping it.") + i += segment_size + break + # Decrease the segment to 10, then if still failing, to 1 + if segment_size == 10: + segment_size = 1 + smaller_segment_count = 1 + else: + segment_size = 10 + LOGGER.error( + "Failed 3 times. Switching to a segment size of %s", + segment_size, + ) + try_count = 1 + continue + LOGGER.error("Try %s/3 failed.", try_count) + try_count += 1 + return df_all_mentions + + +def alerts(org_id): + """Get actionable alerts for an organization.""" + token = cybersix_token() + count = alerts_count(token, org_id) + LOGGER.info(count) + count_total = count["total"] + LOGGER.info("Total Alerts: %s", count_total) + + # Recommended "fetch_size" is 25. The maximum is 400. + fetch_size = 25 + all_alerts = [] + + for offset in range(0, count_total, fetch_size): + try: + resp = alerts_list(token, org_id, fetch_size, offset).json() + df_alerts = pd.DataFrame.from_dict(resp) + all_alerts.append(df_alerts) + df_all_alerts = pd.concat(all_alerts).reset_index(drop=True) + except Exception as e: + print(e) + continue + + # Fetch the full content of each alert + for alert_index, alert_row in df_all_alerts.iterrows(): + content = alerts_content(token, org_id, alert_row["id"]) + df_all_alerts.at[alert_index, "content"] = content + + return df_all_alerts + + +def get_alerts_content(organization_id, alert_id, org_assets_dict): + """Get alert content snippet.""" + token = cybersix_token() + asset_mentioned = "" + snip = "" + asset_type = "" + content = alerts_content(token, organization_id, alert_id) + if content: + for asset, type in org_assets_dict.items(): + if asset in content: + index = content.index(asset) + snip = content[(index - 100) : (index + len(asset) + 100)] + snip = "..." + snip + "..." 
+ asset_mentioned = asset + asset_type = type + LOGGER.info("Asset mentioned: %s", asset_mentioned) + return snip, asset_mentioned, asset_type + + +def top_cves(): + """Top 10 CVEs mentioned in the dark web.""" + resp = dve_top_cves() + return pd.DataFrame(resp) + + +def cve_summary(cveid): + """Get CVE summary data.""" + url = f"https://cve.circl.lu/api/cve/{cveid}" + return requests.get(url).json() + + +def creds(domain, from_date, to_date): + """Get credentials.""" + skip = 0 + params = { + "domain": domain, + "from_date": from_date, + "to_date": to_date, + "max_results": 100, + "skip": skip, + } + resp = credential_auth(params) + total_hits = resp["total_results"] + resp = resp["leaks"] + while total_hits > len(resp): + skip += 1 + params["skip"] = skip + next_resp = credential_auth(params) + resp = resp + next_resp["leaks"] + print(len(resp)) + resp = pd.DataFrame(resp) + df = resp.drop_duplicates( + subset=["email", "breach_name"], keep="first" + ).reset_index(drop=True) + return df + + +def extract_bulk_cve_info(cve_list): + """ + Make API call to C6G and retrieve/extract relevant info for a list of CVE names (10 max). + + Args: + cve_list: list of cve names (i.e. ['CVE-2022-123', 'CVE-2022-456'...]) + + Returns: + A dataframe with the name and all relevant info for the CVEs listed + """ + # Call get_bulk_cve_info() function to get response + resp = get_bulk_cve_resp(cve_list) + # Check if there was a good response + if resp is None: + # If no response, return empty dataframe + return pd.DataFrame() + else: + # Proceed if there is a response + resp_list = resp.get("objects") + # Dataframe to hold finalized data + resp_df = pd.DataFrame() + # For each cve in api response, extract data + for i in range(0, len(resp_list)): + # CVE name + cve_name = resp_list[i].get("name") + # CVSS 2.0 info + cvss_2_info = resp_list[i].get("x_sixgill_info").get("nvd").get("v2") + if cvss_2_info is not None: + cvss_2_0 = cvss_2_info.get("current") + cvss_2_0_sev = cvss_2_info.get("severity") + cvss_2_0_vec = cvss_2_info.get("vector") + else: + [cvss_2_0, cvss_2_0_sev, cvss_2_0_vec] = [None, None, None] + # CVSS 3.0 info + cvss_3_info = resp_list[i].get("x_sixgill_info").get("nvd").get("v3") + if cvss_3_info is not None: + cvss_3_0 = cvss_3_info.get("current") + cvss_3_0_sev = cvss_3_info.get("severity") + cvss_3_0_vec = cvss_3_info.get("vector") + else: + [cvss_3_0, cvss_3_0_sev, cvss_3_0_vec] = [None, None, None] + # DVE info + dve_info = resp_list[i].get("x_sixgill_info").get("score") + if dve_info is not None: + dve_score = dve_info.get("current") + else: + dve_score = None + + # Append this row of CVE info to the resp_df + curr_info = { + "cve_name": cve_name, + "cvss_2_0": cvss_2_0, + "cvss_2_0_severity": cvss_2_0_sev, + "cvss_2_0_vector": cvss_2_0_vec, + "cvss_3_0": cvss_3_0, + "cvss_3_0_severity": cvss_3_0_sev, + "cvss_3_0_vector": cvss_3_0_vec, + "dve_score": dve_score, + } + resp_df = pd.concat( + [resp_df, pd.DataFrame(curr_info, index=[0])], + ignore_index=True, + ) + # Return dataframe of relevant CVE/CVSS/DVE info + return resp_df diff --git a/src/pe_source/dnsmonitor.py b/src/pe_source/dnsmonitor.py new file mode 100644 index 0000000..ce0ee88 --- /dev/null +++ b/src/pe_source/dnsmonitor.py @@ -0,0 +1,185 @@ +"""Collect DNSMonitor data.""" + +# Standard Python Libraries +import datetime +import logging + +from .data.dnsmonitor.source import ( + get_dns_records, + get_domain_alerts, + get_monitored_domains, +) +from .data.pe_db.config import dnsmonitor_token +from .data.pe_db.db_query_source 
import ( + addSubdomain, + execute_dnsmonitor_alert_data, + execute_dnsmonitor_data, + get_data_source_uid, + get_orgs, + getSubdomain, +) + +NOW = datetime.datetime.now() +DAYS_BACK = datetime.timedelta(days=20) +DAY = datetime.timedelta(days=1) +START_DATE = NOW - DAYS_BACK +END_DATE = NOW + DAY + +LOGGER = logging.getLogger(__name__) + + +class DNSMonitor: + """Fetch DNSMonitor data.""" + + def __init__(self, orgs_list): + """Initialize Shodan class.""" + self.orgs_list = orgs_list + + def run_dnsMonitor(self): + """Run DNSMonitor calls.""" + orgs_list = self.orgs_list + + # Get orgs from PE database + pe_orgs = get_orgs() + + # Filter orgs if specified + if orgs_list == "all": + pe_orgs_final = pe_orgs + else: + pe_orgs_final = [] + for pe_org in pe_orgs: + if pe_org["cyhy_db_name"] in orgs_list: + pe_orgs_final.append(pe_org) + else: + continue + + # Fetch the bearer token + token = dnsmonitor_token() + # Get all of the Domains being monitored + domain_df = get_monitored_domains(token) + + failed = [] + # Iterate through each org + for org in pe_orgs_final: + org_name = org["org_name"] + org_uid = org["org_uid"] + org_code = org["cyhy_db_name"] + LOGGER.info("\nRunning DNSMonitor on %s", org_code) + + # Get respective domain IDs + domain_ids = domain_df[domain_df["org"] == org_name] + LOGGER.info("Found %s root domains being monitored.", len(domain_ids)) + domain_ids = str(domain_ids["domainId"].tolist()) + + # Get Alerts for a specific org based on the list of domain IDs + if domain_ids == "[]": + LOGGER.error("Can't match org to any domains...") + failed.append(f"{org_code} - No domains") + continue + else: + alerts_df = get_domain_alerts(token, domain_ids, START_DATE, END_DATE) + LOGGER.info("Fetched %s alerts.", len(alerts_df.index)) + + # If no alerts, continue + if alerts_df.empty: + LOGGER.error("No alerts for %s", org_code) + failed.append(f"{org_code} - No alerts") + continue + + for alert_index, alert_row in alerts_df.iterrows(): + # Get subdomain_uid + root_domain = alert_row["rootDomain"] + sub_domain = getSubdomain(root_domain) + if not sub_domain: + LOGGER.info( + "Root domain, %s, isn't in subdomain table as a sub_domain.", + root_domain, + ) + try: + addSubdomain(None, root_domain, org_uid) + LOGGER.info( + "Success adding %s to subdomain table.", root_domain + ) + except Exception as e: + LOGGER.error("Failure adding root domain to subdomain table.") + LOGGER.error(e) + failed.append( + f"{org_code} - {root_domain} - Failed inserting into subdomain table" + ) + sub_domain = getSubdomain(root_domain) + + alerts_df.at[alert_index, "sub_domain_uid"] = sub_domain + + # Get DNS records for each domain permutation + dom_perm = alert_row["domainPermutation"] + mx_list, ns_list, ipv4, ipv6 = get_dns_records(dom_perm) + + # Add records to the dataframe + alerts_df.at[alert_index, "mail_server"] = mx_list + alerts_df.at[alert_index, "name_server"] = ns_list + alerts_df.at[alert_index, "ipv4"] = ipv4 + alerts_df.at[alert_index, "ipv6"] = ipv6 + + # Set the data_source_uid and organization_uid + alerts_df["data_source_uid"] = get_data_source_uid("DNSMonitor") + alerts_df["organizations_uid"] = org_uid + + # Format dataframe and insert into domain_permutations table + alerts_df = alerts_df.rename( + columns={ + "domainPermutation": "domain_permutation", + "dateCreated": "date_observed", + "alertType": "alert_type", + "previousValue": "previous_value", + "newValue": "new_value", + } + ) + dom_perm_df = alerts_df[ + [ + "organizations_uid", + "sub_domain_uid", + "data_source_uid", + 
"domain_permutation", + "ipv4", + "ipv6", + "mail_server", + "name_server", + "date_observed", + ] + ] + dom_perm_df = dom_perm_df.drop_duplicates( + subset=["domain_permutation"], keep="last" + ) + try: + execute_dnsmonitor_data(dom_perm_df, "domain_permutations") + LOGGER.info("Success inserting into domain_permutations - %s", org_code) + except Exception as e: + LOGGER.error("Failed inserting into domain_permutations - %s", org_code) + LOGGER.error(e) + failed.append(f"{org_code} - Failed inserting into dom_perms") + + # Format dataframe and insert into domain_alerts table + alerts_df = alerts_df.rename(columns={"date_observed": "date"}) + domain_alerts = alerts_df[ + [ + "organizations_uid", + "sub_domain_uid", + "data_source_uid", + "alert_type", + "message", + "previous_value", + "new_value", + "date", + ] + ] + try: + execute_dnsmonitor_alert_data(domain_alerts, "domain_alerts") + LOGGER.info("Success inserting into domain_alerts - %s", org_code) + except Exception as e: + LOGGER.error("Failed inserting into domain_alerts - %s", org_code) + LOGGER.error(e) + failed.append(f"{org_code} - Failed inserting into dom_alerts") + + # Output any failures + if len(failed) > 0: + LOGGER.error("Failures: %s", failed) diff --git a/src/pe_source/dnstwistscript.py b/src/pe_source/dnstwistscript.py new file mode 100644 index 0000000..9935f85 --- /dev/null +++ b/src/pe_source/dnstwistscript.py @@ -0,0 +1,270 @@ +"""Use DNS twist to fuzz domain names and cross check with a blacklist.""" +# Standard Python Libraries +import contextlib +import datetime +import json +import logging +import pathlib +import traceback + +# Third-Party Libraries +import dnstwist +import dshield +import psycopg2.extras as extras +import requests + +from .data.pe_db.db_query_source import ( + addSubdomain, + connect, + get_data_source_uid, + get_orgs, + getSubdomain, + org_root_domains, +) + +date = datetime.datetime.now().strftime("%Y-%m-%d") +LOGGER = logging.getLogger(__name__) + + +def checkBlocklist(dom, sub_domain_uid, source_uid, pe_org_uid, perm_list): + """Cross reference the dnstwist results with DShield Blocklist.""" + malicious = False + attacks = 0 + reports = 0 + if "original" in dom["fuzzer"]: + return None, perm_list + elif "dns_a" not in dom: + return None, perm_list + else: + if str(dom["dns_a"][0]) == "!ServFail": + return None, perm_list + + # Check IP in Blocklist API + response = requests.get( + "http://api.blocklist.de/api.php?ip=" + str(dom["dns_a"][0]) + ).content + + if str(response) != "b'attacks: 0
<br />reports: 0<br />
'": + try: + malicious = True + attacks = int(str(response).split("attacks: ")[1].split("<")[0]) + reports = int(str(response).split("reports: ")[1].split("<")[0]) + except Exception: + malicious = False + dshield_attacks = 0 + dshield_count = 0 + + # Check IP in DSheild API + try: + results = dshield.ip(str(dom["dns_a"][0]), return_format=dshield.JSON) + results = json.loads(results) + threats = results["ip"]["threatfeeds"] + attacks = results["ip"]["attacks"] + attacks = int(0 if attacks is None else attacks) + malicious = True + dshield_attacks = attacks + dshield_count = len(threats) + except Exception: + dshield_attacks = 0 + dshield_count = 0 + + # Check IPv6 + if "dns_aaaa" not in dom: + dom["dns_aaaa"] = [""] + elif str(dom["dns_aaaa"][0]) == "!ServFail": + dom["dns_aaaa"] = [""] + else: + # Check IP in Blocklist API + response = requests.get( + "http://api.blocklist.de/api.php?ip=" + str(dom["dns_aaaa"][0]) + ).content + if str(response) != "b'attacks: 0
<br />reports: 0<br />
'": + try: + malicious = True + attacks = int(str(response).split("attacks: ")[1].split("<")[0]) + reports = int(str(response).split("reports: ")[1].split("<")[0]) + except Exception: + malicious = False + dshield_attacks = 0 + dshield_count = 0 + try: + results = dshield.ip(str(dom["dns_aaaa"][0]), return_format=dshield.JSON) + results = json.loads(results) + threats = results["ip"]["threatfeeds"] + attacks = results["ip"]["attacks"] + attacks = int(0 if attacks is None else attacks) + malicious = True + dshield_attacks = attacks + dshield_count = len(threats) + except Exception: + dshield_attacks = 0 + dshield_count = 0 + + # Clean-up other fields + if "ssdeep_score" not in dom: + dom["ssdeep_score"] = "" + if "dns_mx" not in dom: + dom["dns_mx"] = [""] + if "dns_ns" not in dom: + dom["dns_ns"] = [""] + + # Ignore duplicates + permutation = dom["domain"] + if permutation in perm_list: + return None, perm_list + else: + perm_list.append(permutation) + + domain_dict = { + "organizations_uid": pe_org_uid, + "data_source_uid": source_uid, + "sub_domain_uid": sub_domain_uid, + "domain_permutation": dom["domain"], + "ipv4": dom["dns_a"][0], + "ipv6": dom["dns_aaaa"][0], + "mail_server": dom["dns_mx"][0], + "name_server": dom["dns_ns"][0], + "fuzzer": dom["fuzzer"], + "date_active": date, + "ssdeep_score": dom["ssdeep_score"], + "malicious": malicious, + "blocklist_attack_count": attacks, + "blocklist_report_count": reports, + "dshield_record_count": dshield_count, + "dshield_attack_count": dshield_attacks, + } + return domain_dict, perm_list + + +def execute_dnstwist(root_domain, test=0): + """Run dnstwist on each root domain.""" + pathtoDict = str(pathlib.Path(__file__).parent.resolve()) + "/data/common_tlds.dict" + dnstwist_result = dnstwist.run( + registered=True, + tld=pathtoDict, + format="json", + threads=8, + domain=root_domain, + ) + if test == 1: + return dnstwist_result + finalorglist = dnstwist_result + [] + for dom in dnstwist_result: + if ("tld-swap" not in dom["fuzzer"]) and ("original" not in dom["fuzzer"]): + secondlist = dnstwist.run( + registered=True, + tld=pathtoDict, + format="json", + threads=8, + domain=dom["domain"], + ) + finalorglist += secondlist + return finalorglist + + +def run_dnstwist(orgs_list): + """Run DNStwist on certain domains and upload findings to database.""" + PE_conn = connect() + source_uid = get_data_source_uid("DNSTwist") + + """ Get P&E Orgs """ + orgs = get_orgs() + failures = [] + for org in orgs: + pe_org_uid = org["org_uid"] + org_name = org["org_name"] + pe_org_id = org["cyhy_db_name"] + + # Only run on orgs in the org list + if pe_org_id in orgs_list or orgs_list == "all": + LOGGER.info("Running DNSTwist on %s", pe_org_id) + + """Collect DNSTwist data from Crossfeed""" + try: + # Get root domains + root_dict = org_root_domains(PE_conn, pe_org_uid) + domain_list = [] + perm_list = [] + for root in root_dict: + root_domain = root["root_domain"] + if root_domain == "Null_Root": + continue + LOGGER.info("\tRunning on root domain: %s", root["root_domain"]) + + with open( + "src/pe_source/data/dnstwist_output.txt", "w" + ) as f, contextlib.redirect_stdout(f): + finalorglist = execute_dnstwist(root_domain) + + # Get subdomain uid + sub_domain = root_domain + try: + sub_domain_uid = getSubdomain(sub_domain) + except Exception: + # TODO: Create custom exceptions. 
+ # Issue 265: https://github.com/cisagov/pe-reports/issues/265 + # Add and then get it + addSubdomain(PE_conn, sub_domain, pe_org_uid, True) + sub_domain_uid = getSubdomain(sub_domain) + + # Check Blocklist + for dom in finalorglist: + domain_dict, perm_list = checkBlocklist( + dom, sub_domain_uid, source_uid, pe_org_uid, perm_list + ) + if domain_dict is not None: + domain_list.append(domain_dict) + except Exception: + # TODO: Create custom exceptions. + # Issue 265: https://github.com/cisagov/pe-reports/issues/265 + LOGGER.info("Failed selecting DNSTwist data.") + failures.append(org_name) + LOGGER.info(traceback.format_exc()) + + """Insert cleaned data into PE database.""" + try: + cursor = PE_conn.cursor() + try: + columns = domain_list[0].keys() + except Exception: + LOGGER.critical("No data in the domain list.") + failures.append(org_name) + continue + table = "domain_permutations" + sql = """INSERT INTO {}({}) VALUES %s + ON CONFLICT (domain_permutation,organizations_uid) + DO UPDATE SET malicious = EXCLUDED.malicious, + blocklist_attack_count = EXCLUDED.blocklist_attack_count, + blocklist_report_count = EXCLUDED.blocklist_report_count, + dshield_record_count = EXCLUDED.dshield_record_count, + dshield_attack_count = EXCLUDED.dshield_attack_count, + data_source_uid = EXCLUDED.data_source_uid, + date_active = EXCLUDED.date_active;""" + + values = [[value for value in dict.values()] for dict in domain_list] + extras.execute_values( + cursor, + sql.format( + table, + ",".join(columns), + ), + values, + ) + PE_conn.commit() + LOGGER.info("Data inserted using execute_values() successfully..") + + except Exception: + # TODO: Create custom exceptions. + # Issue 265: https://github.com/cisagov/pe-reports/issues/265 + LOGGER.info("Failure inserting data into database.") + failures.append(org_name) + LOGGER.info(traceback.format_exc()) + + PE_conn.close() + if failures != []: + LOGGER.error("These orgs failed:") + LOGGER.error(failures) + + +if __name__ == "__main__": + run_dnstwist("all") diff --git a/src/pe_source/intelx_identity.py b/src/pe_source/intelx_identity.py new file mode 100644 index 0000000..d55e7d1 --- /dev/null +++ b/src/pe_source/intelx_identity.py @@ -0,0 +1,296 @@ +"""Collect IntelX credential leak data.""" +# Standard Python Libraries +import datetime +import logging +import sys +import time + +# Third-Party Libraries +import numpy as np +import pandas as pd +import requests + +from .data.pe_db.config import get_params +from .data.pe_db.db_query_source import ( + connect, + get_data_source_uid, + get_intelx_breaches, + get_orgs, + insert_intelx_breaches, + insert_intelx_credentials, + org_root_domains, +) + +# Calculate datetimes for collection period +TODAY = datetime.date.today() +DAYS_BACK = datetime.timedelta(days=16) +START_DATE = (TODAY - DAYS_BACK).strftime("%Y-%m-%d %H:%M:%S") +END_DATE = TODAY.strftime("%Y-%m-%d %H:%M:%S") + + +section = "intelx" +params = get_params(section) +api_key = params[0][1] + +LOGGER = logging.getLogger(__name__) + + +class IntelX: + """Fetch IntelX data.""" + + def __init__(self, orgs_list): + """Initialize IntelX class.""" + LOGGER.info("Initialized IntelX") + self.orgs_list = orgs_list + + def run_intelx(self): + """Run IntelX API calls.""" + orgs_list = self.orgs_list + + pe_orgs = get_orgs() + for pe_org in pe_orgs: + cyhy_org_id = pe_org["cyhy_db_name"] + pe_org_uid = pe_org["org_uid"] + + # Verify the org is in the list of orgs to scan + if cyhy_org_id in orgs_list or orgs_list == "all": + if self.get_credentials(cyhy_org_id, 
pe_org_uid) == 1: + LOGGER.error("Failed to get credentials for %s", cyhy_org_id) + + def get_credentials(self, cyhy_org_id, pe_org_uid): + """Get credentials for a provided org.""" + LOGGER.info("Fetching credential data for %s.", cyhy_org_id) + source_uid = get_data_source_uid("IntelX") + try: + conn = connect() + roots_df = org_root_domains(conn, pe_org_uid) + LOGGER.info("Got roots for %s", cyhy_org_id) + except Exception as e: + LOGGER.error("Failed fetching root domains for %s", cyhy_org_id) + LOGGER.error(e) + return 1 + + leaks_json = self.find_credential_leaks( + roots_df["root_domain"].values.tolist(), START_DATE, END_DATE + ) + if len(leaks_json) < 1: + LOGGER.info("No credentials found for %s", cyhy_org_id) + return 0 + creds_df, breaches_df = self.process_leaks_results(leaks_json, pe_org_uid) + # Insert breach data into the PE database + try: + insert_intelx_breaches(breaches_df) + except Exception as e: + LOGGER.error("Failed inserting IntelX breaches for %s", cyhy_org_id) + LOGGER.error(e) + return 1 + + breach_dict = get_intelx_breaches(source_uid) + breach_dict = dict(breach_dict) + for cred_index, cred_row in creds_df.iterrows(): + breach_uid = breach_dict[cred_row["breach_name"]] + creds_df.at[cred_index, "credential_breaches_uid"] = breach_uid + try: + insert_intelx_credentials(creds_df) + except Exception as e: + LOGGER.error("Failed inserting IntelX credentials for %s", cyhy_org_id) + LOGGER.error(e) + return 1 + return 0 + + def query_identity_api(self, domain, start_date, end_date): + """Create an initial search and return the search id.""" + url = f"https://3.intelx.io/accounts/csv?selector={domain}&k={api_key}&datefrom={start_date}&dateto={end_date}" + payload = {} + headers = {} + attempts = 0 + while True: + try: + response = requests.request("GET", url, headers=headers, data=payload) + break + except requests.exceptions.Timeout: + time.sleep(5) + attempts += 1 + if attempts == 5: + LOGGER.error("IntelX Identity is not responding. Exiting program.") + sys.exit() + LOGGER.info("IntelX Identity API response timed out. Trying again.") + except Exception as e: + LOGGER.error("Error occurred getting search id: %s", e) + return 0 + LOGGER.info("Acquired search id.") + time.sleep(5) + return response.json() + + def get_search_results(self, id): + """Search IntelX for email leaks.""" + url = f"https://3.intelx.io/live/search/result?id={id}&format=1&k={api_key}" + + payload = {} + headers = {} + attempts = 0 + while True: + try: + response = requests.request("GET", url, headers=headers, data=payload) + break + except requests.exceptions.Timeout: + time.sleep(5) + attempts += 1 + if attempts == 5: + LOGGER.error("IntelX Identity is not responding. Exiting program.") + sys.exit() + LOGGER.info("IntelX Identity API response timed out. 
Trying again.") + except Exception as e: + LOGGER.error(f"Error occurred getting search results: {e}") + return 0 + response = response.json() + + return response + + def find_credential_leaks(self, domain_list, start_date, end_date): + """Find leaks for a domain between two dates.""" + all_results_list = [] + for domain in domain_list: + LOGGER.info("Finding credentials leaked associated with %s", domain) + response = self.query_identity_api(domain, start_date, end_date) + if not response: + continue + search_id = response["id"] + while True: + results = self.get_search_results(search_id) + if not results: + break + if results["status"] == 0: + current_results = results["records"] + if current_results: + # Add the root_domain to each result object + LOGGER.info( + "IntelX returned %s more credentials for %s", + len(current_results), + domain, + ) + result = [ + dict(item, **{"root_domain": domain}) + for item in current_results + ] + all_results_list = all_results_list + result + time.sleep(3) + # If still waiting on new results wait + elif results["status"] == 1: + LOGGER.info("IntelX still searching for more credentials") + time.sleep(7) + # if status is two collect the last remaining values and exit loop + elif results["status"] == 2: + current_results = results["records"] + if current_results: + # Add the root_domain to each result object + LOGGER.info( + "IntelX returned %s more credentials for %s", + len(current_results), + domain, + ) + result = [ + dict(item, **{"root_domain": domain}) + for item in current_results + ] + all_results_list = all_results_list + result + break + elif results["status"] == 3: + LOGGER.error("Search id not found") + break + LOGGER.info("Identified %s credential leak combos.", len(all_results_list)) + return all_results_list + + def process_leaks_results(self, leaks_json, org_uid): + """Prepare and format credentials and breach dataframes.""" + # Convert json into a dataframe + all_df = pd.DataFrame.from_dict(leaks_json) + + # format email to all lowercase and remove duplicates + all_df["user"] = all_df["user"].str.lower() + LOGGER.info("%s unique emails found", all_df["user"].nunique()) + LOGGER.info("%s unique posts", all_df["sourceshort"].nunique()) + all_df = all_df.drop_duplicates(subset=["user", "sourceshort"], keep="first") + LOGGER.info( + "%s emails found after removing duplicates in the same post", + len(leaks_json), + ) + + # Format date + all_df["datetime"] = pd.to_datetime(all_df["date"]) + all_df["date"] = all_df["datetime"].dt.strftime("%Y-%m-%d") + + # Create boolean column for if password is included + all_df["password_included"] = np.where( + (pd.isna(all_df["password"])) | (all_df["password"] == ""), 0, 1 + ) + # Create new column for subdomain, organization uid, and data source uid + all_df["sub_domain"] = all_df["user"].str.split("@").str[1] + all_df["organizations_uid"] = org_uid + all_df["data_source_uid"] = get_data_source_uid("IntelX") + + # rename fields to match database + all_df.rename( + columns={ + "user": "email", + "sourceshort": "breach_name", + "date": "modified_date", + "systemid": "intelx_system_id", + "passwordtype": "hash_type", + }, + inplace=True, + ) + + creds_df = all_df[ + [ + "email", + "organizations_uid", + "root_domain", + "sub_domain", + "breach_name", + "modified_date", + "data_source_uid", + "password", + "hash_type", + "intelx_system_id", + ] + ].reset_index(drop=True) + + # group results by breaches + breaches_df = all_df.groupby( + ["breach_name", "modified_date", "bucket", "data_source_uid"] + 
).aggregate({"email": "count", "password_included": "sum"}) + breaches_df = breaches_df.reset_index() + breaches_df["password_included"] = breaches_df["password_included"] > 0 + + breaches_df.rename(columns={"email": "exposed_cred_count"}, inplace=True) + # Build breach description + breaches_df["description"] = ( + breaches_df["breach_name"] + + " was identified on " + + breaches_df["modified_date"] + + ". The post " + + ( + "does not contain" + if breaches_df["password_included"] is True + else "contains" + ) + + " passwords. It falls in the following category: " + + breaches_df["bucket"] + ) + + breaches_df["breach_date"] = breaches_df["modified_date"] + breaches_df["added_date"] = breaches_df["modified_date"] + breaches_df = breaches_df[ + [ + "breach_name", + "description", + "breach_date", + "added_date", + "modified_date", + "password_included", + "data_source_uid", + ] + ] + + return creds_df, breaches_df diff --git a/src/pe_source/pe_source.py b/src/pe_source/pe_source.py new file mode 100644 index 0000000..81b1666 --- /dev/null +++ b/src/pe_source/pe_source.py @@ -0,0 +1,129 @@ +"""A tool for gathering pe source data. + +Usage: + pe-source DATA_SOURCE [--log-level=LEVEL] [--orgs=ORG_LIST] [--cybersix-methods=METHODS] [--soc-med-included] + +Arguments: + DATA_SOURCE Source to collect data from. Valid values are "cybersixgill", + "dnstwist", "hibp", "intelx", "dnsmonitor", and "shodan". + +Options: + -h --help Show this message. + -v --version Show version information. + -l --log-level=LEVEL If specified, then the log level will be set to + the specified value. Valid values are "debug", "info", + "warning", "error", and "critical". [default: info] + -o --orgs=ORG_LIST A comma-separated list of orgs to collect data for. + If not specified, data will be collected for all + orgs in the pe database. Orgs in the list must match the + IDs in the cyhy-db. E.g. DHS,DHS_ICE,DOC + [default: all] + -c --cybersix-methods=METHODS A comma-separated list of cybersixgill methods to run. + If not specified, all will run. Valid values are "alerts", + "credentials", "mentions", "topCVEs". E.g. alerts,mentions. + [default: all] + -s --soc-med-included Include social media posts from cybersixgill in data collection. 
+""" + +# Standard Python Libraries +import logging +import sys +from typing import Any, Dict + +# Third-Party Libraries +import docopt +from schema import And, Schema, SchemaError, Use + +# cisagov Libraries +import pe_source + +from ._version import __version__ +from .cybersixgill import Cybersixgill +from .dnsmonitor import DNSMonitor +from .dnstwistscript import run_dnstwist +from .intelx_identity import IntelX +from .shodan import Shodan + +LOGGER = logging.getLogger(__name__) + + +def run_pe_script(source, orgs_list, cybersix_methods, soc_med_included): + """Collect data from the source specified.""" + # If not "all", separate orgs string into a list of orgs + if orgs_list != "all": + orgs_list = orgs_list.split(",") + # If not "all", separate Cybersixgill methods string into a list + if cybersix_methods == "all": + cybersix_methods = ["alerts", "mentions", "credentials", "topCVEs"] + else: + cybersix_methods = cybersix_methods.split(",") + + LOGGER.info("Running %s on these orgs: %s", source, orgs_list) + + if source == "cybersixgill": + cybersix = Cybersixgill(orgs_list, cybersix_methods, soc_med_included) + cybersix.run_cybersixgill() + elif source == "shodan": + shodan = Shodan(orgs_list) + shodan.run_shodan() + elif source == "dnsmonitor": + dnsMonitor = DNSMonitor(orgs_list) + dnsMonitor.run_dnsMonitor() + elif source == "dnstwist": + run_dnstwist(orgs_list) + elif source == "intelx": + intelx = IntelX(orgs_list) + intelx.run_intelx() + else: + logging.error( + "Not a valid source name. Correct values are cybersixgill or shodan." + ) + sys.exit(1) + + +def main(): + """Set up logging and call the run_pe_script function.""" + args: Dict[str, str] = docopt.docopt(__doc__, version=__version__) + # Validate and convert arguments as needed + schema: Schema = Schema( + { + "--log-level": And( + str, + Use(str.lower), + lambda n: n in ("debug", "info", "warning", "error", "critical"), + error="Possible values for --log-level are " + + "debug, info, warning, error, and critical.", + ), + str: object, # Don't care about other keys, if any + } + ) + + try: + validated_args: Dict[str, Any] = schema.validate(args) + except SchemaError as err: + # Exit because one or more of the arguments were invalid + print(err, file=sys.stderr) + sys.exit(1) + + # Assign validated arguments to variables + log_level: str = validated_args["--log-level"] + + # Set up logging + logging.basicConfig( + filename=pe_source.CENTRAL_LOGGING_FILE, + filemode="a", + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%m/%d/%Y %I:%M:%S", + level=log_level.upper(), + ) + + # Run pe script on specified source + run_pe_script( + validated_args["DATA_SOURCE"], + validated_args["--orgs"], + validated_args["--cybersix-methods"], + validated_args["--soc-med-included"], + ) + + # Stop logging and clean up + logging.shutdown() diff --git a/src/pe_source/shodan.py b/src/pe_source/shodan.py new file mode 100644 index 0000000..b943a05 --- /dev/null +++ b/src/pe_source/shodan.py @@ -0,0 +1,61 @@ +"""Collect Shodan data.""" + +# Standard Python Libraries +import threading + +# Third-Party Libraries +import numpy + +from .data.pe_db.config import shodan_api_init +from .data.pe_db.db_query_source import get_orgs +from .data.shodan.shodan_search import run_shodan_thread + + +class Shodan: + """Fetch Shodan data.""" + + def __init__(self, orgs_list): + """Initialize Shodan class.""" + self.orgs_list = orgs_list + + def run_shodan(self): + """Run Shodan calls.""" + orgs_list = self.orgs_list + + # Get orgs from PE 
database + pe_orgs = get_orgs() + + # Filter orgs if specified + if orgs_list == "all": + pe_orgs_final = pe_orgs + else: + pe_orgs_final = [] + for pe_org in pe_orgs: + if pe_org["cyhy_db_name"] in orgs_list: + pe_orgs_final.append(pe_org) + else: + continue + + # Get list of initialized API objects + api_list = shodan_api_init() + + # Split orgs into chunks. # of chunks = # of valid API keys = # of threads + chunk_size = len(api_list) + chunked_orgs_list = numpy.array_split(numpy.array(pe_orgs_final), chunk_size) + + i = 0 + thread_list = [] + while i < len(chunked_orgs_list): + thread_name = f"Thread {i+1}:" + # Start thread + t = threading.Thread( + target=run_shodan_thread, + args=(api_list[i], chunked_orgs_list[i], thread_name), + ) + t.start() + thread_list.append(t) + i += 1 + + # Wait until all threads finish to continue + for thread in thread_list: + thread.join() diff --git a/tests/data/cybersix_breach_insert.json b/tests/data/cybersix_breach_insert.json new file mode 100644 index 0000000..7cb85d8 --- /dev/null +++ b/tests/data/cybersix_breach_insert.json @@ -0,0 +1,50 @@ +[ + { + "breach_date": "2022-05-07", + "breach_name": "Breach 2", + "data_source_uid": "source_uid", + "description": "Description 2", + "modified_date": "2022-05-07", + "password_included": true + }, + { + "breach_date": "2022-04-15", + "breach_name": "Breach 3", + "data_source_uid": "source_uid", + "description": "Description 3", + "modified_date": "2022-04-15", + "password_included": false + }, + { + "breach_date": "2022-05-10", + "breach_name": "Breach 4", + "data_source_uid": "source_uid", + "description": "Description 4", + "modified_date": "2022-05-10", + "password_included": false + }, + { + "breach_date": "2022-05-06", + "breach_name": "Breach 5", + "data_source_uid": "source_uid", + "description": "Description 5", + "modified_date": "2022-05-06", + "password_included": false + }, + { + "breach_date": "2022-05-08", + "breach_name": "Breach 6", + "data_source_uid": "source_uid", + "description": "Description 6", + "modified_date": "2022-05-08", + "password_included": false + }, + { + "breach_date": "2022-04-26", + "breach_name": "Cybersixgill_1", + "data_source_uid": "source_uid", + "description": "Description 1", + "modified_date": "2022-04-26", + "password_included": true + } +] diff --git a/tests/data/cybersix_creds.json b/tests/data/cybersix_creds.json new file mode 100644 index 0000000..23084e1 --- /dev/null +++ b/tests/data/cybersix_creds.json @@ -0,0 +1,156 @@ +[ + { + "breach_date": "2022-04-26", + "breach_id": 1, + "breach_name": "", + "create_time": "2022-04-26 20:53:13", + "description": "Description 1", + "domain": "sample.com", + "email": "person1@sample.com", + "hash_type": "plain", + "login_id": "", + "name": "", + "password": "askjna", + "phone": "" + }, + { + "breach_date": "2022-04-26", + "breach_id": 1, + "breach_name": "", + "create_time": "2022-04-26 16:15:41", + "description": "Description 1", + "domain": "sample.com", + "email": "person2@sample.com", + "hash_type": "plain", + "login_id": "", + "name": "", + "password": "asfvas", + "phone": "" + }, + { + "breach_date": "2022-04-26", + "breach_id": 1, + "breach_name": "", + "create_time": "2022-04-26 08:29:03", + "description": "Description 1", + "domain": "sample.com", + "email": "person3@sample.com", + "hash_type": "plain", + "login_id": "", + "name": "", + "password": "ksfjnva", + "phone": "" + }, + { + "breach_date": "2022-05-07", + "breach_id": 2, + "breach_name": "Breach 2", + "create_time": "2022-05-07 16:15:40", + 
"description": "Description 2", + "domain": "sample.com", + "email": "person4@sample.com", + "hash_type": "plain", + "login_id": "", + "name": "", + "password": "safvas", + "phone": "" + }, + { + "breach_date": "2022-05-07", + "breach_id": 2, + "breach_name": "Breach 2", + "create_time": "2022-05-07 10:56:11", + "description": "Description 2", + "domain": "sample.com", + "email": "person5@sample.com", + "hash_type": "plain", + "login_id": "None", + "name": "None", + "password": "asdvasv", + "phone": "None" + }, + { + "breach_date": "2022-04-15", + "breach_id": 3, + "breach_name": "Breach 3", + "create_time": "2022-04-15 21:43:27", + "description": "Description 3", + "domain": "sample.com", + "email": "person6@sample.com", + "hash_type": "plain", + "login_id": "None", + "name": "None", + "password": "", + "phone": "None" + }, + { + "breach_date": "2022-04-15", + "breach_id": 3, + "breach_name": "Breach 3", + "create_time": "2022-04-15 09:22:26", + "description": "Description 3", + "domain": "sample.com", + "email": "person7@sample.com", + "hash_type": "plain", + "login_id": "None", + "name": "None", + "password": "", + "phone": "None" + }, + { + "breach_date": "2022-05-10", + "breach_id": 4, + "breach_name": "Breach 4", + "create_time": "2022-04-15 09:39:37", + "description": "Description 4", + "domain": "sample.com", + "email": "person8@sample.com", + "hash_type": "plain", + "login_id": "None", + "name": "None", + "password": "", + "phone": "None" + }, + { + "breach_date": "2022-05-10", + "breach_id": 4, + "breach_name": "Breach 4", + "create_time": "2022-04-15 09:36:13", + "description": "Description 4", + "domain": "sample.com", + "email": "person9@sample.com", + "hash_type": "plain", + "login_id": "None", + "name": "None", + "password": "", + "phone": "None" + }, + { + "breach_date": "2022-05-06", + "breach_id": 5, + "breach_name": "Breach 5", + "create_time": "2022-05-06 09:38:21", + "description": "Description 5", + "domain": "sample.com", + "email": "person10@sample.com", + "hash_type": "plain", + "login_id": "None", + "name": "None", + "password": "", + "phone": "None" + }, + { + "breach_date": "2022-05-08", + "breach_id": 6, + "breach_name": "Breach 6", + "create_time": "2022-05-08 09:27:44", + "description": "Description 6", + "domain": "sample.com", + "email": "person11@sample.com", + "hash_type": "plain", + "login_id": "None", + "name": "None", + "password": "", + "phone": "None" + } +] diff --git a/tests/data/cybersix_creds_insert.json b/tests/data/cybersix_creds_insert.json new file mode 100644 index 0000000..073e0f1 --- /dev/null +++ b/tests/data/cybersix_creds_insert.json @@ -0,0 +1,156 @@ +[ + { + "breach_name": "Cybersixgill_1", + "credential_breaches_uid": "breach_uid_1", + "data_source_uid": "source_uid", + "email": "person1@sample.com", + "hash_type": "plain", + "login_id": "", + "modified_date": "2022-04-26", + "name": "", + "organizations_uid": "pe_org_uid", + "password": "askjna", + "phone": "", + "sub_domain": "sample.com" + }, + { + "breach_name": "Cybersixgill_1", + "credential_breaches_uid": "breach_uid_1", + "data_source_uid": "source_uid", + "email": "person2@sample.com", + "hash_type": "plain", + "login_id": "", + "modified_date": "2022-04-26", + "name": "", + "organizations_uid": "pe_org_uid", + "password": "asfvas", + "phone": "", + "sub_domain": "sample.com" + }, + { + "breach_name": "Cybersixgill_1", + "credential_breaches_uid": "breach_uid_1", + "data_source_uid": "source_uid", + "email": "person3@sample.com", + "hash_type": "plain", + "login_id": "", + 
"modified_date": "2022-04-26", + "name": "", + "organizations_uid": "pe_org_uid", + "password": "ksfjnva", + "phone": "", + "sub_domain": "sample.com" + }, + { + "breach_name": "Breach 2", + "credential_breaches_uid": "breach_uid_2", + "data_source_uid": "source_uid", + "email": "person4@sample.com", + "hash_type": "plain", + "login_id": "", + "modified_date": "2022-05-07", + "name": "", + "organizations_uid": "pe_org_uid", + "password": "safvas", + "phone": "", + "sub_domain": "sample.com" + }, + { + "breach_name": "Breach 2", + "credential_breaches_uid": "breach_uid_2", + "data_source_uid": "source_uid", + "email": "person5@sample.com", + "hash_type": "plain", + "login_id": "None", + "modified_date": "2022-05-07", + "name": "None", + "organizations_uid": "pe_org_uid", + "password": "asdvasv", + "phone": "None", + "sub_domain": "sample.com" + }, + { + "breach_name": "Breach 3", + "credential_breaches_uid": "breach_uid_3", + "data_source_uid": "source_uid", + "email": "person6@sample.com", + "hash_type": "plain", + "login_id": "None", + "modified_date": "2022-04-15", + "name": "None", + "organizations_uid": "pe_org_uid", + "password": "", + "phone": "None", + "sub_domain": "sample.com" + }, + { + "breach_name": "Breach 3", + "credential_breaches_uid": "breach_uid_3", + "data_source_uid": "source_uid", + "email": "person7@sample.com", + "hash_type": "plain", + "login_id": "None", + "modified_date": "2022-04-15", + "name": "None", + "organizations_uid": "pe_org_uid", + "password": "", + "phone": "None", + "sub_domain": "sample.com" + }, + { + "breach_name": "Breach 4", + "credential_breaches_uid": "breach_uid_4", + "data_source_uid": "source_uid", + "email": "person8@sample.com", + "hash_type": "plain", + "login_id": "None", + "modified_date": "2022-05-10", + "name": "None", + "organizations_uid": "pe_org_uid", + "password": "", + "phone": "None", + "sub_domain": "sample.com" + }, + { + "breach_name": "Breach 4", + "credential_breaches_uid": "breach_uid_4", + "data_source_uid": "source_uid", + "email": "person9@sample.com", + "hash_type": "plain", + "login_id": "None", + "modified_date": "2022-05-10", + "name": "None", + "organizations_uid": "pe_org_uid", + "password": "", + "phone": "None", + "sub_domain": "sample.com" + }, + { + "breach_name": "Breach 5", + "credential_breaches_uid": "breach_uid_5", + "data_source_uid": "source_uid", + "email": "person10@sample.com", + "hash_type": "plain", + "login_id": "None", + "modified_date": "2022-05-06", + "name": "None", + "organizations_uid": "pe_org_uid", + "password": "", + "phone": "None", + "sub_domain": "sample.com" + }, + { + "breach_name": "Breach 6", + "credential_breaches_uid": "breach_uid_6", + "data_source_uid": "source_uid", + "email": "person11@sample.com", + "hash_type": "plain", + "login_id": "None", + "modified_date": "2022-05-08", + "name": "None", + "organizations_uid": "pe_org_uid", + "password": "", + "phone": "None", + "sub_domain": "sample.com" + } +] diff --git a/tests/test_example.py b/tests/test_example.py deleted file mode 100644 index f8dea67..0000000 --- a/tests/test_example.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env pytest -vs -"""Tests for example.""" - -# Standard Python Libraries -import logging -import os -import sys -from unittest.mock import patch - -# Third-Party Libraries -import pytest - -# cisagov Libraries -import example - -div_params = [ - (1, 1, 1), - (2, 2, 1), - (0, 1, 0), - (8, 2, 4), -] - -log_levels = ( - "debug", - "info", - "warning", - "error", - "critical", -) - -# define sources of 
version strings -RELEASE_TAG = os.getenv("RELEASE_TAG") -PROJECT_VERSION = example.__version__ - - -def test_stdout_version(capsys): - """Verify that version string sent to stdout agrees with the module version.""" - with pytest.raises(SystemExit): - with patch.object(sys, "argv", ["bogus", "--version"]): - example.example.main() - captured = capsys.readouterr() - assert ( - captured.out == f"{PROJECT_VERSION}\n" - ), "standard output by '--version' should agree with module.__version__" - - -def test_running_as_module(capsys): - """Verify that the __main__.py file loads correctly.""" - with pytest.raises(SystemExit): - with patch.object(sys, "argv", ["bogus", "--version"]): - # F401 is a "Module imported but unused" warning. This import - # emulates how this project would be run as a module. The only thing - # being done by __main__ is importing the main entrypoint of the - # package and running it, so there is nothing to use from this - # import. As a result, we can safely ignore this warning. - # cisagov Libraries - import example.__main__ # noqa: F401 - captured = capsys.readouterr() - assert ( - captured.out == f"{PROJECT_VERSION}\n" - ), "standard output by '--version' should agree with module.__version__" - - -@pytest.mark.skipif( - RELEASE_TAG in [None, ""], reason="this is not a release (RELEASE_TAG not set)" -) -def test_release_version(): - """Verify that release tag version agrees with the module version.""" - assert ( - RELEASE_TAG == f"v{PROJECT_VERSION}" - ), "RELEASE_TAG does not match the project version" - - -@pytest.mark.parametrize("level", log_levels) -def test_log_levels(level): - """Validate commandline log-level arguments.""" - with patch.object(sys, "argv", ["bogus", f"--log-level={level}", "1", "1"]): - with patch.object(logging.root, "handlers", []): - assert ( - logging.root.hasHandlers() is False - ), "root logger should not have handlers yet" - return_code = None - try: - example.example.main() - except SystemExit as sys_exit: - return_code = sys_exit.code - assert return_code is None, "main() should return success" - assert ( - logging.root.hasHandlers() is True - ), "root logger should now have a handler" - assert ( - logging.getLevelName(logging.root.getEffectiveLevel()) == level.upper() - ), f"root logger level should be set to {level.upper()}" - assert return_code is None, "main() should return success" - - -def test_bad_log_level(): - """Validate bad log-level argument returns error.""" - with patch.object(sys, "argv", ["bogus", "--log-level=emergency", "1", "1"]): - return_code = None - try: - example.example.main() - except SystemExit as sys_exit: - return_code = sys_exit.code - assert return_code == 1, "main() should exit with error" - - -@pytest.mark.parametrize("dividend, divisor, quotient", div_params) -def test_division(dividend, divisor, quotient): - """Verify division results.""" - result = example.example_div(dividend, divisor) - assert result == quotient, "result should equal quotient" - - -@pytest.mark.slow -def test_slow_division(): - """Example of using a custom marker. - - This test will only be run if --runslow is passed to pytest. - Look in conftest.py to see how this is implemented. 
- """ - # Standard Python Libraries - import time - - result = example.example_div(256, 16) - time.sleep(4) - assert result == 16, "result should equal be 16" - - -def test_zero_division(): - """Verify that division by zero throws the correct exception.""" - with pytest.raises(ZeroDivisionError): - example.example_div(1, 0) - - -def test_zero_divisor_argument(): - """Verify that a divisor of zero is handled as expected.""" - with patch.object(sys, "argv", ["bogus", "1", "0"]): - return_code = None - try: - example.example.main() - except SystemExit as sys_exit: - return_code = sys_exit.code - assert return_code == 1, "main() should exit with error" diff --git a/tests/test_pe_source.py b/tests/test_pe_source.py new file mode 100644 index 0000000..748810f --- /dev/null +++ b/tests/test_pe_source.py @@ -0,0 +1,368 @@ +"""Tests for the pe-source module.""" + +# Standard Python Libraries +import logging +import sys +from unittest.mock import patch + +# Third-Party Libraries +import pandas as pd +import pytest + +# cisagov Libraries +from pe_source import CENTRAL_LOGGING_FILE +import pe_source.cybersixgill +import pe_source.data.sixgill.api +import pe_source.dnstwistscript +import pe_source.pe_source +import pe_source.shodan + +log_levels = ( + "debug", + "info", + "warning", + "error", + "critical", +) + +# Setup logging to file +logging.basicConfig( + filename=CENTRAL_LOGGING_FILE, + filemode="a", + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%m/%d/%Y %I:%M:%S", + level="INFO", +) + +logger = logging.getLogger(__name__) + + +PROJECT_VERSION = pe_source.__version__ + + +# TODO: Replace current dummy test with useful tests +# Issue - https://github.com/cisagov/pe-reports/issues/3#issue-909531010 + + +def test_source_stdout_version(capsys): + """Verify that version string sent to stdout agrees with the module version.""" + with pytest.raises(SystemExit): + with patch.object(sys, "argv", ["bogus", "--version"]): + pe_source.pe_source.main() + captured = capsys.readouterr() + assert ( + captured.out == f"{PROJECT_VERSION}\n" + ), "standard output by '--version' should agree with module.__version__" + + +def test_source_running_as_module(capsys): + """Verify that the __main__.py file loads correctly.""" + with pytest.raises(SystemExit): + with patch.object(sys, "argv", ["bogus", "--version"]): + # F401 is a "Module imported but unused" warning. This import + # emulates how this project would be run as a module. The only thing + # being done by __main__ is importing the main entrypoint of the + # package and running it, so there is nothing to use from this + # import. As a result, we can safely ignore this warning. 
+ # cisagov Libraries + import pe_source.__main__ # noqa: F401 + captured = capsys.readouterr() + assert ( + captured.out == f"{PROJECT_VERSION}\n" + ), "standard output by '--version' should agree with module.__version__" + + +@pytest.mark.parametrize("level", log_levels) +def test_source_log_levels(level): + """Validate commandline log-level arguments.""" + with patch.object( + sys, + "argv", + [ + "pe-source", + "shodan", + f"--log-level={level}", + ], + ): + with patch.object(logging.root, "handlers", []): + with patch.object(pe_source.shodan.Shodan, "run_shodan"): + assert ( + logging.root.hasHandlers() is False + ), "root logger should not have handlers yet" + return_code = None + try: + pe_source.pe_source.main() + except SystemExit as sys_exit: + return_code = sys_exit.code + assert ( + logging.root.hasHandlers() is True + ), "root logger should now have a handler" + assert ( + logging.getLevelName(logging.root.getEffectiveLevel()) + == level.upper() + ), f"root logger level should be set to {level.upper()}" + assert return_code is None, "main() should return success" + + +def test_source_bad_log_level(): + """Validate bad log-level argument returns error.""" + with patch.object( + sys, + "argv", + [ + "pe-source", + "shodan", + "--log-level=emergency", + ], + ): + return_code = None + try: + pe_source.pe_source.main() + except SystemExit as sys_exit: + return_code = sys_exit.code + assert return_code == 1, "main() should exit with error" + + +# Test source argument +def test_source_is_cybersixgill(): + """Validate source argument runs Cybersixgill.""" + with patch.object( + sys, + "argv", + [ + "pe-source", + "cybersixgill", + ], + ): + with patch.object( + pe_source.cybersixgill.Cybersixgill, "run_cybersixgill" + ) as mock_sixgill: + with patch.object(pe_source.shodan.Shodan, "run_shodan") as mock_shodan: + pe_source.pe_source.main() + mock_sixgill.assert_called_with(), "cybersixgill should be called" + mock_shodan.assert_not_called(), "shodan should not be called" + + +def test_source_is_shodan(): + """Validate source argument runs Shodan.""" + with patch.object( + sys, + "argv", + [ + "pe-source", + "shodan", + ], + ): + with patch.object( + pe_source.cybersixgill.Cybersixgill, "run_cybersixgill" + ) as mock_sixgill: + with patch.object(pe_source.shodan.Shodan, "run_shodan") as mock_shodan: + pe_source.pe_source.main() + mock_shodan.assert_called_with(), "shodan should be called" + mock_sixgill.assert_not_called(), "cybersixgill should not be called" + + +def test_bad_source(): + """Validate bad source argument returns error.""" + with patch.object( + sys, + "argv", + [ + "pe-source", + "bad_source", + ], + ): + return_code = None + try: + pe_source.pe_source.main() + except SystemExit as sys_exit: + return_code = sys_exit.code + assert return_code == 1, "should exit with error" + + +# Cybersixgill +@patch.object(pe_source.cybersixgill.Cybersixgill, "get_topCVEs") +@patch.object(pe_source.cybersixgill.Cybersixgill, "get_credentials") +@patch.object(pe_source.cybersixgill.Cybersixgill, "get_mentions") +@patch.object(pe_source.cybersixgill.Cybersixgill, "get_alerts") +@patch("pe_source.cybersixgill.get_data_source_uid") +@patch("pe_source.cybersixgill.get_sixgill_organizations") +@patch("pe_source.cybersixgill.get_orgs") +def test_cybersix_methods_all( + mock_get_orgs, + mock_get_sixgill_orgs, + mock_get_source_id, + mock_sixgill_alerts, + mock_sixgill_mentions, + mock_sixgill_credentials, + mock_sixgill_topCVEs, +): + """Validate all Cybersixgill methods are called correctly.""" 
+ with patch.object(sys, "argv", ["pe-source", "cybersixgill"]): + mock_get_orgs.return_value = [ + {"org_uid": "pe_org_uid", "org_name": "Test Org", "cyhy_db_name": "TestOrg"} + ] + mock_get_sixgill_orgs.return_value = { + "TestOrg": [ + "role", + "user", + "customer", + "image", + [], + "sixgill_org_id", + ] + } + mock_get_source_id.return_value = "source_uid" + pe_source.pe_source.main() + mock_sixgill_alerts.assert_called_with( + "TestOrg", "sixgill_org_id", "pe_org_uid", "source_uid", False + ) + mock_sixgill_mentions.assert_called_with( + "TestOrg", "sixgill_org_id", "pe_org_uid", "source_uid", False + ) + mock_sixgill_credentials.assert_called_with( + "TestOrg", "sixgill_org_id", "pe_org_uid", "source_uid" + ) + mock_sixgill_topCVEs.assert_called_with("source_uid") + + +@patch.object(pe_source.cybersixgill.Cybersixgill, "get_alerts") +@patch("pe_source.cybersixgill.get_data_source_uid") +@patch("pe_source.cybersixgill.get_sixgill_organizations") +@patch("pe_source.cybersixgill.get_orgs") +def test_cybersix_methods_alerts( + mock_get_orgs, + mock_get_sixgill_orgs, + mock_get_source_id, + mock_sixgill_alerts, +): + """Validate only the Cybersixgill alert method is called.""" + with patch.object( + sys, "argv", ["pe-source", "cybersixgill", "--cybersix-methods=alerts"] + ): + mock_get_orgs.return_value = [ + {"org_uid": "pe_org_uid", "org_name": "Test Org", "cyhy_db_name": "TestOrg"} + ] + mock_get_sixgill_orgs.return_value = { + "TestOrg": [ + "role", + "user", + "customer", + "image", + [], + "sixgill_org_id", + ] + } + mock_get_source_id.return_value = "source_uid" + pe_source.pe_source.main() + mock_sixgill_alerts.assert_called_with( + "TestOrg", "sixgill_org_id", "pe_org_uid", "source_uid", False + ) + + +# Test Credentials +@patch("pe_source.cybersixgill.insert_sixgill_credentials") +@patch("pe_source.cybersixgill.get_breaches") +@patch("pe_source.cybersixgill.insert_sixgill_breaches") +@patch("pe_source.cybersixgill.creds") +@patch("pe_source.cybersixgill.root_domains") +def test_cybersix_credentials( + mock_root_domains, + mock_creds_df, + mock_insert_breaches, + mock_breaches, + mock_insert_creds, +): + """Validate credential breach data is parsed and cleaned correctly.""" + mock_root_domains.return_value = ["sample.com"] + # Mock credentials from cybersixgill + mock_creds_df.return_value = pd.read_json("tests/data/cybersix_creds.json") + mock_breaches.return_value = [ + ("Cybersixgill_1", "breach_uid_1"), + ("Breach 2", "breach_uid_2"), + ("Breach 3", "breach_uid_3"), + ("Breach 4", "breach_uid_4"), + ("Breach 5", "breach_uid_5"), + ("Breach 5", "breach_uid_5"), + ("Breach 6", "breach_uid_6"), + ] + + result = pe_source.cybersixgill.Cybersixgill( + ["TestOrg"], ["credentials"], False + ).get_credentials("org_id", "sixgill_org_id", "pe_org_uid", "source_uid") + + # Assert insert breaches function is called with the correct data + breach_insert_df = pd.read_json("tests/data/cybersix_breach_insert.json") + pd.testing.assert_frame_equal( + mock_insert_breaches.call_args[0][0].sort_index(axis=1), + breach_insert_df.sort_index(axis=1), + ) + # Assert insert credentials function is called with the correct data + creds_insert_df = pd.read_json("tests/data/cybersix_creds_insert.json") + pd.testing.assert_frame_equal( + mock_insert_creds.call_args[0][0].sort_index(axis=1), + creds_insert_df.sort_index(axis=1), + ) + # Assert function completes without errors + assert result == 0 + + +# Test Shodan +@patch("pe_source.shodan.run_shodan_thread") +@patch("pe_source.shodan.shodan_api_init") 
+@patch("pe_source.shodan.get_orgs") +def test_shodan_search( + mock_get_orgs, + mock_shodan_api, + mock_shodan_thread, +): + """Validate Shodan search is called.""" + with patch.object(sys, "argv", ["pe-source", "shodan"]): + mock_get_orgs.return_value = [ + {"org_uid": "pe_org_uid", "org_name": "Test Org", "cyhy_db_name": "TestOrg"} + ] + mock_shodan_api.return_value = ["api-key-1"] + pe_source.pe_source.main() + mock_shodan_thread.assert_called_with( + "api-key-1", + [ + { + "org_uid": "pe_org_uid", + "org_name": "Test Org", + "cyhy_db_name": "TestOrg", + } + ], + "Thread 1:", + ) + + +def test_dnstwistfuzzing(): + """Test if dnstwist is installed correctly.""" + res = pe_source.dnstwistscript.execute_dnstwist("a.com", test=1) + assert len(res) != 0 + assert res[1]["fuzzer"] == "addition" + assert res[1]["domain"] != "" + assert ( + len(res[1]["dns_ns"]) != 0 + ) # all domains returned should be registered so this must have something + + +def test_blocklist(): + """Test if blocklist is working correctly.""" + dom = { + "fuzzer": "addition", + "domain": "a0.com", + "dns_ns": ["liz.ns.cloudflare.com"], + "dns_a": ["104.21.34.160"], + "dns_aaaa": ["2606:4700:3036::6815:22a0"], + "dns_mx": ["alt1.aspmx.l.google.com"], + "ssdeep_score": "", + } + test1, test2 = pe_source.dnstwistscript.checkBlocklist(dom, 1, 1, 1, []) + assert test1["data_source_uid"] == 1 + assert test1["domain_permutation"] == "a0.com" + assert test2[0] == "a0.com" + + +# TODO: Add shodan search once this issue is addressed +# Issue - https://github.com/cisagov/pe-reports/issues/171