From 1cc1d79650615234a277939a6a7e3472986314d6 Mon Sep 17 00:00:00 2001 From: Alicia Bargar Date: Mon, 4 Mar 2024 20:58:09 +0100 Subject: [PATCH] Handle 'real' lists and sets in the IOU function (#49) * unpacking error * add missing env variable reference * handle actual lists and sets being passed in --- app.py | 2 +- modules/crawler.py | 1 + modules/matcher.py | 23 +++++++++++----- tests/test__matcher.py | 62 +++++++++++++++++++++++++++++++++--------- 4 files changed, 67 insertions(+), 21 deletions(-) diff --git a/app.py b/app.py index 3fe23be..34b2867 100644 --- a/app.py +++ b/app.py @@ -257,7 +257,7 @@ def content_gui(): # Error message if neither is provided flash("Please provide at least a title or content query.") else: - results, csv_data = None + results, csv_data = (None, None) results, csv_data = content(request) return render_template('index.html', results=results, csv_data=csv_data, countries=COUNTRIES, languages=LANGUAGES, indicator_metadata=INDICATOR_METADATA) diff --git a/modules/crawler.py b/modules/crawler.py index 8175912..8c0d384 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -30,6 +30,7 @@ URLSCAN_API_KEY = os.getenv('URLSCAN_API_KEY') SCRAPER_API_KEY = os.getenv('SCRAPER_API_KEY') +MYIPMS_API_PATH = os.getenv('MYIPMS_API_PATH', '') visited = set() diff --git a/modules/matcher.py b/modules/matcher.py index 1aa5f45..e12aee1 100644 --- a/modules/matcher.py +++ b/modules/matcher.py @@ -6,6 +6,7 @@ import logging import numpy as np import pandas as pd +from pandas.api.types import is_list_like from pathlib import Path import traceback from typing import Dict, Any @@ -25,20 +26,28 @@ def basic_preprocess(df: pd.DataFrame, feature: str) -> pd.DataFrame: return df -def column_contains_list(column: pd.Series) -> bool: +def column_contains_list_string(column: pd.Series) -> bool: # Note: this works off the assumption that all values will have the same type - return column.iloc[0].startswith("[") + try: + return column.iloc[0].startswith("[") + except AttributeError: + return False -def column_contains_set(column: pd.Series) -> bool: - return column.iloc[0].startswith("{") +def column_contains_set_string(column: pd.Series) -> bool: + try: + return column.iloc[0].startswith("{") + except AttributeError: + return False def group_indicators(df: pd.DataFrame) -> pd.Series: - df_copy = df.copy() # avoid side effects with ast.literal - if column_contains_list(df_copy[INDICATOR]) or column_contains_set(df_copy[INDICATOR]): + if is_list_like(df[INDICATOR].iloc[0]): + return df.groupby(DOMAIN)[INDICATOR].agg(lambda x: set(chain.from_iterable(x))) + elif column_contains_list_string(df[INDICATOR]) or column_contains_set_string(df[INDICATOR]): + df_copy = df.copy() # avoid side effects with ast.literal df_copy[INDICATOR] = df_copy[INDICATOR].map(ast.literal_eval) return df_copy.groupby(DOMAIN)[INDICATOR].agg(lambda x: set(chain.from_iterable(x))) else: - return df_copy.groupby(DOMAIN)[INDICATOR].apply(set) + return df.groupby(DOMAIN)[INDICATOR].apply(set) diff --git a/tests/test__matcher.py b/tests/test__matcher.py index f885cd7..aa6eac9 100644 --- a/tests/test__matcher.py +++ b/tests/test__matcher.py @@ -16,6 +16,15 @@ ) def feature_group_as_list_1(): + return pd.DataFrame( + [ + {DOMAIN: "a", INDICATOR_TYPE: INDICATOR_TYPE, INDICATOR: [1, 2, 3]}, + {DOMAIN: "b", INDICATOR_TYPE: INDICATOR_TYPE, INDICATOR: [3, 4, 5]}, + {DOMAIN: "c", INDICATOR_TYPE: INDICATOR_TYPE, INDICATOR: [4, 5, 6]}, + ] + ) + +def feature_group_as_list_str_1(): return pd.DataFrame( [ {DOMAIN: "a", INDICATOR_TYPE: INDICATOR_TYPE, INDICATOR: "[1, 2, 3]"}, @@ -24,7 +33,7 @@ def feature_group_as_list_1(): ] ) -def feature_group_as_list_2(): +def feature_group_as_list_str_2(): return pd.DataFrame( columns=[DOMAIN, INDICATOR, INDICATOR_TYPE], data=[ @@ -96,8 +105,35 @@ def test__find_direct_matches(feature_df, compare_df, expected_results): @pytest.mark.parametrize( "feature_df,compare_df,expected_results", [ - ( feature_group_as_list_1(), + pytest.param( + feature_group_as_list_str_1(), + feature_group_as_list_str_1(), + pd.DataFrame( + [ + { + "domain_name_x": "a", + "domain_name_y": "b", + "match_type": "feature", + "match_value": 0.2, + }, + { + "domain_name_x": "a", + "domain_name_y": "c", + "match_type": "feature", + "match_value": 0.0, + }, + { + "domain_name_x": "b", + "domain_name_y": "c", + "match_type": "feature", + "match_value": 0.5, + }, + ] + ), + id="listlike strings, same values"), + pytest.param( feature_group_as_list_1(), + feature_group_as_list_str_1(), pd.DataFrame( [ { @@ -120,8 +156,8 @@ def test__find_direct_matches(feature_df, compare_df, expected_results): }, ] ), - ), - ( + id="one list, one listlike string, same values"), + pytest.param( feature_group_as_string_1(), feature_group_as_string_1(), pd.DataFrame( @@ -146,14 +182,14 @@ def test__find_direct_matches(feature_df, compare_df, expected_results): }, ] ), - ), - ( - feature_group_as_list_1(), - feature_group_as_list_2(), + id="two set-like strings, same values"), + pytest.param( + feature_group_as_list_str_1(), + feature_group_as_list_str_2(), pd.DataFrame( columns=["domain_name_x", "domain_name_y", "match_type", "match_value"] ), - ), + id="two listlike strings, different values"), ], ) def test__find_iou_matches(feature_df, compare_df, expected_results): @@ -175,8 +211,8 @@ def test__parse_certificate_matches(): "feature_df,compare_df,expected_results", [ ( - feature_group_as_list_1(), - feature_group_as_list_1(), + feature_group_as_list_str_1(), + feature_group_as_list_str_1(), pd.DataFrame( [ { @@ -219,8 +255,8 @@ def test__parse_certificate_matches(): ) ), ( - feature_group_as_list_1(), - feature_group_as_list_2(), + feature_group_as_list_str_1(), + feature_group_as_list_str_2(), pd.DataFrame( columns=["domain_name_x", "domain_name_y", "match_type", "match_value"], data=[]