Skip to content

Commit

Permalink
Handle 'real' lists and sets in the IOU function (#49)
Browse files Browse the repository at this point in the history
* fix unpacking error: initialise `results, csv_data` as `(None, None)` instead of `None` in `content_gui`

* add missing `MYIPMS_API_PATH` environment variable reference in `modules/crawler.py`

* handle actual lists and sets (not only their string representations) being passed in to `group_indicators`
  • Loading branch information
abargar authored Mar 4, 2024
1 parent 97ffd64 commit 1cc1d79
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 21 deletions.
2 changes: 1 addition & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def content_gui():
# Error message if neither is provided
flash("Please provide at least a title or content query.")
else:
results, csv_data = None
results, csv_data = (None, None)
results, csv_data = content(request)

return render_template('index.html', results=results, csv_data=csv_data, countries=COUNTRIES, languages=LANGUAGES, indicator_metadata=INDICATOR_METADATA)
Expand Down
1 change: 1 addition & 0 deletions modules/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

URLSCAN_API_KEY = os.getenv('URLSCAN_API_KEY')
SCRAPER_API_KEY = os.getenv('SCRAPER_API_KEY')
MYIPMS_API_PATH = os.getenv('MYIPMS_API_PATH', '')

visited = set()

Expand Down
23 changes: 16 additions & 7 deletions modules/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
import numpy as np
import pandas as pd
from pandas.api.types import is_list_like
from pathlib import Path
import traceback
from typing import Dict, Any
Expand All @@ -25,20 +26,28 @@ def basic_preprocess(df: pd.DataFrame, feature: str) -> pd.DataFrame:

return df

def column_contains_list(column: pd.Series) -> bool:
def column_contains_list_string(column: pd.Series) -> bool:
# Note: this works off the assumption that all values will have the same type
return column.iloc[0].startswith("[")
try:
return column.iloc[0].startswith("[")
except AttributeError:
return False

def column_contains_set(column: pd.Series) -> bool:
return column.iloc[0].startswith("{")
def column_contains_set_string(column: pd.Series) -> bool:
try:
return column.iloc[0].startswith("{")
except AttributeError:
return False

def group_indicators(df: pd.DataFrame) -> pd.Series:
df_copy = df.copy() # avoid side effects with ast.literal
if column_contains_list(df_copy[INDICATOR]) or column_contains_set(df_copy[INDICATOR]):
if is_list_like(df[INDICATOR].iloc[0]):
return df.groupby(DOMAIN)[INDICATOR].agg(lambda x: set(chain.from_iterable(x)))
elif column_contains_list_string(df[INDICATOR]) or column_contains_set_string(df[INDICATOR]):
df_copy = df.copy() # avoid side effects with ast.literal
df_copy[INDICATOR] = df_copy[INDICATOR].map(ast.literal_eval)
return df_copy.groupby(DOMAIN)[INDICATOR].agg(lambda x: set(chain.from_iterable(x)))
else:
return df_copy.groupby(DOMAIN)[INDICATOR].apply(set)
return df.groupby(DOMAIN)[INDICATOR].apply(set)



Expand Down
62 changes: 49 additions & 13 deletions tests/test__matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@
)

def feature_group_as_list_1():
return pd.DataFrame(
[
{DOMAIN: "a", INDICATOR_TYPE: INDICATOR_TYPE, INDICATOR: [1, 2, 3]},
{DOMAIN: "b", INDICATOR_TYPE: INDICATOR_TYPE, INDICATOR: [3, 4, 5]},
{DOMAIN: "c", INDICATOR_TYPE: INDICATOR_TYPE, INDICATOR: [4, 5, 6]},
]
)

def feature_group_as_list_str_1():
return pd.DataFrame(
[
{DOMAIN: "a", INDICATOR_TYPE: INDICATOR_TYPE, INDICATOR: "[1, 2, 3]"},
Expand All @@ -24,7 +33,7 @@ def feature_group_as_list_1():
]
)

def feature_group_as_list_2():
def feature_group_as_list_str_2():
return pd.DataFrame(
columns=[DOMAIN, INDICATOR, INDICATOR_TYPE],
data=[
Expand Down Expand Up @@ -96,8 +105,35 @@ def test__find_direct_matches(feature_df, compare_df, expected_results):
@pytest.mark.parametrize(
"feature_df,compare_df,expected_results",
[
( feature_group_as_list_1(),
pytest.param(
feature_group_as_list_str_1(),
feature_group_as_list_str_1(),
pd.DataFrame(
[
{
"domain_name_x": "a",
"domain_name_y": "b",
"match_type": "feature",
"match_value": 0.2,
},
{
"domain_name_x": "a",
"domain_name_y": "c",
"match_type": "feature",
"match_value": 0.0,
},
{
"domain_name_x": "b",
"domain_name_y": "c",
"match_type": "feature",
"match_value": 0.5,
},
]
),
id="listlike strings, same values"),
pytest.param(
feature_group_as_list_1(),
feature_group_as_list_str_1(),
pd.DataFrame(
[
{
Expand All @@ -120,8 +156,8 @@ def test__find_direct_matches(feature_df, compare_df, expected_results):
},
]
),
),
(
id="one list, one listlike string, same values"),
pytest.param(
feature_group_as_string_1(),
feature_group_as_string_1(),
pd.DataFrame(
Expand All @@ -146,14 +182,14 @@ def test__find_direct_matches(feature_df, compare_df, expected_results):
},
]
),
),
(
feature_group_as_list_1(),
feature_group_as_list_2(),
id="two set-like strings, same values"),
pytest.param(
feature_group_as_list_str_1(),
feature_group_as_list_str_2(),
pd.DataFrame(
columns=["domain_name_x", "domain_name_y", "match_type", "match_value"]
),
),
id="two listlike strings, different values"),
],
)
def test__find_iou_matches(feature_df, compare_df, expected_results):
Expand All @@ -175,8 +211,8 @@ def test__parse_certificate_matches():
"feature_df,compare_df,expected_results",
[
(
feature_group_as_list_1(),
feature_group_as_list_1(),
feature_group_as_list_str_1(),
feature_group_as_list_str_1(),
pd.DataFrame(
[
{
Expand Down Expand Up @@ -219,8 +255,8 @@ def test__parse_certificate_matches():
)
),
(
feature_group_as_list_1(),
feature_group_as_list_2(),
feature_group_as_list_str_1(),
feature_group_as_list_str_2(),
pd.DataFrame(
columns=["domain_name_x", "domain_name_y", "match_type", "match_value"],
data=[]
Expand Down

0 comments on commit 1cc1d79

Please sign in to comment.