Commit

Various updates, including language updates and fixing gdelt/copyscape inputs, UI updates
Peter Benzoni committed Jun 6, 2024
1 parent e6e4904 commit 4e65722
Showing 6 changed files with 430 additions and 777 deletions.
130 changes: 61 additions & 69 deletions app.py
@@ -374,9 +374,6 @@ def parse_content_search():
else:
return render_template('index.html', request=request, results=results, csv_data=csv_data, engines=ENGINES, countries=COUNTRIES, languages=LANGUAGES, indicator_metadata=INDICATOR_METADATA)




def content(request, title_query=None, content_query=None):
if request.method == 'POST':
title_query = title_query if title_query is not None else request.form.get('titleQuery')
@@ -428,14 +425,14 @@ def parse_url(request, urlToParse=None):
if request.method == 'POST':
url = urlToParse if urlToParse is not None else request.form.get('url', '')
url = format_url(url)
engines = request.form.getlist('search_engines', [])
engines = request.form.getlist('search_engines')
combineOperator = request.form.get('combineOperator', 'OR')
language = request.form.get('language', 'en')
country = request.form.get('country', 'us')
elif request.method == 'GET':
url = urlToParse if urlToParse is not None else request.args.get('url', '')
url = format_url(url)
engines = request.args.getlist('search_engines', [])
engines = request.args.getlist('search_engines')
combineOperator = request.args.get('combineOperator', 'OR')
language = request.args.get('language', 'en')
country = request.args.get('country', 'us')
@@ -778,7 +775,7 @@ def indicators(request):
if len(selected_type) > 0 and row['indicator_type'] == selected_type:
truncated_row = {key: value[:100] for key, value in row.items()}
data.append(truncated_row)
unique_types = sorted(set(unique_types_list))
unique_types = sorted(set(unique_types_list))
return data, unique_types, selected_type


@@ -907,40 +904,22 @@ def fetch_content_results(title_query, content_query, combineOperator, language,
return results, csv_data

def format_copyscape_output(data):
output = {}
output = []
for article in data:
parsed_url = urlparse(article["url"])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
if domain not in output:
output[domain] = {"count": 0, "links": [],
"concern": False, "source": []}
output[domain]["count"] += 1
output[domain]["links"].append({
"link": article["url"],
output.append({
"url": article["url"],
"title": article["title"],
"snippet": article["textsnippet"],
"count": 1, # Assuming each link is unique and counts as 1
# Placeholder, as the engine is not specified in the data
"engines": ["Plagiarism Checker"]
})
return output

def format_gdelt_output(data):
output = {}
output = []
for article in data.get("articles", []):
parsed_url = urlparse(article["url"])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
if domain not in output:
output[domain] = {"count": 0, "links": [],
"concern": False, "source": []}
output[domain]["count"] += 1
output[domain]["links"].append({
"link": article["url"],
output.append({
"url": article["url"],
"title": article["title"],
"snippet": "",
"count": 1, # Assuming each link is unique and counts as 1
# Placeholder, as the engine is not specified in the data
"engines": ["GDELT"]
})
return output

@@ -974,19 +953,14 @@ def normalize_results(results, engine):
if results is None:
return []
for result in results:
if engine == 'copyscape':
if engine == 'copyscape' or engine == 'gdelt':
parsed_url = urlparse(result['url'])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
normalized_data.append({'domain':domain, 'url': result['url'], 'title': result['title'], 'snippet': result['textsnippet'], 'engine': engine})
elif engine == 'gdelt':
parsed_url = urlparse(result['url'])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
normalized_data.append({'domain':domain, 'url': result['url'], 'title': result['title'], 'snippet': '', 'engine': engine})
normalized_data.append({'domain':domain, 'url': result['url'], 'title': result['title'], 'snippet': result['snippet'], 'engine': [engine]})
else:
parsed_url = urlparse(result['link'])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
normalized_data.append({'domain':domain,'url': result.get('link'), 'title': result.get(
'title'), 'snippet': result.get('snippet') , 'engine': [engine]})
normalized_data.append({'domain':domain,'url': result.get('link'), 'title': result.get('title'), 'snippet': result.get('snippet') , 'engine': [engine]})
return normalized_data

with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -1010,40 +984,58 @@ def normalize_results(results, engine):
# Temporary dictionary to hold the first occurrence index of each URL

url_indexes = {}
for idx in range(len(all_results) - 1, -1, -1):
result = all_results[idx]
url = result['url']
if url in url_indexes:
# This URL has been seen before; merge information and delete this occurrence
first_occurrence_idx = url_indexes[url]
all_results[first_occurrence_idx]['engines'].extend(result['engine'])
all_results[first_occurrence_idx]['link_count'] += 1
all_results[first_occurrence_idx]['score'] = max(
sequence_match_score(all_results[first_occurrence_idx]['title'], result['title']),
sequence_match_score(all_results[first_occurrence_idx]['snippet'], result['snippet'])
)
all_results.pop(idx)
else:
url_indexes[url] = idx
local_source = local_domains_dict.get(result['domain']) or local_domains_dict.get(result['domain'].split('.')[1]) # Check for FQDN and no subdomain
github_source = "statemedia" if urlparse(result['domain']).netloc.strip() in github_domains else None
all_results[idx]['source'] = []
if local_source is not None:
#aggregated_results["source"].append(local_source)
all_results[idx]['source'] = [local_source]
if github_source is not None:
#aggregated_results["source"].append(github_source)
all_results[idx]['source'] = [github_source]
all_results[idx]['link_count'] = 1
all_results[idx]['domain_count'] = 1
all_results[idx]['engines'] = result['engine']
all_results[idx]['score'] = max(sequence_match_score(title_query, all_results[idx]['title']), sequence_match_score(content_query, all_results[idx]['snippet']))

aggregated_results = []
try:
for idx in range(len(all_results)):

result = all_results[idx]
url = result['url']
if url in url_indexes:
# This URL has been seen before; merge its information into the first occurrence and skip it
try:
first_occurrence_idx = url_indexes[url]
aggregated_results[first_occurrence_idx]['engines'].extend(result['engine'])
aggregated_results[first_occurrence_idx]['link_count'] += 1
aggregated_results[first_occurrence_idx]['score'] = max(
aggregated_results[first_occurrence_idx]['score'],
max(
sequence_match_score(title_query, result['title']),
sequence_match_score(content_query, result['snippet']) if result['snippet'] != '' else 0
)
)
if sequence_match_score(result['title'], title_query) > sequence_match_score(aggregated_results[first_occurrence_idx]['title'], title_query):
aggregated_results[first_occurrence_idx]['title'] = result['title']
if sequence_match_score(result['snippet'], content_query) > sequence_match_score(aggregated_results[first_occurrence_idx]['snippet'], content_query):
aggregated_results[first_occurrence_idx]['snippet'] = result['snippet']
except Exception as e:
print(f"Error merging results: {e}")
continue
else:
aggregated_results.append(all_results[idx])
agg_idx = len(aggregated_results) - 1
url_indexes[url] = agg_idx
local_source = local_domains_dict.get(urlparse(result['domain']).netloc.strip()) or local_domains_dict.get(urlparse(result['domain']).netloc.strip().split('.')[1]) # Check for FQDN and no subdomain
github_source = "statemedia" if urlparse(result['domain']).netloc.strip() in github_domains else None
aggregated_results[agg_idx]['source'] = []
if local_source is not None:
aggregated_results[agg_idx]['source'] = local_source
if github_source is not None:
aggregated_results[agg_idx]['source'] = github_source
aggregated_results[agg_idx]['link_count'] = 1
aggregated_results[agg_idx]['domain_count'] = 1
aggregated_results[agg_idx]['engines'] = result['engine']
aggregated_results[agg_idx]['score'] = max(sequence_match_score(title_query, result['title']), sequence_match_score(content_query, result['snippet']) if result['snippet'] != '' else 0)
except Exception as e:
print(f"Error aggregating results: {e}")
app.logger.error(f"Error aggregating results: {e}")
# convert the list of engines to a set to remove duplicates
for result in aggregated_results:
result['engines'] = list(set(result['engines']))

# Sort results by match score, highest first
all_results = sorted(all_results, key=lambda x: x['score'], reverse=True)
aggregated_results = sorted(aggregated_results, key=lambda x: x['score'], reverse=True)

return all_results
return aggregated_results


def customize_params_by_platform(title_query, content_query, combineOperator, language, country):
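
The rewritten aggregation loop above (building `aggregated_results` keyed by URL instead of popping duplicates out of `all_results` while iterating backwards) is the heart of the app.py change. For orientation, a minimal standalone sketch of that dedup-and-merge pattern — with a hypothetical `fuzzy_score` standing in for the app's `sequence_match_score` and toy result dicts — might look like this (not the project's code):

```python
from difflib import SequenceMatcher

def fuzzy_score(a, b):
    # Stand-in for the app's sequence_match_score helper; assumed to return a
    # 0..1 similarity ratio and to treat empty strings as no match.
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def aggregate_by_url(normalized_results, title_query, content_query):
    """Merge results that share a URL, tracking engines, a link count, and the best score."""
    aggregated, url_indexes = [], {}
    for result in normalized_results:
        url = result["url"]
        if url in url_indexes:
            # Seen before: fold this occurrence into the existing entry.
            entry = aggregated[url_indexes[url]]
            entry["engines"].extend(result["engine"])
            entry["link_count"] += 1
            entry["score"] = max(entry["score"],
                                 fuzzy_score(title_query, result["title"]),
                                 fuzzy_score(content_query, result["snippet"]))
        else:
            url_indexes[url] = len(aggregated)
            aggregated.append({
                **result,
                "engines": list(result["engine"]),
                "link_count": 1,
                "score": max(fuzzy_score(title_query, result["title"]),
                             fuzzy_score(content_query, result["snippet"])),
            })
    for entry in aggregated:
        entry["engines"] = sorted(set(entry["engines"]))  # drop duplicate engine names
    return sorted(aggregated, key=lambda e: e["score"], reverse=True)

if __name__ == "__main__":
    demo = [
        {"url": "https://example.com/a", "title": "Story A", "snippet": "some text", "engine": ["google"]},
        {"url": "https://example.com/a", "title": "Story A", "snippet": "some text", "engine": ["gdelt"]},
        {"url": "https://example.com/b", "title": "Story B", "snippet": "", "engine": ["copyscape"]},
    ]
    print(aggregate_by_url(demo, "Story A", "some text"))
```

The design choice mirrors the diff: duplicates are never appended, so no in-place deletion is needed and the index map stays valid for the whole pass.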
42 changes: 24 additions & 18 deletions modules/crawler.py
@@ -31,7 +31,7 @@
from usp.tree import sitemap_tree_for_homepage

from modules.indicator import Indicator
from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS, TRACKING_IDS)
from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS, TRACKING_IDS, CRYPTO_IDS)
from modules.reference import LEAD_GEN_INDICATORS

URLSCAN_API_KEY = os.getenv('URLSCAN_API_KEY', '')
@@ -180,7 +180,7 @@ def parse_sitemaps(url) -> list[Indicator]:
tree = sitemap_tree_for_homepage(url)
logging.info(tree)
entries = set(page.url for page in tree.all_pages())
return [Indicator("4-sitemap_entries", entries)]
return [Indicator("3-sitemap_entries", entries)]

@return_empty_if_fails
def parse_dom_tree(soup) -> list[Indicator]:
@@ -222,7 +222,7 @@ def parse_meta_tags(soup) -> list[Indicator]:
name = meta_tag.get("name")
prop = meta_tag.get("property")
content = meta_tag.get("content")
if name and "verif" in name.lower():
if name and ("verif" in name.lower() or "valid" in name.lower()):
tag_indicators.append(Indicator("1-verification_id", name + "|" + content))
elif name and name in ["twitter:site", "fb:pages"]:
tag_indicators.append(Indicator("3-meta_social", name + "|" + content))
@@ -481,7 +481,7 @@ def get_ipms_ip_indicators(ipms_url) -> list[Indicator]:
@return_empty_if_fails
def parse_body(response) -> list[Indicator]:
text = response.text
return find_uuids(text) + find_wallets(text)
return find_uuids(text)


@return_empty_if_fails
@@ -563,10 +563,13 @@ def add_associated_domains_from_cert(url) -> list[Indicator]:
return []

@return_empty_if_fails
def parse_id_patterns(response, id_patterns: dict[str,str]) -> list[Indicator]:
def parse_id_patterns(response, soup, use_plaintext, id_patterns: dict[str,str]) -> list[Indicator]:
tag_indicators = []
for id_type, pattern in id_patterns.items():
id_indicators = find_with_regex(regex=pattern, text=response.text, indicator_type=id_type)
if use_plaintext:
id_indicators = find_with_regex(regex=pattern, text=soup.get_text(separator=' ', strip=True), indicator_type=id_type)
else:
id_indicators = find_with_regex(regex=pattern, text=response.text, indicator_type=id_type)
tag_indicators.extend(id_indicators)
return tag_indicators

@@ -744,14 +747,14 @@ def detect_and_parse_feed_content(url) -> list[Indicator]:
feed = feedparser.parse(url)
for entry in feed.entries:
feed_indicators.append(
Indicator("4-content-title", entry.title)
Indicator("3-content-title", entry.title)
)
feed_indicators.append(Indicator("4-content-link", entry.link))
feed_indicators.append(
Indicator("4-content-summary", entry.summary)
Indicator("3-content-summary", entry.summary)
)
feed_indicators.append(
Indicator("4-content-published", entry.published)
Indicator("3-content-published", entry.published)
)

return feed_indicators
@@ -777,7 +780,7 @@ def get_outbound_domains(url, soup) -> list[Indicator]:
link_domain = f"{td}.{tsu}"
if link_domain != f"{od}.{osu}":
outbound_domains.add(link_domain)
return [Indicator("4-outbound-domain", outbound_domains) ]
return [Indicator("3-outbound-domain", outbound_domains) ]

# parses <domain>.ads.txt file for associated ad networks, exchanges, and other ad-related entities
def parse_ads_txt(url, soup):
@@ -843,18 +846,20 @@ def crawl(url, run_urlscan=False) -> list[Indicator]:
indicators.extend(parse_id_attributes(soup))
indicators.extend(parse_link_tags(soup))
indicators.extend(parse_footer(soup))
indicators.extend(parse_id_patterns(response=response, id_patterns=EMBEDDED_IDS))
indicators.extend(parse_id_patterns(response=response, id_patterns=FINANCIAL_IDS))
indicators.extend(parse_id_patterns(response=response, id_patterns=SOCIAL_MEDIA_IDS))
indicators.extend(parse_id_patterns(response=response, id_patterns=TRACKING_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=EMBEDDED_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=FINANCIAL_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=SOCIAL_MEDIA_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=TRACKING_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=True, id_patterns=CRYPTO_IDS))

indicators.extend(add_cdn_domains(soup))
indicators.extend(parse_domain_name(url))
indicators.extend(parse_classes(soup))
indicators.extend(get_ipms_indicators(url))
#indicators.extend(get_ipms_indicators(url))
indicators.extend(get_shodan_indicators(url))
indicators.extend(add_associated_domains_from_cert(url))
indicators.extend(get_outbound_domains(url, soup))
indicators.extend(parse_ads_txt(url, response))
#indicators.extend(parse_ads_txt(url, response))
## Uncomment the following if needed
# indicators.extend(add_who_is(url))
# indicators.extend(parse_images(url, soup, response))
@@ -933,7 +938,7 @@ def remove_json_like_strings(text):
def annotate_indicators(indicators_df):
# iterate across the indicators dataframe, if indicator_type is in the keys of FINANCIAL_IDS, add 'financial' to the indicator_annotation column
for index, row in indicators_df.iterrows():
if row['indicator_type'] in FINANCIAL_IDS.keys():
if row['indicator_type'] in FINANCIAL_IDS.keys() or row['indicator_type'] in CRYPTO_IDS.keys():
indicators_df.at[index, 'indicator_annotation'] = 'financial'
elif row['indicator_type'] in EMBEDDED_IDS.keys():
indicators_df.at[index, 'indicator_annotation'] = 'embedded'
@@ -999,7 +1004,7 @@ def write_domain_indicators(domain, indicators, output_file):
type=str,
help="file to save final list of match results",
required=False,
default=os.path.join(".", "indicators_output_dmi.csv"),
default=os.path.join(".", "indicators_output.csv"),
)
logging.basicConfig(
level=logging.INFO,
@@ -1017,6 +1022,7 @@ def write_domain_indicators(domain, indicators, output_file):
domains = input_data[domain_col]
for domain in domains:
try:
print(f"Processing {domain}")
domain_name = get_domain_name(domain)
indicators = crawl(domain, run_urlscan=run_urlscan)
write_domain_indicators(domain_name, indicators, output_file=output_file)
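
The most notable crawler change is the new `use_plaintext` switch on `parse_id_patterns`, which lets the new CRYPTO_IDS regexes run against the page's visible text rather than the raw HTML. A rough, self-contained sketch of that idea — with a made-up `ID_PATTERNS` table and plain tuples instead of `Indicator` objects, assuming `requests` and BeautifulSoup — could be:

```python
import re

import requests
from bs4 import BeautifulSoup

# Hypothetical pattern table in the spirit of TRACKING_IDS / CRYPTO_IDS
# (the project's real regex dicts live in modules/indicators.py).
ID_PATTERNS = {
    "2-ga_id": r"UA-\d{4,10}-\d{1,4}",
    "1-btc_wallet": r"\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}\b",
}

def parse_id_patterns(response, soup, use_plaintext, id_patterns):
    """Return (indicator_type, match) pairs found in raw HTML or in visible text only."""
    found = []
    # Visible text avoids false hits inside markup and scripts (useful for
    # wallet-style regexes); raw HTML catches ids that only appear in
    # attributes or embed snippets.
    haystack = soup.get_text(separator=" ", strip=True) if use_plaintext else response.text
    for indicator_type, pattern in id_patterns.items():
        for match in re.findall(pattern, haystack):
            found.append((indicator_type, match))
    return found

if __name__ == "__main__":
    resp = requests.get("https://example.com", timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")
    print(parse_id_patterns(resp, soup, use_plaintext=True, id_patterns=ID_PATTERNS))
```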
7 changes: 4 additions & 3 deletions modules/matcher.py
@@ -13,7 +13,7 @@
from pandas.api.types import is_list_like

from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS,
TRACKING_IDS)
TRACKING_IDS, CRYPTO_IDS)

## Preprocessing

@@ -300,15 +300,16 @@ def parse_certificate_matches(
"2-urlscanhrefs" : iou_match,
"2-techstack" : iou_match,
"3-footer-text": direct_match,
"4-outbound-domain": iou_match,
"3-outbound-domain": iou_match,
"2-ads_txt": iou_match

}

FEATURE_MATCHING.update({financial_id: direct_match for financial_id in FINANCIAL_IDS})
FEATURE_MATCHING.update({embedded_id: direct_match for embedded_id in EMBEDDED_IDS})
FEATURE_MATCHING.update({social_id: direct_match for social_id in SOCIAL_MEDIA_IDS})
FEATURE_MATCHING.update({tracking_id: direct_match for tracking_id in TRACKING_IDS})
FEATURE_MATCHING.update({crypto_id: direct_match for crypto_id in CRYPTO_IDS})


WHOIS_FEATURES = [
"whois-registrar",
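
In matcher.py the change is just wiring: every CRYPTO_IDS indicator type is registered for exact-value comparison, matching how the financial, embedded, social, and tracking id families are handled, and the outbound-domain key is renamed from `4-outbound-domain` to `3-outbound-domain` to match the crawler. A toy illustration of that dispatch-table pattern, with stand-in matcher functions (not the project's implementations):

```python
# Stand-ins for the matchers in modules/matcher.py; assumed semantics only:
# direct_match compares scalar values, iou_match compares collections by overlap.
def direct_match(a, b):
    return 1.0 if a == b else 0.0

def iou_match(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if (a | b) else 0.0

# Illustrative keys only; the real CRYPTO_IDS dict maps indicator types to regexes.
CRYPTO_IDS = {"1-btc_wallet": None, "1-eth_wallet": None}

FEATURE_MATCHING = {
    "3-footer-text": direct_match,
    "3-outbound-domain": iou_match,
}
# Every crypto indicator type gets exact-value matching, like the other id families.
FEATURE_MATCHING.update({crypto_id: direct_match for crypto_id in CRYPTO_IDS})

print(FEATURE_MATCHING["1-btc_wallet"]("bc1qexample", "bc1qexample"))  # 1.0
```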