Commit

Various updates, including language updates and fixing gdelt/copyscape inputs, UI updates
Peter Benzoni committed Jun 6, 2024
1 parent e6e4904 commit 4e65722
Showing 6 changed files with 430 additions and 777 deletions.
130 changes: 61 additions & 69 deletions app.py
@@ -374,9 +374,6 @@ def parse_content_search():
else:
return render_template('index.html', request=request, results=results, csv_data=csv_data, engines=ENGINES, countries=COUNTRIES, languages=LANGUAGES, indicator_metadata=INDICATOR_METADATA)




def content(request, title_query=None, content_query=None):
if request.method == 'POST':
title_query = title_query if title_query is not None else request.form.get('titleQuery')
@@ -428,14 +425,14 @@ def parse_url(request, urlToParse=None):
if request.method == 'POST':
url = urlToParse if urlToParse is not None else request.form.get('url', '')
url = format_url(url)
engines = request.form.getlist('search_engines', [])
engines = request.form.getlist('search_engines')
combineOperator = request.form.get('combineOperator', 'OR')
language = request.form.get('language', 'en')
country = request.form.get('country', 'us')
elif request.method == 'GET':
url = urlToParse if urlToParse is not None else request.args.get('url', '')
url = format_url(url)
engines = request.args.getlist('search_engines', [])
engines = request.args.getlist('search_engines')
combineOperator = request.args.get('combineOperator', 'OR')
language = request.args.get('language', 'en')
country = request.args.get('country', 'us')
@@ -778,7 +775,7 @@ def indicators(request):
if len(selected_type) > 0 and row['indicator_type'] == selected_type:
truncated_row = {key: value[:100] for key, value in row.items()}
data.append(truncated_row)
unique_types = sorted(set(unique_types_list))
unique_types = sorted(set(unique_types_list))
return data, unique_types, selected_type


@@ -907,40 +904,22 @@ def fetch_content_results(title_query, content_query, combineOperator, language,
return results, csv_data

def format_copyscape_output(data):
output = {}
output = []
for article in data:
parsed_url = urlparse(article["url"])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
if domain not in output:
output[domain] = {"count": 0, "links": [],
"concern": False, "source": []}
output[domain]["count"] += 1
output[domain]["links"].append({
"link": article["url"],
output.append({
"url": article["url"],
"title": article["title"],
"snippet": article["textsnippet"],
"count": 1, # Assuming each link is unique and counts as 1
# Placeholder, as the engine is not specified in the data
"engines": ["Plagiarism Checker"]
})
return output

def format_gdelt_output(data):
output = {}
output = []
for article in data.get("articles", []):
parsed_url = urlparse(article["url"])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
if domain not in output:
output[domain] = {"count": 0, "links": [],
"concern": False, "source": []}
output[domain]["count"] += 1
output[domain]["links"].append({
"link": article["url"],
output.append({
"url": article["url"],
"title": article["title"],
"snippet": "",
"count": 1, # Assuming each link is unique and counts as 1
# Placeholder, as the engine is not specified in the data
"engines": ["GDELT"]
})
return output

@@ -974,19 +953,14 @@ def normalize_results(results, engine):
if results is None:
return []
for result in results:
if engine == 'copyscape':
if engine == 'copyscape' or engine == 'gdelt':
parsed_url = urlparse(result['url'])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
normalized_data.append({'domain':domain, 'url': result['url'], 'title': result['title'], 'snippet': result['textsnippet'], 'engine': engine})
elif engine == 'gdelt':
parsed_url = urlparse(result['url'])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
normalized_data.append({'domain':domain, 'url': result['url'], 'title': result['title'], 'snippet': '', 'engine': engine})
normalized_data.append({'domain':domain, 'url': result['url'], 'title': result['title'], 'snippet': result['snippet'], 'engine': [engine]})
else:
parsed_url = urlparse(result['link'])
domain = f"{parsed_url.scheme}://{parsed_url.netloc}"
normalized_data.append({'domain':domain,'url': result.get('link'), 'title': result.get(
'title'), 'snippet': result.get('snippet') , 'engine': [engine]})
normalized_data.append({'domain':domain,'url': result.get('link'), 'title': result.get('title'), 'snippet': result.get('snippet') , 'engine': [engine]})
return normalized_data

with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -1010,40 +984,58 @@ def normalize_results(results, engine):
# Temporary dictionary to hold the first occurrence index of each URL

url_indexes = {}
for idx in range(len(all_results) - 1, -1, -1):
result = all_results[idx]
url = result['url']
if url in url_indexes:
# This URL has been seen before; merge information and delete this occurrence
first_occurrence_idx = url_indexes[url]
all_results[first_occurrence_idx]['engines'].extend(result['engine'])
all_results[first_occurrence_idx]['link_count'] += 1
all_results[first_occurrence_idx]['score'] = max(
sequence_match_score(all_results[first_occurrence_idx]['title'], result['title']),
sequence_match_score(all_results[first_occurrence_idx]['snippet'], result['snippet'])
)
all_results.pop(idx)
else:
url_indexes[url] = idx
local_source = local_domains_dict.get(result['domain']) or local_domains_dict.get(result['domain'].split('.')[1]) # Check for FQDN and no subdomain
github_source = "statemedia" if urlparse(result['domain']).netloc.strip() in github_domains else None
all_results[idx]['source'] = []
if local_source is not None:
#aggregated_results["source"].append(local_source)
all_results[idx]['source'] = [local_source]
if github_source is not None:
#aggregated_results["source"].append(github_source)
all_results[idx]['source'] = [github_source]
all_results[idx]['link_count'] = 1
all_results[idx]['domain_count'] = 1
all_results[idx]['engines'] = result['engine']
all_results[idx]['score'] = max(sequence_match_score(title_query, all_results[idx]['title']), sequence_match_score(content_query, all_results[idx]['snippet']))

aggregated_results = []
try:
for idx in range(len(all_results)):

result = all_results[idx]
url = result['url']
if url in url_indexes:
# This URL has been seen before; merge its information into the first occurrence and skip it
try:
first_occurrence_idx = url_indexes[url]
aggregated_results[first_occurrence_idx]['engines'].extend(result['engine'])
aggregated_results[first_occurrence_idx]['link_count'] += 1
aggregated_results[first_occurrence_idx]['score'] = max(
aggregated_results[first_occurrence_idx]['score'],
max(
sequence_match_score(title_query, result['title']),
sequence_match_score(content_query, result['snippet']) if result['snippet'] != '' else 0
)
)
if sequence_match_score(result['title'], title_query) > sequence_match_score(aggregated_results[first_occurrence_idx]['title'], title_query):
aggregated_results[first_occurrence_idx]['title'] = result['title']
if sequence_match_score(result['snippet'], content_query) > sequence_match_score(aggregated_results[first_occurrence_idx]['snippet'], content_query):
aggregated_results[first_occurrence_idx]['snippet'] = result['snippet']
except Exception as e:
print(f"Error merging results: {e}")
continue
else:
aggregated_results.append(all_results[idx])
agg_idx = len(aggregated_results) - 1
url_indexes[url] = agg_idx
local_source = local_domains_dict.get(urlparse(result['domain']).netloc.strip()) or local_domains_dict.get(urlparse(result['domain']).netloc.strip().split('.')[1]) # Check for FQDN and no subdomain
github_source = "statemedia" if urlparse(result['domain']).netloc.strip() in github_domains else None
aggregated_results[agg_idx]['source'] = []
if local_source is not None:
aggregated_results[agg_idx]['source'] = local_source
if github_source is not None:
aggregated_results[agg_idx]['source'] = github_source
aggregated_results[agg_idx]['link_count'] = 1
aggregated_results[agg_idx]['domain_count'] = 1
aggregated_results[agg_idx]['engines'] = result['engine']
aggregated_results[agg_idx]['score'] = max(sequence_match_score(title_query, result['title']), sequence_match_score(content_query, result['snippet']) if result['snippet'] != '' else 0)
except Exception as e:
print(f"Error aggregating results: {e}")
app.logger.error(f"Error aggregating results: {e}")
# convert the list of engines to a set to remove duplicates
for result in aggregated_results:
result['engines'] = list(set(result['engines']))

# Sort results by match score, highest first
all_results = sorted(all_results, key=lambda x: x['score'], reverse=True)
aggregated_results = sorted(aggregated_results, key=lambda x: x['score'], reverse=True)

return all_results
return aggregated_results


def customize_params_by_platform(title_query, content_query, combineOperator, language, country):
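
The rewritten aggregation loop above (building `aggregated_results` keyed by URL instead of popping duplicates out of `all_results` while iterating backwards) is the heart of the app.py change. For orientation, a minimal standalone sketch of that dedup-and-merge pattern — with a hypothetical `fuzzy_score` standing in for the app's `sequence_match_score` and toy result dicts — might look like this (not the project's code):

```python
from difflib import SequenceMatcher

def fuzzy_score(a, b):
    # Stand-in for the app's sequence_match_score helper; assumed to return a
    # 0..1 similarity ratio and to treat empty strings as no match.
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def aggregate_by_url(normalized_results, title_query, content_query):
    """Merge results that share a URL, tracking engines, a link count, and the best score."""
    aggregated, url_indexes = [], {}
    for result in normalized_results:
        url = result["url"]
        if url in url_indexes:
            # Seen before: fold this occurrence into the existing entry.
            entry = aggregated[url_indexes[url]]
            entry["engines"].extend(result["engine"])
            entry["link_count"] += 1
            entry["score"] = max(entry["score"],
                                 fuzzy_score(title_query, result["title"]),
                                 fuzzy_score(content_query, result["snippet"]))
        else:
            url_indexes[url] = len(aggregated)
            aggregated.append({
                **result,
                "engines": list(result["engine"]),
                "link_count": 1,
                "score": max(fuzzy_score(title_query, result["title"]),
                             fuzzy_score(content_query, result["snippet"])),
            })
    for entry in aggregated:
        entry["engines"] = sorted(set(entry["engines"]))  # drop duplicate engine names
    return sorted(aggregated, key=lambda e: e["score"], reverse=True)

if __name__ == "__main__":
    demo = [
        {"url": "https://example.com/a", "title": "Story A", "snippet": "some text", "engine": ["google"]},
        {"url": "https://example.com/a", "title": "Story A", "snippet": "some text", "engine": ["gdelt"]},
        {"url": "https://example.com/b", "title": "Story B", "snippet": "", "engine": ["copyscape"]},
    ]
    print(aggregate_by_url(demo, "Story A", "some text"))
```

The design choice mirrors the diff: duplicates are never appended, so no in-place deletion is needed and the index map stays valid for the whole pass.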
42 changes: 24 additions & 18 deletions modules/crawler.py
@@ -31,7 +31,7 @@
from usp.tree import sitemap_tree_for_homepage

from modules.indicator import Indicator
from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS, TRACKING_IDS)
from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS, TRACKING_IDS, CRYPTO_IDS)
from modules.reference import LEAD_GEN_INDICATORS

URLSCAN_API_KEY = os.getenv('URLSCAN_API_KEY', '')
@@ -180,7 +180,7 @@ def parse_sitemaps(url) -> list[Indicator]:
tree = sitemap_tree_for_homepage(url)
logging.info(tree)
entries = set(page.url for page in tree.all_pages())
return [Indicator("4-sitemap_entries", entries)]
return [Indicator("3-sitemap_entries", entries)]

@return_empty_if_fails
def parse_dom_tree(soup) -> list[Indicator]:
@@ -222,7 +222,7 @@ def parse_meta_tags(soup) -> list[Indicator]:
name = meta_tag.get("name")
prop = meta_tag.get("property")
content = meta_tag.get("content")
if name and "verif" in name.lower():
if name and ("verif" in name.lower() or "valid" in name.lower()):
tag_indicators.append(Indicator("1-verification_id", name + "|" + content))
elif name and name in ["twitter:site", "fb:pages"]:
tag_indicators.append(Indicator("3-meta_social", name + "|" + content))
@@ -481,7 +481,7 @@ def get_ipms_ip_indicators(ipms_url) -> list[Indicator]:
@return_empty_if_fails
def parse_body(response) -> list[Indicator]:
text = response.text
return find_uuids(text) + find_wallets(text)
return find_uuids(text)


@return_empty_if_fails
@@ -563,10 +563,13 @@ def add_associated_domains_from_cert(url) -> list[Indicator]:
return []

@return_empty_if_fails
def parse_id_patterns(response, id_patterns: dict[str,str]) -> list[Indicator]:
def parse_id_patterns(response, soup, use_plaintext, id_patterns: dict[str,str]) -> list[Indicator]:
tag_indicators = []
for id_type, pattern in id_patterns.items():
id_indicators = find_with_regex(regex=pattern, text=response.text, indicator_type=id_type)
if use_plaintext:
id_indicators = find_with_regex(regex=pattern, text=soup.get_text(separator=' ', strip=True), indicator_type=id_type)
else:
id_indicators = find_with_regex(regex=pattern, text=response.text, indicator_type=id_type)
tag_indicators.extend(id_indicators)
return tag_indicators

@@ -744,14 +747,14 @@ def detect_and_parse_feed_content(url) -> list[Indicator]:
feed = feedparser.parse(url)
for entry in feed.entries:
feed_indicators.append(
Indicator("4-content-title", entry.title)
Indicator("3-content-title", entry.title)
)
feed_indicators.append(Indicator("4-content-link", entry.link))
feed_indicators.append(
Indicator("4-content-summary", entry.summary)
Indicator("3-content-summary", entry.summary)
)
feed_indicators.append(
Indicator("4-content-published", entry.published)
Indicator("3-content-published", entry.published)
)

return feed_indicators
@@ -777,7 +780,7 @@ def get_outbound_domains(url, soup) -> list[Indicator]:
link_domain = f"{td}.{tsu}"
if link_domain != f"{od}.{osu}":
outbound_domains.add(link_domain)
return [Indicator("4-outbound-domain", outbound_domains) ]
return [Indicator("3-outbound-domain", outbound_domains) ]

# parses <domain>.ads.txt file for associated ad networks, exchanges, and other ad-related entities
def parse_ads_txt(url, soup):
@@ -843,18 +846,20 @@ def crawl(url, run_urlscan=False) -> list[Indicator]:
indicators.extend(parse_id_attributes(soup))
indicators.extend(parse_link_tags(soup))
indicators.extend(parse_footer(soup))
indicators.extend(parse_id_patterns(response=response, id_patterns=EMBEDDED_IDS))
indicators.extend(parse_id_patterns(response=response, id_patterns=FINANCIAL_IDS))
indicators.extend(parse_id_patterns(response=response, id_patterns=SOCIAL_MEDIA_IDS))
indicators.extend(parse_id_patterns(response=response, id_patterns=TRACKING_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=EMBEDDED_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=FINANCIAL_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=SOCIAL_MEDIA_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=TRACKING_IDS))
indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=True, id_patterns=CRYPTO_IDS))

indicators.extend(add_cdn_domains(soup))
indicators.extend(parse_domain_name(url))
indicators.extend(parse_classes(soup))
indicators.extend(get_ipms_indicators(url))
#indicators.extend(get_ipms_indicators(url))
indicators.extend(get_shodan_indicators(url))
indicators.extend(add_associated_domains_from_cert(url))
indicators.extend(get_outbound_domains(url, soup))
indicators.extend(parse_ads_txt(url, response))
#indicators.extend(parse_ads_txt(url, response))
## Uncomment the following if needed
# indicators.extend(add_who_is(url))
# indicators.extend(parse_images(url, soup, response))
@@ -933,7 +938,7 @@ def remove_json_like_strings(text):
def annotate_indicators(indicators_df):
# iterate across the indicators dataframe, if indicator_type is in the keys of FINANCIAL_IDS, add 'financial' to the indicator_annotation column
for index, row in indicators_df.iterrows():
if row['indicator_type'] in FINANCIAL_IDS.keys():
if row['indicator_type'] in FINANCIAL_IDS.keys() or row['indicator_type'] in CRYPTO_IDS.keys():
indicators_df.at[index, 'indicator_annotation'] = 'financial'
elif row['indicator_type'] in EMBEDDED_IDS.keys():
indicators_df.at[index, 'indicator_annotation'] = 'embedded'
@@ -999,7 +1004,7 @@ def write_domain_indicators(domain, indicators, output_file):
type=str,
help="file to save final list of match results",
required=False,
default=os.path.join(".", "indicators_output_dmi.csv"),
default=os.path.join(".", "indicators_output.csv"),
)
logging.basicConfig(
level=logging.INFO,
@@ -1017,6 +1022,7 @@ def write_domain_indicators(domain, indicators, output_file):
domains = input_data[domain_col]
for domain in domains:
try:
print(f"Processing {domain}")
domain_name = get_domain_name(domain)
indicators = crawl(domain, run_urlscan=run_urlscan)
write_domain_indicators(domain_name, indicators, output_file=output_file)
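
The most notable crawler change is the new `use_plaintext` switch on `parse_id_patterns`, which lets the new CRYPTO_IDS regexes run against the page's visible text rather than the raw HTML. A rough, self-contained sketch of that idea — with a made-up `ID_PATTERNS` table and plain tuples instead of `Indicator` objects, assuming `requests` and BeautifulSoup — could be:

```python
import re

import requests
from bs4 import BeautifulSoup

# Hypothetical pattern table in the spirit of TRACKING_IDS / CRYPTO_IDS
# (the project's real regex dicts live in modules/indicators.py).
ID_PATTERNS = {
    "2-ga_id": r"UA-\d{4,10}-\d{1,4}",
    "1-btc_wallet": r"\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}\b",
}

def parse_id_patterns(response, soup, use_plaintext, id_patterns):
    """Return (indicator_type, match) pairs found in raw HTML or in visible text only."""
    found = []
    # Visible text avoids false hits inside markup and scripts (useful for
    # wallet-style regexes); raw HTML catches ids that only appear in
    # attributes or embed snippets.
    haystack = soup.get_text(separator=" ", strip=True) if use_plaintext else response.text
    for indicator_type, pattern in id_patterns.items():
        for match in re.findall(pattern, haystack):
            found.append((indicator_type, match))
    return found

if __name__ == "__main__":
    resp = requests.get("https://example.com", timeout=10)
    soup = BeautifulSoup(resp.text, "html.parser")
    print(parse_id_patterns(resp, soup, use_plaintext=True, id_patterns=ID_PATTERNS))
```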
7 changes: 4 additions & 3 deletions modules/matcher.py
@@ -13,7 +13,7 @@
from pandas.api.types import is_list_like

from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS,
TRACKING_IDS)
TRACKING_IDS, CRYPTO_IDS)

## Preprocessing

@@ -300,15 +300,16 @@ def parse_certificate_matches(
"2-urlscanhrefs" : iou_match,
"2-techstack" : iou_match,
"3-footer-text": direct_match,
"4-outbound-domain": iou_match,
"3-outbound-domain": iou_match,
"2-ads_txt": iou_match

}

FEATURE_MATCHING.update({financial_id: direct_match for financial_id in FINANCIAL_IDS})
FEATURE_MATCHING.update({embedded_id: direct_match for embedded_id in EMBEDDED_IDS})
FEATURE_MATCHING.update({social_id: direct_match for social_id in SOCIAL_MEDIA_IDS})
FEATURE_MATCHING.update({tracking_id: direct_match for tracking_id in TRACKING_IDS})
FEATURE_MATCHING.update({crypto_id: direct_match for crypto_id in CRYPTO_IDS})


WHOIS_FEATURES = [
"whois-registrar",
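
In matcher.py the change is just wiring: every CRYPTO_IDS indicator type is registered for exact-value comparison, matching how the financial, embedded, social, and tracking id families are handled, and the outbound-domain key is renamed from `4-outbound-domain` to `3-outbound-domain` to match the crawler. A toy illustration of that dispatch-table pattern, with stand-in matcher functions (not the project's implementations):

```python
# Stand-ins for the matchers in modules/matcher.py; assumed semantics only:
# direct_match compares scalar values, iou_match compares collections by overlap.
def direct_match(a, b):
    return 1.0 if a == b else 0.0

def iou_match(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if (a | b) else 0.0

# Illustrative keys only; the real CRYPTO_IDS dict maps indicator types to regexes.
CRYPTO_IDS = {"1-btc_wallet": None, "1-eth_wallet": None}

FEATURE_MATCHING = {
    "3-footer-text": direct_match,
    "3-outbound-domain": iou_match,
}
# Every crypto indicator type gets exact-value matching, like the other id families.
FEATURE_MATCHING.update({crypto_id: direct_match for crypto_id in CRYPTO_IDS})

print(FEATURE_MATCHING["1-btc_wallet"]("bc1qexample", "bc1qexample"))  # 1.0
```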