From 4e6572283cf0d3b462164fca1085267aac7ab7e8 Mon Sep 17 00:00:00 2001 From: Peter Benzoni Date: Thu, 6 Jun 2024 17:38:28 -0400 Subject: [PATCH] Various updates, including language updates and fixing gdelt/copyscape inputs, UI updates --- app.py | 130 +++++---- modules/crawler.py | 42 +-- modules/matcher.py | 7 +- modules/reference.py | 26 ++ templates/about.html | 607 ++++++++----------------------------------- templates/index.html | 395 +++++++++++++++------------- 6 files changed, 430 insertions(+), 777 deletions(-) diff --git a/app.py b/app.py index 41bb094..33a5acd 100644 --- a/app.py +++ b/app.py @@ -374,9 +374,6 @@ def parse_content_search(): else: return render_template('index.html', request=request, results=results, csv_data=csv_data, engines=ENGINES, countries=COUNTRIES, languages=LANGUAGES, indicator_metadata=INDICATOR_METADATA) - - - def content(request, title_query=None, content_query=None): if request.method == 'POST': title_query = title_query if title_query is not None else request.form.get('titleQuery') @@ -428,14 +425,14 @@ def parse_url(request, urlToParse=None): if request.method == 'POST': url = urlToParse if urlToParse is not None else request.form.get('url', '') url = format_url(url) - engines = request.form.getlist('search_engines', []) + engines = request.form.getlist('search_engines') combineOperator = request.form.get('combineOperator', 'OR') language = request.form.get('language', 'en') country = request.form.get('country', 'us') elif request.method == 'GET': url = urlToParse if urlToParse is not None else request.args.get('url', '') url = format_url(url) - engines = request.args.getlist('search_engines', []) + engines = request.args.getlist('search_engines') combineOperator = request.args.get('combineOperator', 'OR') language = request.args.get('language', 'en') country = request.args.get('country', 'us') @@ -778,7 +775,7 @@ def indicators(request): if len(selected_type) > 0 and row['indicator_type'] == selected_type: truncated_row = 
{key: value[:100] for key, value in row.items()} data.append(truncated_row) - unique_types = sorted(set(unique_types_list)) + unique_types = sorted(set(unique_types_list)) return data, unique_types, selected_type @@ -907,40 +904,22 @@ def fetch_content_results(title_query, content_query, combineOperator, language, return results, csv_data def format_copyscape_output(data): - output = {} + output = [] for article in data: - parsed_url = urlparse(article["url"]) - domain = f"{parsed_url.scheme}://{parsed_url.netloc}" - if domain not in output: - output[domain] = {"count": 0, "links": [], - "concern": False, "source": []} - output[domain]["count"] += 1 - output[domain]["links"].append({ - "link": article["url"], + output.append({ + "url": article["url"], "title": article["title"], "snippet": article["textsnippet"], - "count": 1, # Assuming each link is unique and counts as 1 - # Placeholder, as the engine is not specified in the data - "engines": ["Plagiarism Checker"] }) return output def format_gdelt_output(data): - output = {} + output = [] for article in data.get("articles", []): - parsed_url = urlparse(article["url"]) - domain = f"{parsed_url.scheme}://{parsed_url.netloc}" - if domain not in output: - output[domain] = {"count": 0, "links": [], - "concern": False, "source": []} - output[domain]["count"] += 1 - output[domain]["links"].append({ - "link": article["url"], + output.append({ + "url": article["url"], "title": article["title"], "snippet": "", - "count": 1, # Assuming each link is unique and counts as 1 - # Placeholder, as the engine is not specified in the data - "engines": ["GDELT"] }) return output @@ -974,19 +953,14 @@ def normalize_results(results, engine): if results is None: return [] for result in results: - if engine == 'copyscape': + if engine == 'copyscape' or engine == 'gdelt': parsed_url = urlparse(result['url']) domain = f"{parsed_url.scheme}://{parsed_url.netloc}" - normalized_data.append({'domain':domain, 'url': result['url'], 'title': 
result['title'], 'snippet': result['textsnippet'], 'engine': engine}) - elif engine == 'gdelt': - parsed_url = urlparse(result['url']) - domain = f"{parsed_url.scheme}://{parsed_url.netloc}" - normalized_data.append({'domain':domain, 'url': result['url'], 'title': result['title'], 'snippet': '', 'engine': engine}) + normalized_data.append({'domain':domain, 'url': result['url'], 'title': result['title'], 'snippet': result['snippet'], 'engine': [engine]}) else: parsed_url = urlparse(result['link']) domain = f"{parsed_url.scheme}://{parsed_url.netloc}" - normalized_data.append({'domain':domain,'url': result.get('link'), 'title': result.get( - 'title'), 'snippet': result.get('snippet') , 'engine': [engine]}) + normalized_data.append({'domain':domain,'url': result.get('link'), 'title': result.get('title'), 'snippet': result.get('snippet') , 'engine': [engine]}) return normalized_data with concurrent.futures.ThreadPoolExecutor() as executor: @@ -1010,40 +984,58 @@ def normalize_results(results, engine): # Temporary dictionary to hold the first occurrence index of each URL url_indexes = {} - for idx in range(len(all_results) - 1, -1, -1): - result = all_results[idx] - url = result['url'] - if url in url_indexes: - # This URL has been seen before; merge information and delete this occurrence - first_occurrence_idx = url_indexes[url] - all_results[first_occurrence_idx]['engines'].extend(result['engine']) - all_results[first_occurrence_idx]['link_count'] += 1 - all_results[first_occurrence_idx]['score'] = max( - sequence_match_score(all_results[first_occurrence_idx]['title'], result['title']), - sequence_match_score(all_results[first_occurrence_idx]['snippet'], result['snippet']) - ) - all_results.pop(idx) - else: - url_indexes[url] = idx - local_source = local_domains_dict.get(result['domain']) or local_domains_dict.get(result['domain'].split('.')[1]) # Check for FQDN and no subdomain - github_source = "statemedia" if urlparse(result['domain']).netloc.strip() in 
github_domains else None - all_results[idx]['source'] = [] - if local_source is not None: - #aggregated_results["source"].append(local_source) - all_results[idx]['source'] = [local_source] - if github_source is not None: - #aggregated_results["source"].append(github_source) - all_results[idx]['source'] = [github_source] - all_results[idx]['link_count'] = 1 - all_results[idx]['domain_count'] = 1 - all_results[idx]['engines'] = result['engine'] - all_results[idx]['score'] = max(sequence_match_score(title_query, all_results[idx]['title']), sequence_match_score(content_query, all_results[idx]['snippet'])) - + aggregated_results = [] + try: + for idx in range(len(all_results) - 1): + result = all_results[idx] + url = result['url'] + if url in url_indexes: + # This URL has been seen before; merge information and delete this occurrence + try: + first_occurrence_idx = url_indexes[url] + aggregated_results[first_occurrence_idx]['engines'].extend(result['engine']) + aggregated_results[first_occurrence_idx]['link_count'] += 1 + aggregated_results[first_occurrence_idx]['score'] = max( + aggregated_results[first_occurrence_idx]['score'], + max( + sequence_match_score(title_query, result['title']), + sequence_match_score(content_query, result['snippet']) if result['snippet'] != '' else 0 + ) + ) + if sequence_match_score(result['title'], title_query) > sequence_match_score(aggregated_results[first_occurrence_idx]['title'], title_query): + aggregated_results[first_occurrence_idx]['title'] = result['title'] + if sequence_match_score(result['snippet'], content_query) > sequence_match_score(aggregated_results[first_occurrence_idx]['snippet'], content_query): + aggregated_results[first_occurrence_idx]['snippet'] = result['snippet'] + except Exception as e: + print(f"Error merging results: {e}") + continue + else: + aggregated_results.append(all_results[idx]) + agg_idx = len(aggregated_results) - 1 + url_indexes[url] = agg_idx + local_source = 
local_domains_dict.get(urlparse(result['domain']).netloc.strip()) or local_domains_dict.get(urlparse(result['domain']).netloc.strip().split('.')[1]) # Check for FQDN and no subdomain + github_source = "statemedia" if urlparse(result['domain']).netloc.strip() in github_domains else None + aggregated_results[agg_idx]['source'] = [] + if local_source is not None: + aggregated_results[agg_idx]['source'] = local_source + if github_source is not None: + aggregated_results[agg_idx]['source'] = github_source + aggregated_results[agg_idx]['link_count'] = 1 + aggregated_results[agg_idx]['domain_count'] = 1 + aggregated_results[agg_idx]['engines'] = result['engine'] + aggregated_results[agg_idx]['score'] = max(sequence_match_score(title_query, result['title']), sequence_match_score(content_query, result['snippet']) if result['snippet'] != '' else 0) + except Exception as e: + print(f"Error aggregating results: {e}") + app.logger.error(f"Error aggregating results: {e}") + # convet list of engines to set to delete duplicates + for result in aggregated_results: + result['engines'] = list(set(result['engines'])) + # Assuming flattened_data is your list of dictionaries - all_results = sorted(all_results, key=lambda x: x['score'], reverse=True) + aggregated_results = sorted(aggregated_results, key=lambda x: x['score'], reverse=True) - return all_results + return aggregated_results def customize_params_by_platform(title_query, content_query, combineOperator, language, country): diff --git a/modules/crawler.py b/modules/crawler.py index 89c6adb..003db7e 100644 --- a/modules/crawler.py +++ b/modules/crawler.py @@ -31,7 +31,7 @@ from usp.tree import sitemap_tree_for_homepage from modules.indicator import Indicator -from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS, TRACKING_IDS) +from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS, TRACKING_IDS, CRYPTO_IDS) from modules.reference import LEAD_GEN_INDICATORS URLSCAN_API_KEY = 
os.getenv('URLSCAN_API_KEY', '') @@ -180,7 +180,7 @@ def parse_sitemaps(url) -> list[Indicator]: tree = sitemap_tree_for_homepage(url) logging.info(tree) entries = set(page.url for page in tree.all_pages()) - return [Indicator("4-sitemap_entries", entries)] + return [Indicator("3-sitemap_entries", entries)] @return_empty_if_fails def parse_dom_tree(soup) -> list[Indicator]: @@ -222,7 +222,7 @@ def parse_meta_tags(soup) -> list[Indicator]: name = meta_tag.get("name") prop = meta_tag.get("property") content = meta_tag.get("content") - if name and "verif" in name.lower(): + if name and ("verif" in name.lower() or "valid" in name.lower()): tag_indicators.append(Indicator("1-verification_id", name + "|" + content)) elif name and name in ["twitter:site", "fb:pages"]: tag_indicators.append(Indicator("3-meta_social", name + "|" + content)) @@ -481,7 +481,7 @@ def get_ipms_ip_indicators(ipms_url) -> list[Indicator]: @return_empty_if_fails def parse_body(response) -> list[Indicator]: text = response.text - return find_uuids(text) + find_wallets(text) + return find_uuids(text) @return_empty_if_fails @@ -563,10 +563,13 @@ def add_associated_domains_from_cert(url) -> list[Indicator]: return [] @return_empty_if_fails -def parse_id_patterns(response, id_patterns: dict[str,str]) -> list[Indicator]: +def parse_id_patterns(response, soup, use_plaintext, id_patterns: dict[str,str]) -> list[Indicator]: tag_indicators = [] for id_type, pattern in id_patterns.items(): - id_indicators = find_with_regex(regex=pattern, text=response.text, indicator_type=id_type) + if use_plaintext: + id_indicators = find_with_regex(regex=pattern, text=soup.get_text(separator=' ', strip=True), indicator_type=id_type) + else: + id_indicators = find_with_regex(regex=pattern, text=response.text, indicator_type=id_type) tag_indicators.extend(id_indicators) return tag_indicators @@ -744,14 +747,14 @@ def detect_and_parse_feed_content(url) -> list[Indicator]: feed = feedparser.parse(url) for entry in 
feed.entries: feed_indicators.append( - Indicator("4-content-title", entry.title) + Indicator("3-content-title", entry.title) ) feed_indicators.append(Indicator("4-content-link", entry.link)) feed_indicators.append( - Indicator("4-content-summary", entry.summary) + Indicator("3-content-summary", entry.summary) ) feed_indicators.append( - Indicator("4-content-published", entry.published) + Indicator("3-content-published", entry.published) ) return feed_indicators @@ -777,7 +780,7 @@ def get_outbound_domains(url, soup) -> list[Indicator]: link_domain = f"{td}.{tsu}" if link_domain != f"{od}.{osu}": outbound_domains.add(link_domain) - return [Indicator("4-outbound-domain", outbound_domains) ] + return [Indicator("3-outbound-domain", outbound_domains) ] # parses .ads.txt file for associated ad networks, exchanges, and other ad-related entities def parse_ads_txt(url, soup): @@ -843,18 +846,20 @@ def crawl(url, run_urlscan=False) -> list[Indicator]: indicators.extend(parse_id_attributes(soup)) indicators.extend(parse_link_tags(soup)) indicators.extend(parse_footer(soup)) - indicators.extend(parse_id_patterns(response=response, id_patterns=EMBEDDED_IDS)) - indicators.extend(parse_id_patterns(response=response, id_patterns=FINANCIAL_IDS)) - indicators.extend(parse_id_patterns(response=response, id_patterns=SOCIAL_MEDIA_IDS)) - indicators.extend(parse_id_patterns(response=response, id_patterns=TRACKING_IDS)) + indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=EMBEDDED_IDS)) + indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=FINANCIAL_IDS)) + indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=SOCIAL_MEDIA_IDS)) + indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=False, id_patterns=TRACKING_IDS)) + indicators.extend(parse_id_patterns(response=response, soup=soup, use_plaintext=True, 
id_patterns=CRYPTO_IDS)) + indicators.extend(add_cdn_domains(soup)) indicators.extend(parse_domain_name(url)) indicators.extend(parse_classes(soup)) - indicators.extend(get_ipms_indicators(url)) + #indicators.extend(get_ipms_indicators(url)) indicators.extend(get_shodan_indicators(url)) indicators.extend(add_associated_domains_from_cert(url)) indicators.extend(get_outbound_domains(url, soup)) - indicators.extend(parse_ads_txt(url, response)) + #indicators.extend(parse_ads_txt(url, response)) ## Uncomment the following if needed # indicators.extend(add_who_is(url)) # indicators.extend(parse_images(url, soup, response)) @@ -933,7 +938,7 @@ def remove_json_like_strings(text): def annotate_indicators(indicators_df): # iterate across the indicators dataframe, if indicator_type is in the keys of FINANCIAL_IDS, add 'financial' to the indicator_annotation column for index, row in indicators_df.iterrows(): - if row['indicator_type'] in FINANCIAL_IDS.keys(): + if row['indicator_type'] in FINANCIAL_IDS.keys() or row['indicator_type'] in CRYPTO_IDS.keys(): indicators_df.at[index, 'indicator_annotation'] = 'financial' elif row['indicator_type'] in EMBEDDED_IDS.keys(): indicators_df.at[index, 'indicator_annotation'] = 'embedded' @@ -999,7 +1004,7 @@ def write_domain_indicators(domain, indicators, output_file): type=str, help="file to save final list of match results", required=False, - default=os.path.join(".", "indicators_output_dmi.csv"), + default=os.path.join(".", "indicators_output.csv"), ) logging.basicConfig( level=logging.INFO, @@ -1017,6 +1022,7 @@ def write_domain_indicators(domain, indicators, output_file): domains = input_data[domain_col] for domain in domains: try: + print(f"Processing {domain}") domain_name = get_domain_name(domain) indicators = crawl(domain, run_urlscan=run_urlscan) write_domain_indicators(domain_name, indicators, output_file=output_file) diff --git a/modules/matcher.py b/modules/matcher.py index 5818dfc..61ff12c 100644 --- a/modules/matcher.py 
+++ b/modules/matcher.py @@ -13,7 +13,7 @@ from pandas.api.types import is_list_like from modules.indicators import (EMBEDDED_IDS, FINANCIAL_IDS, SOCIAL_MEDIA_IDS, - TRACKING_IDS) + TRACKING_IDS, CRYPTO_IDS) ## Preprocessing @@ -300,15 +300,16 @@ def parse_certificate_matches( "2-urlscanhrefs" : iou_match, "2-techstack" : iou_match, "3-footer-text": direct_match, -"4-outbound-domain": iou_match, +"3-outbound-domain": iou_match, "2-ads_txt": iou_match - } FEATURE_MATCHING.update({financial_id: direct_match for financial_id in FINANCIAL_IDS}) FEATURE_MATCHING.update({embedded_id: direct_match for embedded_id in EMBEDDED_IDS}) FEATURE_MATCHING.update({social_id: direct_match for social_id in SOCIAL_MEDIA_IDS}) FEATURE_MATCHING.update({tracking_id: direct_match for tracking_id in TRACKING_IDS}) +FEATURE_MATCHING.update({crypto_id: direct_match for crypto_id in CRYPTO_IDS}) + WHOIS_FEATURES = [ "whois-registrar", diff --git a/modules/reference.py b/modules/reference.py index 518132d..0e0f85f 100644 --- a/modules/reference.py +++ b/modules/reference.py @@ -1118,8 +1118,34 @@ "name": "Facebook Link", "description": "A link to a Facebook page or post found on a webpage.", "interpretation": "Links to Facebook pages or posts can indicate social media engagement or content sharing strategies. Shared Facebook links might suggest common social media strategies or affiliations. However, links to popular social media platforms are widely used and may not be uniquely significant." + }, + '3-youtube': { + "name": "YouTube Link", + "description": "A link to a YouTube channel or video found on a webpage.", + "interpretation": "Links to YouTube channels or videos can indicate content sharing or promotional strategies. Shared YouTube links might suggest common content sources or affiliations. However, links to popular video platforms are widely used and may not be uniquely significant." 
+ }, + '3-telegram': { + "name": "Telegram Link", + "description": "A link to a Telegram channel or post found on a webpage.", + "interpretation": "Links to Telegram channels or posts can indicate content sharing or promotional strategies. Shared Telegram links might suggest common content sources or affiliations. However, links to popular messaging platforms are widely used and may not be uniquely significant." + }, + '3-outbound-domain': { + "name": "Outbound Domain", + "description": "A domain to which a website links or redirects its users.", + "interpretation": "Shared outbound domains can indicate common affiliations or content sources. However, links to popular websites or services can appear across a wide range of unrelated sites, limiting the significance of this data for direct connections." + }, + '3-patreon': { + "name": "Patreon Link", + "description": "A link to a Patreon page or post found on a webpage.", + "interpretation": "Links to Patreon pages or posts can indicate content sharing or promotional strategies. Shared Patreon links might suggest common content sources or affiliations. However, links to crowdfunding platforms are widely used and may not be uniquely significant." + }, + '3-gab' : { + "name": "Gab Link", + "description": "A link to a Gab profile or post found on a webpage.", + "interpretation": "Links to Gab profiles or posts can indicate social media engagement or content sharing strategies. Shared Gab links might suggest common social media strategies or affiliations. However, links to alternative social media platforms are widely used and may not be uniquely significant." } + } MATCH_VALUES_TO_IGNORE = [ diff --git a/templates/about.html b/templates/about.html index 8f8117d..1c058d3 100644 --- a/templates/about.html +++ b/templates/about.html @@ -43,55 +43,77 @@

-
+

About

-

The purpose of the Laundromat, how to use it effectively, and how to interpret the results +

The following sections provide information on how the Information Laundromat tool works, how to use + it effectively, and how to interpret results.

The Laundromat

-

The Laundromat tool provides two functions: Content Similarity Search and Domain Forensics Matching: +

The Laundromat tool provides two core functions: Content Similarity Search and Metadata Similarity + Search:

-
    -
  • Content Similarity Search attempts to detect URLs where a given text snippet occurs. It does not - provide evidence of where that text originated or any relationship between two entities posting - two similar texts. Detemination of a given text's provenance is outside the scope of this - tool.
  • -
  • Metadata Similarity Search attempts to find aspects of a website which indicate what makes it - unique, give insight into its architecture/design, or show how its used/tracked. These - indicators are compared for items with high degrees of similarity and matches are provided to - the user.
  • -
-

The Domain Forensics Comparison Corpus

-

Any URLs entered into the Metadata Similarity Search tool are compared against a list of - domains already processed by the tool. This corpus is sourced from a number of sources, including: +

+

Content Similarity Search takes a user-selected URL, title, and/or text snippet and uses GDELT, a + variety of search services, and a plagiarism checker to detect URLs that contain some degree of + similarity with the queried content. The user may also specify a country and language to search in. + As not all languages and countries are supported by each service, searches default to the United States + and English when the selection is unsupported. Finally, users may specify which search engines/services they want to + use for their search.

- - -

Inclusion in the corpus of comparison sites is neither an endorsement nor a criticism of a given - website's point of view or their relationship to any other member of the corpus. It solely - reflects what websites are of interest to OSINT researchers. If you'd like a website removed - from the list or have a potential list of new items to include, email info (at) securingdemocracy.org.

- +

Content Similarity Search attempts to find similar articles or text across the open web. It does not + provide evidence of where that text originated or any relationship between two entities posting two + similar texts. Determination of a given text's provenance is outside the scope of this tool. +

+

URL Search

+

Enter the full URL of an article or webpage (e.g. https://tech.cnn.com/article-title.html or + https://www.rt.com/russia/588284-darkening-prospects-ukraine-postwar/) to automatically extract + title and content. This feature will not work with every website.

+

Advanced (Title/Content) Search

+

This search allows users to specify the title and content (and apply boolean ANDs/ORs to the title + and content). It also requires specifying a country and language to search in. As not all languages + and countries are supported by each service, these will default to US and English if unsupported. + Finally, users may specify which search engines they want to use for their search. +

Batch Search

+

To search multiple URLs at once, the Laundromat allows users to upload a list of URLs in CSV format. + To access this feature, contact us at info [at] securingdemocracy.org to obtain a registration code. +

+

Interpreting Results

+

A content search will produce a searchable list of links, their domains, possible associations with + known lists (see below for more information), the title and text snippet, the search engines where + that link was found, and the percentage of the title or snippet that matches the provided input. + Because this method leverages search results, there are articles that surface that share some + similarities with the queried text but that are fundamentally different. To improve the accuracy of + results, we use gestalt string matching (also known as Ratcliff/Obershelp pattern recognition), a + technique that scores the similarity of two pieces of text (“strings”) based on their common substrings, + to compare the queried text with the surfaced article. This technique is + useful in cases where a piece of text may have been lightly edited or words inserted or removed, as + often happens with headlines and articles. A score of 100% indicates a complete match between the + queried text and a result, while a value of 0% indicates no match. While this scoring method is very + accurate when querying a snippet of text, it is less accurate when querying URLs because websites + often contain sidebars or other text on the page that is different from the original source, even if + the article itself is identical. The Information Laundromat tool may therefore produce lower + similarity scores when querying URLs than the strength of the match would otherwise suggest. + The accuracy of results is also dependent on the length and uniqueness of the queried text. + Searching for a well-known name or common phrase will likely produce high match scores but poor + results. For example, searching “Xi Jinping” will produce many URLs with 100% match scores, but + likely few of them will be relevant. + Regardless of match score, we urge users to manually confirm results. +

+ +

Metadata Similarity Search attempts to find aspects of a website which indicate what makes it unique, + give insight into its architecture/design, or show how it's used/tracked. These indicators are + compared for items with high degrees of similarity and matches are provided to the user. + This search feature will accept a list of one or more fully qualified domain names (user must + include a prepended https:// on each domain name). This will produce a list of indicators and a list + of sites that match (or are extremely similar to) those indicators. Indicators, and thus matches, + are broken into the three tiers described below. +

+

About the Indicator Tier System and Interpreting Results

-

About the Indicator Tier - System and Interpreting Results

Each indicator is associated with an evidentiary tier and is subject to interpretation.

@@ -106,7 +128,8 @@

About the Indi website. These are not as unique as Tier 1 indicators but provide valuable context. This tier includes IPs within the same subnet, matching meta tags, and commonalities in standard and custom response headers.

-

Tier 3 Indicators: WHEN VALID, these are +

Tier 3 Indicators: WHEN VALID, these + are the least specific but can still support broader analyses when combined with higher-tier indicators. These include shared CSS classes, UUIDs, and Content Management Systems.

Interpreting Indicator Validity

@@ -152,473 +175,44 @@
Example Investigation:

In interpreting indicator validity, analysts must weigh the evidence, seek corroboration, and consider the broader context to distinguish between high-confidence connections and potentially misleading, spurious matches.

-

How to use the Laundromat

- -

Content Similarity Search takes a given title and/or content and uses GDELT, a variety of search services, and a plagiarism - checker to detect urls with some degree of similarity of the provided content.

- -

Enter the full URL of an article or webpage (e.g. https://tech.cnn.com/article-title.html or https://www.rt.com/russia/588284-darkening-prospects-ukraine-postwar/) - to automatically attempt to extract title and content.

- -

This search allows users to specify the title and content (and apply boolean ANDs/ORs to the title - and content). It also requires specifying a country and language to search in. As not all languages - and countries are supported by each service, these will default to US and English if unsupported. - Finally, users may specify which search engines they want to use for their search.

-

This will produce a searchable list of links, their domains, possible associations with known lists, - the title and snippet, the search engines where that link will be found, and the percentage of the - title or snippet which matches the provided inputs as determined by the Ratcliff/Obershelp algorithm.. + +

The Domain Forensics Comparison Corpus

+

Any URLs entered into the Metadata Similarity Search tool are compared against a list of + domains already processed by the tool. This corpus is sourced from a number of sources, including:

-

Metadata Similarity Search

-

This search, which will accept a list of one or more fully qualified domain - names. (including a prepended https:// on each domain name). This will produce a list of - indicators and a list of sites which match (or are extremely similar to) those indicators. - Indicators, and thus matches, are broken into the three tiers described above.

+ + +

Inclusion in the corpus of comparison sites is neither an endorsement nor a criticism of a given + website's point of view or their relationship to any other member of the corpus. It solely + reflects what websites are of interest to OSINT researchers. If you'd like a website removed + from the list or have a potential list of new items to include, email info (at) + securingdemocracy.org.

+ +

Partners, Sponsors, Disclaimers

-

The Laundromat Tool is made possible with the support of the European Media and Information Fund (EMIF). - The Information Laundromat Tool is built by a partnership of the Alliance for Securing Democracy (ASD), +

The Laundromat Tool is made possible with the support of the European Media and Information Fund + (EMIF). + The Information Laundromat Tool is built by a partnership of the Alliance for Securing Democracy + (ASD), the Institute for Strategic Dialogue (ISD), and the University of Amsterdam (UvA) through the Digital Methods Institute.

-

Full Indicators List:

-
    -
  • -

    1-cert-domain - Domain Certificate: - An SSL certificate is a digital certificate that authenticates a website's or multiple - websites' identity and enables an encrypted connection. A shared certificate between - two sites is strong evidence of a link between sites, as typically a certificate must be - issued for all those sites at once by a single entity and cannot easily be spoofed. - However, some web hosting and DDOS protection services bundle certificates for unrelated - sites, so carefully research any matches.

    -
  • -
  • -

    1-crypto-wallet - Cryptocurrency Wallet: - A digital wallet used to store, send, and receive cryptocurrencies like Bitcoin and - Ethereum. The presence of a cryptocurrency wallet address can link a site or an - individual to cryptocurrency transactions, potentially indicating financial sources or - preferences. However, due to the pseudonymous nature of such wallets, additional - information is required to definitively establish ownership or connections.

    -
  • -
  • -

    1-domain - Domain Name: - The unique name that identifies a website, which is registered in the Domain Name System - (DNS). The domain name can provide insights into the nature or origin of a website. - Commonalities in domain names may suggest shared affiliations or intents. However, the - ease of registering domain names requires careful analysis to avoid false - associations.

    -
  • -
  • -

    1-domain_suffix - Domain Suffix: - The last part of a domain name, typically representing a category or country code. A - domain suffix can indicate the intended audience or origin of a website. Similar - suffixes across different sites might suggest a geographical or organizational link. - Yet, the global accessibility of most suffixes means this should not be a sole - determinant of connection.

    -
  • -
  • -

    1-fb_pixel_id - Facebook Pixel ID: - A unique identifier for the Facebook Pixel, an analytics tool that allows website owners to - measure the effectiveness of their advertising by understanding the actions people take on - their website. Shared Facebook Pixel IDs across sites can indicate common ownership or a - shared marketing strategy. However, third-party marketing agencies might use the same ID - across different clients, potentially leading to mistaken connections.

    -
  • -
  • -

    1-adobe_analytics_id - Adobe Analytics ID: - A unique identifier used by Adobe Analytics, a tool for analyzing visitor traffic on - websites. Similar to other analytics tools, shared Adobe Analytics IDs can hint at - common management or partnerships between websites. However, as with Facebook Pixel IDs, - the use of analytics IDs by third-party services may introduce unrelated links.

    -
  • -
  • -

    3-sitemap_entries - Sitemap Entries: - Entries in a website's sitemap, which is an XML file listing the URLs for a site along - with additional metadata about each URL. Analysis of sitemap entries can reveal the - structure and content priorities of a website. Commonalities in sitemap structures or - content might suggest shared authorship or objectives. However, similarities could also - result from common website templates or platforms.

    -
  • -
  • -

    3-ipms_domain_iprangeowner_cidr - IP Range Owner CIDR: - The Classless Inter-Domain Routing (CIDR) notation indicating the range of IP addresses - owned by an entity. CIDR data can help identify the network scope and location of a - domain's hosting. Shared IP ranges might suggest hosting or service provider - commonalities. However, large hosting providers may have numerous unrelated clients - within the same range.

    -
  • -
  • -

    3-ipms_domain_iprangeowner_ownerName - IP Range Owner Name: - The name of the entity owning a range of IP addresses. This information can be used to - identify the hosting provider or organization controlling a set of IP addresses. Shared - ownership names might indicate a relationship between the entities using those IPs, - though large organizations often host unrelated entities.

    -
  • -
  • -

    3-ipms_domain_iprangeowner_address - IP Range Owner Address: - Physical address of the entity owning a range of IP addresses. Physical addresses can - provide geographical and organizational context. Shared addresses across different IP - ranges might suggest a close relationship or common management. However, the presence of - data centers and shared office spaces can result in address overlaps for unrelated - entities.

    -
  • -
  • -

    3-ipms_domain_nameserver - Domain Name Server: - A server that translates domain names into IP addresses, facilitating the connection between - a user's device and the website's server. Common nameservers among different - domains might indicate shared hosting or management services. However, popular hosting - providers serve a large number of clients, potentially leading to false - associations.

    -
  • -
  • -

    3-ipms_domain_otheripused - Other IPs Used by Domain: - A list of IP addresses that have been used by a domain, aside from its primary IP address. - This data can reveal the network history and changes in hosting of a domain. Shared - historical IPs might suggest past commonalities or transitions in hosting services. - However, dynamic IP allocation by hosting services can result in unrelated sites - temporarily sharing IPs. -

    -
  • -
  • -

    3-ipms_siteonthisip_now - Current Sites on This IP: - Websites currently hosted on the same IP address. Websites sharing an IP address may - have a relationship, such as being part of the same network or organization. However, - shared hosting environments can lead to unrelated websites being hosted on the same - IP.

    -
  • -
  • -

    3-ipms_siteonthisip_before - Former Sites on This IP: - Websites that were previously hosted on the same IP address but no longer are. - Historical data on IP hosting can provide insights into the network associations and - changes over time. Formerly shared IPs might indicate previous relationships or common - hosting decisions. However, dynamic IP allocations can lead to brief and incidental - overlaps. -

    -
  • -
  • -

    3-ipms_siteonthisip_broken - Broken Sites on This IP: - Websites hosted on the same IP address that are currently not functional or accessible. - Identifying non-functional sites on a shared IP can indicate network health or hosting - issues. Patterns in broken sites might suggest targeted disruptions or poor hosting - services. However, temporary technical issues can also cause sites to be non-functional, - unrelated to their network neighbors. -

    -
  • -
  • -

    3-ipms_useragents - User Agents: - Strings that web browsers and other client devices send to identify themselves to web - servers, typically containing information about the device and browser. Analysis of user - agents can reveal the types of devices and browsers most frequently accessing a site, - potentially indicating the site's target audience or technological preferences. - However, the widespread use of common browsers can limit the specificity of these - insights.

    -
  • -
  • -

    1-ip_shodan_hostnames - Shodan Hostnames: - Hostnames associated with an IP address as indexed by Shodan, a search engine for - internet-connected devices. Shodan's data can reveal the various services and - hostnames associated with an IP, potentially indicating its use and ownership. Shared - hostnames across IPs might suggest network or organizational links. However, the dynamic - nature of IP allocations can lead to transient or outdated hostname associations. -

    -
  • -
  • -

    3-ip_shodan_ports - Shodan Ports: - Open network ports on an IP address as detected by Shodan. Open ports can indicate the - types of services an IP is offering, with certain ports associated with specific - applications or protocols. Common ports across different IPs might suggest similar uses - or configurations. However, standard port uses can be widespread and not necessarily - indicative of direct relationships.

    -
  • -
  • -

    2-ip_shodan_vuln - Shodan Vulnerabilities: - Vulnerabilities identified on an IP address by Shodan, based on open ports and services. - Identifying vulnerabilities can help assess the security posture of a network or device. - Shared vulnerabilities might indicate common software or configuration weaknesses. - However, widespread vulnerabilities in popular software can appear across unrelated - networks. -

    -
  • -
  • -

    3-ip_shodan_cpe - Shodan CPE: - Common Platform Enumeration (CPE) identifiers found by Shodan, indicating specific software - or hardware on an IP. CPE identifiers can provide detailed insights into the - technological stack of a network or device. Shared CPEs might suggest technological - commonalities or shared suppliers. However, the ubiquity of certain technologies can - lead to coincidental overlaps.

    -
  • -
  • -

    1-ga_id - Google Analytics ID: - A unique identifier associated with Google Analytics, used for tracking and analyzing - website traffic. Like other analytics IDs, shared Google Analytics IDs across websites - may indicate common ownership or marketing strategies. However, the use of third-party - marketing agencies can result in the same ID being used across unrelated sites.

    -
  • -
  • -

    1-ga_tag_id - Google Analytics Tag ID: - A unique tag identifier used in Google Analytics for tracking specific user interactions on - a website. Similar to the general Google Analytics ID, shared tag IDs might suggest a - connection between sites, especially in how they track user behavior. However, similar - tracking strategies might also be independently adopted by unrelated sites.

    -
  • -
  • -

    1-ip - IP Address: - A unique numerical label assigned to each device connected to a computer network that uses - the Internet Protocol for communication. An IP address can reveal the geographic - location and network provider of a device or website. Shared IPs may indicate shared - hosting or network resources. However, dynamic IP allocation and large hosting - environments can lead to incidental sharing.

    -
  • -
  • -

    1-verification_id - Verification ID: - A unique identifier used for verifying ownership or authenticity of a website or online - account. Verification IDs can establish the legitimacy of a site or account, potentially - linking it to a specific owner or organization. However, verification processes vary, - and IDs can be reassigned or spoofed, so any link drawn from them should be independently corroborated.

    -
  • -
  • -

    1-yandex_tag_id - Yandex Tag ID: - A unique identifier used by Yandex Metrica, a tool for analyzing visitor traffic, similar to - Google Analytics. Shared Yandex Tag IDs could suggest common ownership or similar web - analytics strategies between sites. However, like other analytics tools, the involvement - of third-party services can create misleading connections.

    -
  • -
  • -

    2-subnet - Subnet: - A segment of a network's IP address range that can be designated to optimize performance - and security. Subnet information can indicate how a network is structured and segmented - for various purposes. Shared subnets between different entities might suggest a - relationship or common network management. However, subnets are often allocated by ISPs - or hosting providers to multiple clients.

    -
  • -
  • -

    3-cdn-domain - CDN Domain: - A domain used by a Content Delivery Network (CDN) to deliver content efficiently across the - internet. Shared CDN domains can indicate that websites are utilizing the same CDN - provider for content distribution, which might imply performance or operational - preferences. However, popular CDNs are used by a wide range of websites, limiting the - value of this data for establishing direct connections.

    -
  • -
  • -

    3-cms - Content Management System: - A software application or set of related programs used to create and manage digital content. - Common CMS platforms among different websites might suggest similar operational needs or - preferences. However, widely used CMS platforms like WordPress are employed by a diverse - array of sites, often without any direct relation. -

    -
  • -
  • -

    3-css_classes - CSS Classes: - Classes defined in Cascading Style Sheets (CSS) to style and format the layout of web pages. - Analysis of CSS classes can provide insights into the design and development approaches - of a website. Shared classes might suggest common design templates or developers. - However, common frameworks and libraries can lead to similar CSS classes across - unrelated sites. -

    -
  • -
  • -

    3-header-nonstd-value - Non-Standard Header Value: - Values in HTTP headers that do not conform to standard header formats, potentially - indicating custom configurations or software. Non-standard header values can be unique - identifiers of custom configurations or software used by a website. Shared non-standard - values might indicate common development practices or software choices. However, the - interpretation of these values requires technical expertise to avoid - misattribution.

    -
  • -
  • -

    3-header-server - Server Header: - The 'Server' HTTP header field that specifies information about the software used by - the origin server. The server header can reveal the web server software and its - configuration. Shared server headers might indicate similar technological choices or - configurations. However, popular server software like Apache and Nginx is widely used, - so this data alone is not sufficient to establish a connection.

    -
  • -
  • -

    3-id_tags - ID Tags: - Unique identifiers used in the HTML code of a website to distinguish specific elements. - Similar ID tags across websites might suggest shared development practices or template - usage. However, common ID tags can also be a result of widespread frameworks or - libraries, and thus, might not be indicative of direct relationships. -

    -
  • -
  • -

    3-iframe_id_tags - Iframe ID Tags: - Unique identifiers used for 'iframe' elements in HTML, allowing the embedding of an - external webpage within a webpage. Shared iframe ID tags could indicate similar website - functionalities or content-sharing strategies. However, common frameworks or website - templates can lead to the usage of similar iframe IDs across different websites, - reducing the significance of this correlation.

    -
  • -
  • -

    3-link_href - Link Href Attributes: - The 'href' attribute of a link in HTML, specifying the URL of the page the link goes - to. Analysis of 'href' attributes can reveal the external connections or - references a website makes. Shared 'href' attributes across different sites - might suggest common affiliations or sources. However, links to popular or general - websites might not be indicative of a direct relationship.

    -
  • -
  • -

    3-meta_generic - Generic Meta Tags: - Meta tags in HTML that provide general information about a webpage, such as description, - keywords, and author. Common meta tags can indicate similar content or objectives. - However, generic or broadly used tags may appear in a wide range of websites, - potentially leading to mistaken connections.

    -
  • -
  • -

    3-meta_social - Social Media Meta Tags: - Meta tags specifically designed for optimizing social media sharing, defining how content - appears when shared on social platforms. Shared social media meta tags might suggest a - coordinated approach to social media engagement or content strategy. However, the use of - standard social media optimization practices can lead to similar tags across unrelated - sites.

    -
  • -
  • -

    3-script_src - Script Source Attributes: - The 'src' attribute of a script tag in HTML, indicating the source of a JavaScript - file. Shared script sources can point to the use of common libraries or external - scripts. However, the widespread use of popular JavaScript libraries and scripts might - lead to coincidental similarities.

    -
  • -
  • -

    3-uuid - UUID: - Universally Unique Identifier, a 128-bit number used to identify information in computer - systems. UUIDs can be used to track and manage assets or components within a system. - Shared UUIDs might indicate a connection between different systems or components. - However, the nature of UUIDs as unique identifiers typically limits the occurrence of - shared UUIDs across unrelated systems.

    -
  • -
  • -

    3-whois_creation_date - WHOIS Creation Date: - The date a domain name was first registered, as recorded in the WHOIS database. Similar - creation dates for domains might suggest a coordinated launch or common origin. However, - coincidental registration dates are possible, especially during popular events or domain - sales.

    -
  • -
  • -

    3-whois_server - WHOIS Server: - The server that provides the WHOIS information, containing details about domain name - registrations. Use of the same WHOIS server for different domains could indicate a - preference for certain domain registrars. However, popular registrars serve a large - number of clients, so this alone isn't a strong indicator of a relationship. -

    -
  • -
  • -

    3-whois-registrar - WHOIS Registrar: - The organization authorized to register and manage domain names for a particular top-level - domain. Domains registered through the same registrar might have some administrative - commonalities. However, given the market dominance of certain registrars, this is not a - definitive sign of a direct connection between domain owners.

    -
  • -
  • -

    3-wp-blocks - WordPress Blocks: - Content blocks used in WordPress to build and design webpages. Shared WordPress blocks - could indicate similar website designs or use of common templates. However, due to the - popularity of WordPress and its wide range of available blocks, similarities might occur - coincidentally.

    -
  • -
  • -

    3-wp-categories - WordPress Categories: - Categorization system in WordPress used to group content into different sections. - Similar categories in different WordPress sites might suggest related content or - thematic similarities. However, common categories are widely used across various sites, - potentially leading to non-significant matches. -

    -
  • -
  • -

    3-wp-pages - WordPress Pages: - Web pages created and managed within the WordPress platform. Analysis of WordPress pages - can reveal the structure and content emphasis of a site. Shared page structures or - content might suggest a common template or designer. However, the extensive use of - WordPress templates can lead to similar page structures across unrelated sites. -

    -
  • -
  • -

    3-wp-posts - WordPress Posts: - Blog posts or articles published on a WordPress website. Shared themes or styles in - WordPress posts might indicate similar content strategies or sources. However, the - widespread use of WordPress for blogging and content creation means that thematic - overlaps are common and not necessarily indicative of a connection.

    -
  • -
  • -

    3-wp-tags - WordPress Tags: - Tagging system in WordPress used to describe specific details of posts, aiding in content - organization and navigation. Common tags across WordPress sites might suggest related - topics or a shared content approach. However, popular tags are frequently used across - diverse websites, diminishing the potential for meaningful connections.

    -
  • -
  • -

    3-wp-users - WordPress Users: - Individual accounts within WordPress that have various roles and permissions for managing - website content. Shared user accounts or roles across WordPress sites might imply common - administration or authorship. However, generic user roles such as - 'administrator' or 'editor' are common and not uniquely - identifying.

    -
  • -
  • -

    2-urlscan_globalvariable - URLScan Global Variable: - Global JavaScript variables identified by URLScan, a tool for scanning and analyzing - websites. Shared global variables might indicate the use of similar scripts or - frameworks. However, common JavaScript practices and libraries can result in widespread - use of the same global variables across different websites.

    -
  • -
  • -

    2-urlscan_cookies - URLScan Cookies: - Cookies identified by URLScan as being set by websites during a scan. Analysis of - cookies can reveal tracking, personalization, or functional aspects of a website. Shared - cookies across sites might suggest shared tracking or management tools. However, common - third-party services, like analytics or advertising platforms, often set similar cookies - across various websites.

    -
  • -
  • -

    2-urlscan_consolemessages - URLScan Console Messages: - Messages output to the browser console during a website scan by URLScan. Console - messages can provide insights into the website's functionality or potential issues. - Common messages across different scans might indicate similar development practices or - shared issues. However, these messages can also be generated by common frameworks or - browser extensions.

    -
  • -
  • -

    2-urlscan_asn - URLScan Autonomous System Number: - The Autonomous System Number (ASN) identified by URLScan, representing the collection of IP - networks and routers under the control of one entity. Shared ASNs can suggest that - websites are part of the same network or hosted by the same provider. However, large - hosting providers and ISPs control extensive ASNs that encompass a wide range of - clients.

    -
  • -
  • -

    2-urlscan_domainsonpage - URLScan Domains on Page: - A list of all domains found on a webpage during a URLScan. Domains listed on a page can - reveal external links or embedded content. Shared domains across different webpages - might suggest common affiliations or sources. However, widely used domains, such as - social media or analytics platforms, are commonly found across numerous sites.

    -
  • -
  • -

    2-urlscan_urlssonpage - URLScan URLs on Page: - All URLs found on a webpage during a URLScan. The presence of specific URLs can indicate - the nature of the content or the external connections of a website. Shared URLs across - different pages might suggest a relationship or common sources. However, links to - popular websites or resources might not be uniquely significant.

    -
  • -
  • -

    2-urlscanhrefs - URLScan Hrefs: - Hypertext references (hrefs) identified on webpages during a URLScan. Href attributes - can provide insights into the external links and relationships of a website. Common - hrefs across different sites might suggest shared affiliations or content. However, - links to widely used resources or platforms can appear across many sites, limiting the - potential for direct connection inference.

    -
  • -
  • -

    2-techstack - Technology Stack: - The set of technologies used to build and run a website or application, including - frameworks, languages, and software. Similar technology stacks can suggest shared - development practices or preferences. However, certain technology combinations are - widely popular and may be used by a vast range of unrelated websites or - applications.

    -
  • -

Disclaimers

Opinions Disclaimer

The sole responsibility for any content supported by the European Media and Information Fund lies @@ -890,22 +484,27 @@

.top-nav a { color: #fff; } + .top-nav a:hover { color: lightgray; } + .about-page { color: lightgray; } + .about-page a { color: lightblue; } + .about-page a:hover { color: lightgray; } + .main-page { max-width: 1800px; margin: 0 auto; } - - \ No newline at end of file + + \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index 765a0d1..42f0321 100644 --- a/templates/index.html +++ b/templates/index.html @@ -30,6 +30,8 @@ + +