diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index 9d21a196b..9b4257779 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -50,11 +50,10 @@ import re from functools import cached_property, lru_cache from itertools import chain -from typing import Optional, Pattern +from typing import Pattern from urllib.parse import parse_qs, urlencode, urlparse from attrs import define, field, validators -from tldextract import TLDExtract from logprep.abc.processor import Processor from logprep.factory_error import InvalidConfigurationError @@ -70,7 +69,6 @@ Encrypter, ) from logprep.util.url import extract_urls -from logprep.util.validators import list_of_urls_validator class Pseudonymizer(FieldManager): @@ -137,12 +135,6 @@ class Config(FieldManager.Config): ) """The maximum number of cached pseudonymized urls. Default is 10000. Behaves similarly to the max_cached_pseudonyms. Has to be greater than 0.""" - tld_lists: Optional[list] = field(default=None, validator=[list_of_urls_validator]) - """Optional list of path to files with top-level domain lists - (like https://publicsuffix.org/list/public_suffix_list.dat). If no path is given, - a default list will be retrieved online and cached in a local directory. For local - files the path has to be given with :code:`file:///path/to/file.dat`.""" - mode: str = field( validator=[validators.instance_of(str), validators.in_(("GCM", "CTR"))], default="GCM" ) @@ -211,12 +203,6 @@ def _encrypter(self) -> Encrypter: encrypter.load_public_keys(self._config.pubkey_analyst, self._config.pubkey_depseudo) return encrypter - @cached_property - def _tld_extractor(self) -> TLDExtract: - if self._config.tld_lists is not None: - return TLDExtract(suffix_list_urls=self._config.tld_lists) - return TLDExtract() - @cached_property def _regex_mapping(self) -> dict: return GetterFactory.from_string(self._config.regex_mapping).get_yaml() @@ -305,13 +291,15 @@ def _pseudonymize(self, value): return {"pseudonym": hash_string, "origin": encrypted_origin} def _pseudonymize_url(self, url_string: str) -> str: - url = self._tld_extractor(url_string) if url_string.startswith(("http://", "https://")): parsed_url = urlparse(url_string) else: parsed_url = urlparse(f"http://{url_string}") - if url.subdomain: - url_string = url_string.replace(url.subdomain, self._pseudonymize_string(url.subdomain)) + if parsed_url.hostname: + splitted_hostname = parsed_url.hostname.split(".") + if len(splitted_hostname) > 2: + subdomain = ".".join(splitted_hostname[0:-2]) + url_string = url_string.replace(subdomain, self._pseudonymize_string(subdomain)) if parsed_url.fragment: url_string = url_string.replace( f"#{parsed_url.fragment}", f"#{self._pseudonymize_string(parsed_url.fragment)}" diff --git a/pyproject.toml b/pyproject.toml index 0845ecbb0..531396813 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,7 +82,6 @@ dependencies = [ "regex", "ruamel.yaml", "schedule", - "tldextract", "urllib3>=1.26.17", # CVE-2023-43804 "uvicorn", "deepdiff", diff --git a/tests/unit/processor/pseudonymizer/test_pseudonymizer.py b/tests/unit/processor/pseudonymizer/test_pseudonymizer.py index 146aa96d3..d7fcd1a72 100644 --- a/tests/unit/processor/pseudonymizer/test_pseudonymizer.py +++ b/tests/unit/processor/pseudonymizer/test_pseudonymizer.py @@ -765,15 +765,6 @@ def test_testcases(self, testcase, rule, event, expected, regex_mapping): self.object.process(event) assert event == expected, testcase - def test_tld_extractor_uses_file(self): - config = deepcopy(self.CONFIG) - config["tld_lists"] = [TLD_LIST] - object_with_tld_list = Factory.create({"pseudonymizer": config}) - assert len(object_with_tld_list._tld_extractor.suffix_list_urls) == 1 - assert object_with_tld_list._tld_extractor.suffix_list_urls[0].endswith( - "tests/testdata/mock_external/tld_list.dat", - ) - def _load_specific_rule(self, rule): config = deepcopy(self.CONFIG) config["regex_mapping"] = self.regex_mapping @@ -893,7 +884,6 @@ def test_resolve_from_cache_pseudonymize_urls(self): ), ( "https://test.de/?a=b&c=d", - # nosemgrep ( "https://test.de/?a=" "" @@ -903,7 +893,6 @@ def test_resolve_from_cache_pseudonymize_urls(self): ), ( "https://test.de/#test", - # nosemgrep ( "https://test.de/#" ""