Skip to content

Commit

Permalink
remove tldextract from pseudonymizer
Browse files Browse the repository at this point in the history
  • Loading branch information
ekneg54 committed Dec 6, 2024
1 parent da20a9d commit faedcbc
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 30 deletions.
24 changes: 6 additions & 18 deletions logprep/processor/pseudonymizer/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,10 @@
import re
from functools import cached_property, lru_cache
from itertools import chain
from typing import Optional, Pattern
from typing import Pattern
from urllib.parse import parse_qs, urlencode, urlparse

from attrs import define, field, validators
from tldextract import TLDExtract

from logprep.abc.processor import Processor
from logprep.factory_error import InvalidConfigurationError
Expand All @@ -70,7 +69,6 @@
Encrypter,
)
from logprep.util.url import extract_urls
from logprep.util.validators import list_of_urls_validator


class Pseudonymizer(FieldManager):
Expand Down Expand Up @@ -137,12 +135,6 @@ class Config(FieldManager.Config):
)
"""The maximum number of cached pseudonymized urls. Default is 10000.
Behaves similarly to the max_cached_pseudonyms. Has to be greater than 0."""
tld_lists: Optional[list] = field(default=None, validator=[list_of_urls_validator])
"""Optional list of path to files with top-level domain lists
(like https://publicsuffix.org/list/public_suffix_list.dat). If no path is given,
a default list will be retrieved online and cached in a local directory. For local
files the path has to be given with :code:`file:///path/to/file.dat`."""

mode: str = field(
validator=[validators.instance_of(str), validators.in_(("GCM", "CTR"))], default="GCM"
)
Expand Down Expand Up @@ -211,12 +203,6 @@ def _encrypter(self) -> Encrypter:
encrypter.load_public_keys(self._config.pubkey_analyst, self._config.pubkey_depseudo)
return encrypter

@cached_property
def _tld_extractor(self) -> TLDExtract:
    """Build the TLD extractor used for URL pseudonymization.

    When the processor config provides ``tld_lists``, those suffix-list
    URLs are handed to :class:`TLDExtract`; otherwise tldextract's
    default public suffix list is used.
    """
    custom_suffix_lists = self._config.tld_lists
    if custom_suffix_lists is None:
        return TLDExtract()
    return TLDExtract(suffix_list_urls=custom_suffix_lists)

@cached_property
def _regex_mapping(self) -> dict:
return GetterFactory.from_string(self._config.regex_mapping).get_yaml()
Expand Down Expand Up @@ -305,13 +291,15 @@ def _pseudonymize(self, value):
return {"pseudonym": hash_string, "origin": encrypted_origin}

def _pseudonymize_url(self, url_string: str) -> str:
url = self._tld_extractor(url_string)
if url_string.startswith(("http://", "https://")):
parsed_url = urlparse(url_string)
else:
parsed_url = urlparse(f"http://{url_string}")
if url.subdomain:
url_string = url_string.replace(url.subdomain, self._pseudonymize_string(url.subdomain))
if parsed_url.hostname:
splitted_hostname = parsed_url.hostname.split(".")
if len(splitted_hostname) > 2:
subdomain = ".".join(splitted_hostname[0:-2])
url_string = url_string.replace(subdomain, self._pseudonymize_string(subdomain))
if parsed_url.fragment:
url_string = url_string.replace(
f"#{parsed_url.fragment}", f"#{self._pseudonymize_string(parsed_url.fragment)}"
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ dependencies = [
"regex",
"ruamel.yaml",
"schedule",
"tldextract",
"urllib3>=1.26.17", # CVE-2023-43804
"uvicorn",
"deepdiff",
Expand Down
11 changes: 0 additions & 11 deletions tests/unit/processor/pseudonymizer/test_pseudonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -765,15 +765,6 @@ def test_testcases(self, testcase, rule, event, expected, regex_mapping):
self.object.process(event)
assert event == expected, testcase

def test_tld_extractor_uses_file(self):
    """A configured ``tld_lists`` entry must become the extractor's only suffix-list URL."""
    config = deepcopy(self.CONFIG)
    config["tld_lists"] = [TLD_LIST]
    pseudonymizer_with_tld_list = Factory.create({"pseudonymizer": config})
    suffix_list_urls = pseudonymizer_with_tld_list._tld_extractor.suffix_list_urls
    assert len(suffix_list_urls) == 1
    assert suffix_list_urls[0].endswith(
        "tests/testdata/mock_external/tld_list.dat",
    )

def _load_specific_rule(self, rule):
config = deepcopy(self.CONFIG)
config["regex_mapping"] = self.regex_mapping
Expand Down Expand Up @@ -893,7 +884,6 @@ def test_resolve_from_cache_pseudonymize_urls(self):
),
(
"https://test.de/?a=b&c=d",
# nosemgrep
(
"https://test.de/?a="
"<pseudonym:4c77fcd97a3d4d98eb062561c37e4ef000f0476bdf153b25ba8031f90ac89877>"
Expand All @@ -903,7 +893,6 @@ def test_resolve_from_cache_pseudonymize_urls(self):
),
(
"https://test.de/#test",
# nosemgrep
(
"https://test.de/#"
"<pseudonym:d95ac3629be3245d3f5e836c059516ad04081d513d2888f546b783d178b02e5a>"
Expand Down

0 comments on commit faedcbc

Please sign in to comment.