Skip to content

Commit

Permalink
remove tldextract from pseudonymizer
Browse files Browse the repository at this point in the history
  • Loading branch information
ekneg54 committed Dec 6, 2024
1 parent da20a9d commit faedcbc
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 30 deletions.
24 changes: 6 additions & 18 deletions logprep/processor/pseudonymizer/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,10 @@
import re
from functools import cached_property, lru_cache
from itertools import chain
from typing import Optional, Pattern
from typing import Pattern
from urllib.parse import parse_qs, urlencode, urlparse

from attrs import define, field, validators
from tldextract import TLDExtract

from logprep.abc.processor import Processor
from logprep.factory_error import InvalidConfigurationError
Expand All @@ -70,7 +69,6 @@
Encrypter,
)
from logprep.util.url import extract_urls
from logprep.util.validators import list_of_urls_validator


class Pseudonymizer(FieldManager):
Expand Down Expand Up @@ -137,12 +135,6 @@ class Config(FieldManager.Config):
)
"""The maximum number of cached pseudonymized urls. Default is 10000.
Behaves similarly to the max_cached_pseudonyms. Has to be greater than 0."""
tld_lists: Optional[list] = field(default=None, validator=[list_of_urls_validator])
"""Optional list of path to files with top-level domain lists
(like https://publicsuffix.org/list/public_suffix_list.dat). If no path is given,
a default list will be retrieved online and cached in a local directory. For local
files the path has to be given with :code:`file:///path/to/file.dat`."""

mode: str = field(
validator=[validators.instance_of(str), validators.in_(("GCM", "CTR"))], default="GCM"
)
Expand Down Expand Up @@ -211,12 +203,6 @@ def _encrypter(self) -> Encrypter:
encrypter.load_public_keys(self._config.pubkey_analyst, self._config.pubkey_depseudo)
return encrypter

@cached_property
def _tld_extractor(self) -> TLDExtract:
    """Build the TLD extractor used for URL pseudonymization.

    When the processor config provides ``tld_lists``, those suffix-list
    URLs are handed to :class:`TLDExtract`; otherwise tldextract's
    default public suffix list is used.
    """
    custom_suffix_lists = self._config.tld_lists
    if custom_suffix_lists is None:
        return TLDExtract()
    return TLDExtract(suffix_list_urls=custom_suffix_lists)

@cached_property
def _regex_mapping(self) -> dict:
return GetterFactory.from_string(self._config.regex_mapping).get_yaml()
Expand Down Expand Up @@ -305,13 +291,15 @@ def _pseudonymize(self, value):
return {"pseudonym": hash_string, "origin": encrypted_origin}

def _pseudonymize_url(self, url_string: str) -> str:
url = self._tld_extractor(url_string)
if url_string.startswith(("http://", "https://")):
parsed_url = urlparse(url_string)
else:
parsed_url = urlparse(f"http://{url_string}")
if url.subdomain:
url_string = url_string.replace(url.subdomain, self._pseudonymize_string(url.subdomain))
if parsed_url.hostname:
splitted_hostname = parsed_url.hostname.split(".")
if len(splitted_hostname) > 2:
subdomain = ".".join(splitted_hostname[0:-2])
url_string = url_string.replace(subdomain, self._pseudonymize_string(subdomain))
if parsed_url.fragment:
url_string = url_string.replace(
f"#{parsed_url.fragment}", f"#{self._pseudonymize_string(parsed_url.fragment)}"
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ dependencies = [
"regex",
"ruamel.yaml",
"schedule",
"tldextract",
"urllib3>=1.26.17", # CVE-2023-43804
"uvicorn",
"deepdiff",
Expand Down
11 changes: 0 additions & 11 deletions tests/unit/processor/pseudonymizer/test_pseudonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -765,15 +765,6 @@ def test_testcases(self, testcase, rule, event, expected, regex_mapping):
self.object.process(event)
assert event == expected, testcase

def test_tld_extractor_uses_file(self):
    """A configured ``tld_lists`` entry must become the extractor's only suffix-list URL."""
    config = deepcopy(self.CONFIG)
    config["tld_lists"] = [TLD_LIST]
    pseudonymizer_with_tld_list = Factory.create({"pseudonymizer": config})
    suffix_list_urls = pseudonymizer_with_tld_list._tld_extractor.suffix_list_urls
    assert len(suffix_list_urls) == 1
    assert suffix_list_urls[0].endswith(
        "tests/testdata/mock_external/tld_list.dat",
    )

def _load_specific_rule(self, rule):
config = deepcopy(self.CONFIG)
config["regex_mapping"] = self.regex_mapping
Expand Down Expand Up @@ -893,7 +884,6 @@ def test_resolve_from_cache_pseudonymize_urls(self):
),
(
"https://test.de/?a=b&c=d",
# nosemgrep
(
"https://test.de/?a="
"<pseudonym:4c77fcd97a3d4d98eb062561c37e4ef000f0476bdf153b25ba8031f90ac89877>"
Expand All @@ -903,7 +893,6 @@ def test_resolve_from_cache_pseudonymize_urls(self):
),
(
"https://test.de/#test",
# nosemgrep
(
"https://test.de/#"
"<pseudonym:d95ac3629be3245d3f5e836c059516ad04081d513d2888f546b783d178b02e5a>"
Expand Down

0 comments on commit faedcbc

Please sign in to comment.