Skip to content

Commit

Permalink
remove tldextractor from domain_label_extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
ekneg54 committed Dec 6, 2024
1 parent 3220736 commit c93b156
Show file tree
Hide file tree
Showing 5 changed files with 0 additions and 59 deletions.
1 change: 0 additions & 1 deletion logprep/processor/domain_label_extractor/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
- tests/testdata/rules/specific/
generic_rules:
- tests/testdata/rules/generic/
tld_lists: /path/to/list/file
tagging_field_name: resolved
.. autoclass:: logprep.processor.domain_label_extractor.processor.DomainLabelExtractor.Config
Expand Down
2 changes: 0 additions & 2 deletions logprep/processor/pseudonymizer/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@
regex_mapping: /path/to/regex_mapping.json
max_cached_pseudonyms: 1000000
mode: GCM
tld_lists:
- /path/to/tld_list.dat
.. autoclass:: logprep.processor.pseudonymizer.processor.Pseudonymizer.Config
:members:
Expand Down
1 change: 0 additions & 1 deletion tests/testdata/mock_external/tld_list.dat

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
# pylint: disable=protected-access
# pylint: disable=missing-docstring

import copy
import hashlib
import os
import shutil
import tempfile
from pathlib import Path

import responses

from logprep.factory import Factory
from logprep.processor.base.exceptions import FieldExistsWarning
Expand Down Expand Up @@ -340,44 +332,3 @@ def test_raises_field_exists_warning_if_target_field_exits(self):
assert len(result.warnings) == 1
assert isinstance(result.warnings[0], FieldExistsWarning)
assert document == expected

@responses.activate
def test_setup_downloads_tld_lists_to_separate_process_file(self):
    """Check that ``setup()`` stores a configured TLD list in the logprep temp dir.

    A mocked HTTP endpoint serves the bytes of the local ``ls`` binary as an
    arbitrary payload. After ``setup()`` the file
    ``<tmp>/logprep/<processor name>-tldlist-0.dat`` must exist and its MD5
    checksum must match the served payload.
    """
    tld_list = "http://db-path-target/list.dat"
    # Any readable, non-trivial file works as payload; ``ls`` lives in one of
    # these two locations depending on the distribution.
    tld_list_path = Path("/usr/bin/ls") if Path("/usr/bin/ls").exists() else Path("/bin/ls")
    tld_list_content = tld_list_path.read_bytes()
    expected_checksum = hashlib.md5(tld_list_content).hexdigest()  # nosemgrep
    responses.add(responses.GET, tld_list, tld_list_content)
    config = copy.deepcopy(self.CONFIG)
    config["tld_lists"] = [tld_list]
    logprep_tmp_dir = Path(tempfile.gettempdir()) / "logprep"
    try:
        self.object = Factory.create({"domain_label_extractor": config})
        self.object.setup()
        downloaded_file = logprep_tmp_dir / f"{self.object.name}-tldlist-0.dat"
        assert downloaded_file.exists()
        downloaded_checksum = hashlib.md5(downloaded_file.read_bytes()).hexdigest()  # nosemgrep
        assert expected_checksum == downloaded_checksum
    finally:
        # Always remove the shared temp dir, even when an assertion fails;
        # a leftover file would be picked up as "already existing" by later
        # runs. ignore_errors keeps a missing dir from masking the real failure.
        shutil.rmtree(logprep_tmp_dir, ignore_errors=True)

@responses.activate
def test_setup_doesnt_overwrite_already_existing_tld_list_file(self):
    """Check that ``setup()`` leaves an already present TLD list file untouched.

    The target file ``<tmp>/logprep/<processor name>-tldlist-0.dat`` is
    created with known content before ``setup()`` runs. Afterwards it must
    still hold that content and not the payload of the mocked HTTP endpoint.
    """
    tld_list = "http://db-path-target/list.dat"
    tld_list_content = "some content"
    responses.add(responses.GET, tld_list, tld_list_content.encode("utf8"))

    logprep_tmp_dir = Path(tempfile.gettempdir()) / "logprep"
    try:
        logprep_tmp_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): uses the name of the object already present on self
        # (created outside this block); presumably identical to the name of
        # the processor created below — confirm against the test base class.
        tld_temp_file = logprep_tmp_dir / f"{self.object.name}-tldlist-0.dat"

        pre_existing_content = "file exists already"
        # write_bytes creates the file, so no separate touch() is needed.
        tld_temp_file.write_bytes(pre_existing_content.encode("utf8"))
        config = copy.deepcopy(self.CONFIG)
        config["tld_lists"] = [tld_list]
        self.object = Factory.create({"domain_label_extractor": config})
        self.object.setup()
        assert tld_temp_file.exists()
        assert tld_temp_file.read_bytes().decode("utf8") == pre_existing_content
        assert tld_temp_file.read_bytes().decode("utf8") != tld_list_content
    finally:
        # Always remove the shared temp dir so a failed run cannot leave a
        # stale file behind that influences other tests.
        shutil.rmtree(logprep_tmp_dir, ignore_errors=True)
6 changes: 0 additions & 6 deletions tests/unit/processor/pseudonymizer/test_pseudonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
# pylint: disable=line-too-long
import re
from copy import deepcopy
from pathlib import Path

import pytest

Expand All @@ -18,11 +17,6 @@
)
from tests.unit.processor.base import BaseProcessorTestCase

REL_TLD_LIST_PATH = "tests/testdata/mock_external/tld_list.dat"

TLD_LIST = f"file://{Path().absolute().joinpath(REL_TLD_LIST_PATH).as_posix()}"


test_cases = [ # testcase, rule, event, expected, regex_mapping
(
"simple pseudonymization",
Expand Down

0 comments on commit c93b156

Please sign in to comment.