From 3c2700b126637f3b46678a60e41242ad82eda4a2 Mon Sep 17 00:00:00 2001 From: dtrai2 <95028228+dtrai2@users.noreply.github.com> Date: Thu, 7 Dec 2023 16:14:15 +0100 Subject: [PATCH] prevent duplicate pseudonyms (#494) --- CHANGELOG.md | 2 + logprep/processor/pseudonymizer/processor.py | 7 +- .../pseudonymizer/test_pseudonymizer.py | 86 ++++++++++++++++++- 3 files changed, 91 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1d0c4e2f..2e384e0d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ ### Bugfix +* remove duplicate pseudonyms in extra outputs of pseudonymizer + ## v9.0.1 ### Breaking diff --git a/logprep/processor/pseudonymizer/processor.py b/logprep/processor/pseudonymizer/processor.py index 2c4ce42b5..1132958d6 100644 --- a/logprep/processor/pseudonymizer/processor.py +++ b/logprep/processor/pseudonymizer/processor.py @@ -117,7 +117,7 @@ class Config(Processor.Config): max_cached_pseudonymized_urls: int = field( validator=[validators.instance_of(int), validators.gt(0)], default=10000 ) - """The maximum number of cached pseudonymized urls. Default is 10000. + """The maximum number of cached pseudonymized urls. Default is 10000. Behaves similarly to the max_cached_pseudonyms. Has to be greater than 0.""" tld_lists: Optional[list] = field(default=None, validator=[list_of_urls_validator]) """Optional list of path to files with top-level domain lists @@ -220,7 +220,10 @@ def load_rules(self, specific_rules_targets: List[str], generic_rules_targets: L def process(self, event: dict): self.pseudonyms = [] super().process(event) - return (self.pseudonyms, self._config.outputs) if self.pseudonyms else None + unique_pseudonyms = list( + {pseudonyms["pseudonym"]: pseudonyms for pseudonyms in self.pseudonyms}.values() + ) + return (unique_pseudonyms, self._config.outputs) if unique_pseudonyms else None def _apply_rules(self, event: dict, rule: PseudonymizerRule): for dotted_field, regex in rule.pseudonyms.items(): diff --git a/tests/unit/processor/pseudonymizer/test_pseudonymizer.py b/tests/unit/processor/pseudonymizer/test_pseudonymizer.py index c4b4528f1..623de0c3d 100644 --- a/tests/unit/processor/pseudonymizer/test_pseudonymizer.py +++ b/tests/unit/processor/pseudonymizer/test_pseudonymizer.py @@ -912,7 +912,6 @@ def test_process_returns_extra_output(self): "pseudonymizer": { "pseudonyms": { "winlog.event_data.param1": "RE_WHOLE_FIELD", - "winlog.event_data.param2": "RE_WHOLE_FIELD", } }, } @@ -923,7 +922,6 @@ def test_process_returns_extra_output(self): "provider_name": "Test456", "event_data": { "param1": "Pseudonymize me!", - "param2": "Pseudonymize me!", }, }, } @@ -936,9 +934,93 @@ def test_process_returns_extra_output(self): assert isinstance(extra_output[1], tuple) assert isinstance(extra_output[1][0], dict) assert extra_output[1][0] == {"kafka": "topic"}, "Output is set as in CONFIG" + assert len(extra_output[0]) == 1, "Should contain only one pseudonym" + assert extra_output[0][0].get("pseudonym"), "pseudonym is set" + assert extra_output[0][0].get("origin"), "encrypted original is set" + assert extra_output[0][0].get("@timestamp"), "timestamp is set if present in event" + + def test_extra_output_contains_only_one_pseudonym_even_if_pseudonym_appears_multiple_times_in_event( + self, + ): + rule_dict = { + "filter": "winlog.event_id: 1234 AND winlog.provider_name: Test456", + "pseudonymizer": { + "pseudonyms": { + "winlog.event_data.param1": "RE_WHOLE_FIELD", + "winlog.event_data.param2": "RE_WHOLE_FIELD", + } + }, + } + event = { + "@timestamp": "custom timestamp", + "winlog": { + "event_id": 1234, + "provider_name": "Test456", + "event_data": { + "param1": "Pseudonymize me - appears twice!", + "param2": "Pseudonymize me - appears twice!", + }, + }, + } + self._load_specific_rule(rule_dict) # First call + extra_output = self.object.process(event) + assert extra_output + assert isinstance(extra_output, tuple) + assert len(extra_output) == 2 + assert isinstance(extra_output[0], list) + assert isinstance(extra_output[1], tuple) + assert isinstance(extra_output[1][0], dict) + assert extra_output[1][0] == {"kafka": "topic"}, "Output is set as in CONFIG" + assert ( + len(extra_output[0]) == 1 + ), "Should contain only one pseudonym, as the value for both is the same" + assert extra_output[0][0].get("pseudonym"), "pseudonym is set" + assert extra_output[0][0].get("origin"), "encrypted original is set" + assert extra_output[0][0].get("@timestamp"), "timestamp is set if present in event" + + def test_extra_output_contains_different_pseudonyms_for_different_values(self): + rule_dict = { + "filter": "winlog.event_id: 1234 AND winlog.provider_name: Test456", + "pseudonymizer": { + "pseudonyms": { + "winlog.event_data.param1": "RE_WHOLE_FIELD", + "winlog.event_data.param2": "RE_WHOLE_FIELD", + } + }, + } + event = { + "@timestamp": "custom timestamp", + "winlog": { + "event_id": 1234, + "provider_name": "Test456", + "event_data": { + "param1": "Pseudonymize me - first!", + "param2": "Pseudonymize me - second!", + }, + }, + } + self._load_specific_rule(rule_dict) # First call + extra_output = self.object.process(event) + assert extra_output + assert isinstance(extra_output, tuple) + assert len(extra_output) == 2 + assert isinstance(extra_output[0], list) + assert isinstance(extra_output[1], tuple) + assert isinstance(extra_output[1][0], dict) + assert extra_output[1][0] == {"kafka": "topic"}, "Output is set as in CONFIG" + assert len(extra_output[0]) == 2, "Should contain two pseudonyms, for each value one" assert extra_output[0][0].get("pseudonym"), "pseudonym is set" assert extra_output[0][0].get("origin"), "encrypted original is set" assert extra_output[0][0].get("@timestamp"), "timestamp is set if present in event" + assert extra_output[0][1].get("pseudonym"), "pseudonym is set" + assert extra_output[0][1].get("origin"), "encrypted original is set" + assert extra_output[0][1].get("@timestamp"), "timestamp is set if present in event" + assert extra_output[0][0].get("pseudonym") != extra_output[0][1].get( + "pseudonym" + ), "pseudonyms should differ" + assert extra_output[0][0].get("origin") != extra_output[0][1].get( + "origin" + ), "origins should differ" def test_ignores_missing_field(self): rule_dict = {