From e3045986942e8f33656be7e76a8e7754c2b7016b Mon Sep 17 00:00:00 2001
From: bckamil <>
Date: Mon, 20 Nov 2023 20:31:15 +0100
Subject: [PATCH] Add recognizers

Add pattern recognizers, each backed by a checksum validation, for:

- the Estonian national identification number (isikukood):
  entity ET_IK, language "et"
- the Lithuanian national identification number (asmens kodas):
  entity LT_ASMENS_KODAS, language "lt"
- the Polish identity card number:
  entity PL_IDENTITY_CARD, language "pl"

Export them from predefined_recognizers, load them per language in the
recognizer registry and add tests for each of them.

---
 .../predefined_recognizers/__init__.py        |  6 ++
 .../et_ik_recognizer.py                       | 67 ++++++++++++++++++
 .../lt_national_id_number_recognizer.py       | 67 ++++++++++++++++++
 .../pl_identity_card_recognizer.py            | 72 +++++++++++++++++++
 .../presidio_analyzer/recognizer_registry.py  |  7 ++-
 .../tests/test_et_ik_recognizer.py            | 37 ++++++++++
 .../test_lt_national_id_number_recognizer.py  | 35 +++++++++
 .../tests/test_pl_identity_card_recognizer.py | 42 +++++++++++
 8 files changed, 332 insertions(+), 1 deletion(-)
 create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/et_ik_recognizer.py
 create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/lt_national_id_number_recognizer.py
 create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/pl_identity_card_recognizer.py
 create mode 100644 presidio-analyzer/tests/test_et_ik_recognizer.py
 create mode 100644 presidio-analyzer/tests/test_lt_national_id_number_recognizer.py
 create mode 100644 presidio-analyzer/tests/test_pl_identity_card_recognizer.py

diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
index cf07a16f4..9f03e35c3 100644
--- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py
@@ -33,7 +33,10 @@
 from .au_tfn_recognizer import AuTfnRecognizer
 from .au_medicare_recognizer import AuMedicareRecognizer
 from .in_pan_recognizer import InPanRecognizer
+from .pl_identity_card_recognizer import PlIdentityCardRecognizer
 from .pl_pesel_recognizer import PlPeselRecognizer
+from .et_ik_recognizer import EtIkRecognizer
+from .lt_national_id_number_recognizer import LtNationalIdNumberRecognizer
 
 
 NLP_RECOGNIZERS = {
@@ -75,5 +78,8 @@
     "ItIdentityCardRecognizer",
     "ItPassportRecognizer",
     "InPanRecognizer",
+    "PlIdentityCardRecognizer",
     "PlPeselRecognizer",
+    "EtIkRecognizer",
+    "LtNationalIdNumberRecognizer",
 ]
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/et_ik_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/et_ik_recognizer.py
new file mode 100644
index 000000000..9f1dae5c9
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/et_ik_recognizer.py
@@ -0,0 +1,67 @@
+from typing import List, Optional
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class EtIkRecognizer(PatternRecognizer):
+    """
+    Recognize national identification number (isikukood) using regex and checksum.
+
+    For more information:
+    https://en.wikipedia.org/wiki/National_identification_number#Estonia
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    """
+
+    PATTERNS = [
+        Pattern(
+            "Isikukood",
+            r"[1-6][0-9]{2}(0[1-9]|1[012])([0][1-9]|[1-2][0-9]|3[0-1])[0-7][0-9]{3}",
+            0.3,
+        ),
+    ]
+
+    CONTEXT = ["isikukood", "IK"]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "et",
+        supported_entity: str = "ET_IK",
+    ):
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
+
+    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
+        digits = [int(digit) for digit in pattern_text]
+        # Weights for the first and, if needed, the second checksum round.
+        weights = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1]
+        weights_2 = [3, 4, 5, 6, 7, 8, 9, 1, 2, 3]
+
+        # First round: weighted sum of the first ten digits, modulo 11.
+        checksum = (
+            sum(digit * weight for digit, weight in zip(digits[:10], weights)) % 11
+        )
+
+        if checksum < 10:
+            return checksum == digits[10]
+
+        # A remainder of 10 cannot be a check digit, so retry with the second
+        # weights; a second remainder of 10 means the check digit must be 0.
+        checksum = (
+            sum(digit * weight for digit, weight in zip(digits[:10], weights_2)) % 11
+        )
+        if checksum == 10:
+            checksum = 0
+
+        return checksum == digits[10]
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/lt_national_id_number_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/lt_national_id_number_recognizer.py
new file mode 100644
index 000000000..9c5f3f5ba
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/lt_national_id_number_recognizer.py
@@ -0,0 +1,67 @@
+from typing import List, Optional
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class LtNationalIdNumberRecognizer(PatternRecognizer):
+    """
+    Recognize national identification number using regex and checksum.
+
+    For more information:
+    https://en.wikipedia.org/wiki/National_identification_number#Lithuania
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    """
+
+    PATTERNS = [
+        Pattern(
+            "Asmens Kodas",
+            r"[1-6][0-9]{2}(0[1-9]|1[012])([0][1-9]|[1-2][0-9]|3[0-1])[0-9]{4}",
+            0.3,
+        ),
+    ]
+
+    CONTEXT = ["asmens kodas"]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "lt",
+        supported_entity: str = "LT_ASMENS_KODAS",
+    ):
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
+
+    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
+        digits = [int(digit) for digit in pattern_text]
+        # Weights for the first and, if needed, the second checksum round.
+        weights = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1]
+        weights_2 = [3, 4, 5, 6, 7, 8, 9, 1, 2, 3]
+
+        # First round: weighted sum of the first ten digits, modulo 11.
+        checksum = (
+            sum(digit * weight for digit, weight in zip(digits[:10], weights)) % 11
+        )
+
+        if checksum < 10:
+            return checksum == digits[10]
+
+        # A remainder of 10 cannot be a check digit, so retry with the second
+        # weights; a second remainder of 10 means the check digit must be 0.
+        checksum = (
+            sum(digit * weight for digit, weight in zip(digits[:10], weights_2)) % 11
+        )
+        if checksum == 10:
+            checksum = 0
+
+        return checksum == digits[10]
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/pl_identity_card_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/pl_identity_card_recognizer.py
new file mode 100644
index 000000000..e28cbe592
--- /dev/null
+++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/pl_identity_card_recognizer.py
@@ -0,0 +1,72 @@
+from typing import List, Optional
+
+from presidio_analyzer import Pattern, PatternRecognizer
+
+
+class PlIdentityCardRecognizer(PatternRecognizer):
+    """
+    Recognize Polish identity card number using regex and checksum.
+
+    :param patterns: List of patterns to be used by this recognizer
+    :param context: List of context words to increase confidence in detection
+    :param supported_language: Language this recognizer supports
+    :param supported_entity: The entity this recognizer can detect
+    """
+
+    PATTERNS = [
+        Pattern(
+            "Identity Card",
+            r"[A-Z]{3}[0-9]{6}",
+            0.1,
+        ),
+    ]
+
+    CONTEXT = ["numer dowodu"]
+
+    def __init__(
+        self,
+        patterns: Optional[List[Pattern]] = None,
+        context: Optional[List[str]] = None,
+        supported_language: str = "pl",
+        supported_entity: str = "PL_IDENTITY_CARD",
+    ):
+        patterns = patterns if patterns else self.PATTERNS
+        context = context if context else self.CONTEXT
+        super().__init__(
+            supported_entity=supported_entity,
+            patterns=patterns,
+            context=context,
+            supported_language=supported_language,
+        )
+
+    def validate_result(self, pattern_text: str) -> bool:  # noqa D102
+        pattern_text = pattern_text.replace(" ", "")
+
+        # Regex flags can make the match case-insensitive, and callers may pass
+        # custom patterns, so check the exact shape before doing arithmetic:
+        # three uppercase ASCII letters followed by six ASCII digits.
+        series, number = pattern_text[:3], pattern_text[3:]
+        if not (
+            len(pattern_text) == 9
+            and all("A" <= char <= "Z" for char in series)
+            and all("0" <= char <= "9" for char in number)
+        ):
+            return False
+
+        # Letters are valued A=10 ... Z=35, i.e. ord(letter) - 55.
+        letter_offset = 55
+        weights_letters = [7, 3, 1]
+        weights_digits = [7, 3, 1, 7, 3]
+
+        group_letters = [ord(a) - letter_offset for a in series]
+        # number[0] is the check digit; only the five digits after it are weighted.
+        group_digits = [int(a) for a in number[1:]]
+
+        checksum = sum(
+            value * weight for value, weight in zip(group_letters, weights_letters)
+        )
+        checksum += sum(
+            value * weight for value, weight in zip(group_digits, weights_digits)
+        )
+
+        return checksum % 10 == int(number[0])
diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry.py
index 2f1f09833..e53123982 100644
--- a/presidio-analyzer/presidio_analyzer/recognizer_registry.py
+++ b/presidio-analyzer/presidio_analyzer/recognizer_registry.py
@@ -44,7 +44,10 @@
     ItPassportRecognizer,
     ItIdentityCardRecognizer,
     InPanRecognizer,
+    PlIdentityCardRecognizer,
     PlPeselRecognizer,
+    EtIkRecognizer,
+    LtNationalIdNumberRecognizer,
 )
 
 logger = logging.getLogger("presidio-analyzer")
@@ -103,6 +106,7 @@ def load_predefined_recognizers(
                 InPanRecognizer,
             ],
             "es": [EsNifRecognizer],
+            "et": [EtIkRecognizer],
             "it": [
                 ItDriverLicenseRecognizer,
                 ItFiscalCodeRecognizer,
@@ -110,7 +114,8 @@ def load_predefined_recognizers(
                 ItIdentityCardRecognizer,
                 ItPassportRecognizer,
             ],
-            "pl": [PlPeselRecognizer],
+            "lt": [LtNationalIdNumberRecognizer],
+            "pl": [PlIdentityCardRecognizer, PlPeselRecognizer],
             "ALL": [
                 CreditCardRecognizer,
                 CryptoRecognizer,
diff --git a/presidio-analyzer/tests/test_et_ik_recognizer.py b/presidio-analyzer/tests/test_et_ik_recognizer.py
new file mode 100644
index 000000000..744f7a3b2
--- /dev/null
+++ b/presidio-analyzer/tests/test_et_ik_recognizer.py
@@ -0,0 +1,37 @@
+import pytest
+
+from tests import assert_result
+from presidio_analyzer.predefined_recognizers import EtIkRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return EtIkRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["ET_IK"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions",
+    [
+        # fmt: off
+        # valid isikukood
+        ("37102250382", 1, ((0, 11),),),
+        # invalid isikukood: rejected by the regex
+        ("37132250382", 0, ()),
+        ("99999999999", 0, ()),
+        # invalid isikukood: well-formed but wrong check digit
+        ("37102250383", 0, ()),
+        # fmt: on
+    ],
+)
+def test_when_all_et_ik_then_succeed(
+    text, expected_len, expected_positions, recognizer, entities, max_score
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos) in zip(results, expected_positions):
+        assert_result(res, entities[0], st_pos, fn_pos, max_score)
diff --git a/presidio-analyzer/tests/test_lt_national_id_number_recognizer.py b/presidio-analyzer/tests/test_lt_national_id_number_recognizer.py
new file mode 100644
index 000000000..1391e8069
--- /dev/null
+++ b/presidio-analyzer/tests/test_lt_national_id_number_recognizer.py
@@ -0,0 +1,35 @@
+import pytest
+
+from tests import assert_result
+from presidio_analyzer.predefined_recognizers import LtNationalIdNumberRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return LtNationalIdNumberRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["LT_ASMENS_KODAS"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions",
+    [
+        # fmt: off
+        # valid asmens kodas
+        ("33309240064", 1, ((0, 11),),),
+        # invalid asmens kodas
+        ("33309240063", 0, ()),
+        ("99999999999", 0, ()),
+        # fmt: on
+    ],
+)
+def test_when_all_lt_numbers_then_succeed(
+    text, expected_len, expected_positions, recognizer, entities, max_score
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos) in zip(results, expected_positions):
+        assert_result(res, entities[0], st_pos, fn_pos, max_score)
diff --git a/presidio-analyzer/tests/test_pl_identity_card_recognizer.py b/presidio-analyzer/tests/test_pl_identity_card_recognizer.py
new file mode 100644
index 000000000..8cdb407e4
--- /dev/null
+++ b/presidio-analyzer/tests/test_pl_identity_card_recognizer.py
@@ -0,0 +1,42 @@
+import pytest
+
+from tests import assert_result
+from presidio_analyzer.predefined_recognizers import PlIdentityCardRecognizer
+
+
+@pytest.fixture(scope="module")
+def recognizer():
+    return PlIdentityCardRecognizer()
+
+
+@pytest.fixture(scope="module")
+def entities():
+    return ["PL_IDENTITY_CARD"]
+
+
+@pytest.mark.parametrize(
+    "text, expected_len, expected_positions",
+    [
+        # fmt: off
+        # valid identity card numbers
+        # example from: https://www.gov.pl/web/gov/dowod-osobisty-informacje
+        ("ZZC108201", 1, ((0, 9),),),
+        # invalid identity card numbers
+        ("123123456", 0, ()),
+        ("ABCD12345", 0, ()),
+        ("ABC-123456", 0, ()),
+        ("zzc108201", 0, ()),
+        # well-formed but wrong check digit
+        ("ZZC108202", 0, ()),
+        # lowercase text whose letters would pass a naive checksum
+        ("zzc308201", 0, ()),
+        # fmt: on
+    ],
+)
+def test_when_all_pl_identity_card_then_succeed(
+    text, expected_len, expected_positions, recognizer, entities, max_score
+):
+    results = recognizer.analyze(text, entities)
+    assert len(results) == expected_len
+    for res, (st_pos, fn_pos) in zip(results, expected_positions):
+        assert_result(res, entities[0], st_pos, fn_pos, max_score)