Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add recognizers (et, lt, pl) #1215

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@
from .au_tfn_recognizer import AuTfnRecognizer
from .au_medicare_recognizer import AuMedicareRecognizer
from .in_pan_recognizer import InPanRecognizer
from .pl_identity_card_recognizer import PlIdentityCardRecognizer
from .pl_pesel_recognizer import PlPeselRecognizer
from .et_ik_recognizer import EtIkRecognizer
from .lt_national_id_number_recognizer import LtNationalIdNumberRecognizer


NLP_RECOGNIZERS = {
Expand Down Expand Up @@ -75,5 +78,8 @@
"ItIdentityCardRecognizer",
"ItPassportRecognizer",
"InPanRecognizer",
"PlIdentityCardRecognizer",
"PlPeselRecognizer",
"EtIkRecognizer",
"LtNationalIdNumberRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class EtIkRecognizer(PatternRecognizer):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we / should we create a base class that implements the logic shared by all of these entities, and have the country-specific implementations inherit from it? What are your thoughts?

"""
Recognize national identification number (isikukood) using regex and checksum.

For more information:
https://en.wikipedia.org/wiki/National_identification_number#Estonia

:param patterns: List of patterns to be used by this recognizer
:param context: List of context words to increase confidence in detection
:param supported_language: Language this recognizer supports
:param supported_entity: The entity this recognizer can detect
"""

PATTERNS = [
Pattern(
"Isikukood",
r"[1-6][0-9]{2}(0[1-9]|1[012])([0][1-9]|[1-2][0-9]|3[0-1])[0-7][0-9]{3}",
0.3,
),
]

CONTEXT = ["isikukood", "IK"]

def __init__(
    self,
    patterns: Optional[List[Pattern]] = None,
    context: Optional[List[str]] = None,
    supported_language: str = "et",
    supported_entity: str = "ET_IK",
):
    """
    Set up the recognizer and delegate to the parent initializer.

    A ``patterns`` or ``context`` argument that is ``None`` or empty is
    replaced by the class-level ``PATTERNS`` / ``CONTEXT`` respectively.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """
    super().__init__(
        supported_entity=supported_entity,
        patterns=patterns or self.PATTERNS,
        context=context or self.CONTEXT,
        supported_language=supported_language,
    )

def validate_result(self, pattern_text: str) -> bool:
    """
    Check the control digit of an 11-digit isikukood candidate.

    The first ten digits are multiplied by the weights 1..9, 1 and the sum is
    reduced modulo 11. If that remainder is 10, the sum is recomputed with the
    weights 3..9, 1, 2, 3; a second remainder of 10 maps to control digit 0.

    :param pattern_text: Text matched by one of the recognizer's patterns
    :return: True if the eleventh digit equals the computed control digit;
        False otherwise, including when the text is not exactly 11 ASCII
        digits
    """
    # Caller-supplied patterns may match text of another shape; reject it
    # instead of failing with IndexError / ValueError further down.
    if len(pattern_text) != 11 or not all(
        "0" <= char <= "9" for char in pattern_text
    ):
        return False

    digits = [int(char) for char in pattern_text]
    payload, control = digits[:10], digits[10]

    first_weights = (1, 2, 3, 4, 5, 6, 7, 8, 9, 1)
    second_weights = (3, 4, 5, 6, 7, 8, 9, 1, 2, 3)

    remainder = sum(d * w for d, w in zip(payload, first_weights)) % 11
    if remainder == 10:
        # First stage is inconclusive: retry with the shifted weights.
        remainder = sum(d * w for d, w in zip(payload, second_weights)) % 11
        if remainder == 10:
            remainder = 0

    return remainder == control
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class LtNationalIdNumberRecognizer(PatternRecognizer):
"""
Recognize national identification number using regex and checksum.

For more information:
https://en.wikipedia.org/wiki/National_identification_number#Lithuania

:param patterns: List of patterns to be used by this recognizer
:param context: List of context words to increase confidence in detection
:param supported_language: Language this recognizer supports
:param supported_entity: The entity this recognizer can detect
"""

PATTERNS = [
Pattern(
"Asmens Kodas",
r"[1-6][0-9]{2}(0[1-9]|1[012])([0][1-9]|[1-2][0-9]|3[0-1])[0-9]{4}",
0.3,
),
]

CONTEXT = ["asmens kodas"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current implementation of context works better with unigrams. Can we separate this into "asmens", "kodas" or one of those, in addition to the existing "asmens kodas"?


def __init__(
    self,
    patterns: Optional[List[Pattern]] = None,
    context: Optional[List[str]] = None,
    supported_language: str = "lt",
    supported_entity: str = "LT_ASMENS_KODAS",
):
    """
    Set up the recognizer and delegate to the parent initializer.

    A ``patterns`` or ``context`` argument that is ``None`` or empty is
    replaced by the class-level ``PATTERNS`` / ``CONTEXT`` respectively.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """
    super().__init__(
        supported_entity=supported_entity,
        patterns=patterns or self.PATTERNS,
        context=context or self.CONTEXT,
        supported_language=supported_language,
    )

def validate_result(self, pattern_text: str) -> bool:
    """
    Check the control digit of an 11-digit personal code candidate.

    The first ten digits are multiplied by the weights 1..9, 1 and the sum is
    reduced modulo 11. If that remainder is 10, the sum is recomputed with the
    weights 3..9, 1, 2, 3; a second remainder of 10 maps to control digit 0.

    :param pattern_text: Text matched by one of the recognizer's patterns
    :return: True if the eleventh digit equals the computed control digit;
        False otherwise, including when the text is not exactly 11 ASCII
        digits
    """
    # Caller-supplied patterns may match text of another shape; reject it
    # instead of failing with IndexError / ValueError further down.
    if len(pattern_text) != 11 or not all(
        "0" <= char <= "9" for char in pattern_text
    ):
        return False

    digits = [int(char) for char in pattern_text]
    payload, control = digits[:10], digits[10]

    first_weights = (1, 2, 3, 4, 5, 6, 7, 8, 9, 1)
    second_weights = (3, 4, 5, 6, 7, 8, 9, 1, 2, 3)

    remainder = sum(d * w for d, w in zip(payload, first_weights)) % 11
    if remainder == 10:
        # First stage is inconclusive: retry with the shifted weights.
        remainder = sum(d * w for d, w in zip(payload, second_weights)) % 11
        if remainder == 10:
            remainder = 0

    return remainder == control
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import List, Optional

from presidio_analyzer import Pattern, PatternRecognizer


class PlIdentityCardRecognizer(PatternRecognizer):
"""
Recognize Polish identity card number using regex and checksum.

:param patterns: List of patterns to be used by this recognizer
:param context: List of context words to increase confidence in detection
:param supported_language: Language this recognizer supports
:param supported_entity: The entity this recognizer can detect
"""

PATTERNS = [
Pattern(
"Identity Card",
r"[A-Z]{3}[0-9]{6}",
0.1,
),
]

CONTEXT = ["numer dowodu"]

def __init__(
    self,
    patterns: Optional[List[Pattern]] = None,
    context: Optional[List[str]] = None,
    supported_language: str = "pl",
    supported_entity: str = "PL_IDENTITY_CARD",
):
    """
    Set up the recognizer and delegate to the parent initializer.

    A ``patterns`` or ``context`` argument that is ``None`` or empty is
    replaced by the class-level ``PATTERNS`` / ``CONTEXT`` respectively.

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """
    super().__init__(
        supported_entity=supported_entity,
        patterns=patterns or self.PATTERNS,
        context=context or self.CONTEXT,
        supported_language=supported_language,
    )

def validate_result(self, pattern_text: str) -> bool:
    """
    Check the control digit of a Polish identity card number candidate.

    After spaces are removed the candidate must consist of three uppercase
    letters followed by six digits. The first digit is the control digit.
    Letters are valued ``A`` = 10 ... ``Z`` = 35; the letters are weighted
    7, 3, 1 and the five digits after the control digit 7, 3, 1, 7, 3. The
    weighted sum modulo 10 must equal the control digit.

    :param pattern_text: Text matched by one of the recognizer's patterns
    :return: True if the control digit matches; False otherwise, including
        when the candidate does not have the expected shape
    """
    candidate = pattern_text.replace(" ", "")
    series, numeric_part = candidate[:3], candidate[3:]

    # Reject malformed candidates up front. Without this check a lowercase
    # series is silently given wrong letter values (and can pass the checksum
    # by accident), and a short candidate raises IndexError.
    # NOTE(review): lowercase input can reach this method if the base
    # recognizer matches case-insensitively — confirm against its regex flags.
    if (
        len(candidate) != 9
        or not all("A" <= char <= "Z" for char in series)
        or not all("0" <= char <= "9" for char in numeric_part)
    ):
        return False

    letter_offset = 55  # ord("A") - 55 == 10, so A..Z map to 10..35
    weights_letters = (7, 3, 1)
    weights_digits = (7, 3, 1, 7, 3)

    letter_values = [ord(char) - letter_offset for char in series]
    control = int(numeric_part[0])
    serial_values = [int(char) for char in numeric_part[1:]]

    checksum = sum(
        value * weight for value, weight in zip(letter_values, weights_letters)
    )
    checksum += sum(
        value * weight for value, weight in zip(serial_values, weights_digits)
    )

    return checksum % 10 == control
7 changes: 6 additions & 1 deletion presidio-analyzer/presidio_analyzer/recognizer_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@
ItPassportRecognizer,
ItIdentityCardRecognizer,
InPanRecognizer,
PlIdentityCardRecognizer,
PlPeselRecognizer,
EtIkRecognizer,
LtNationalIdNumberRecognizer,
)

logger = logging.getLogger("presidio-analyzer")
Expand Down Expand Up @@ -103,14 +106,16 @@ def load_predefined_recognizers(
InPanRecognizer,
],
"es": [EsNifRecognizer],
"et": [EtIkRecognizer],
"it": [
ItDriverLicenseRecognizer,
ItFiscalCodeRecognizer,
ItVatCodeRecognizer,
ItIdentityCardRecognizer,
ItPassportRecognizer,
],
"pl": [PlPeselRecognizer],
"lt": [LtNationalIdNumberRecognizer],
"pl": [PlIdentityCardRecognizer, PlPeselRecognizer],
"ALL": [
CreditCardRecognizer,
CryptoRecognizer,
Expand Down
35 changes: 35 additions & 0 deletions presidio-analyzer/tests/test_et_ik_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import EtIkRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Return a default-constructed EtIkRecognizer (module-scoped fixture)."""
    return EtIkRecognizer()


@pytest.fixture(scope="module")
def entities():
    """Return the entity names passed to the recognizer in these tests."""
    return ["ET_IK"]


@pytest.mark.parametrize(
"text, expected_len, expected_positions",
[
# fmt: off
# valid isikukood (control digit matches)
("37102250382", 1, ((0, 11),),),
# invalid isikukood candidates
("37132250382", 0, ()),
("99999999999", 0, ()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a test case with surrounding text

# fmt: on
],
)
def test_when_all_et_ik_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """Analyze ``text`` and check the number of results and each result's span."""
    hits = recognizer.analyze(text, entities)
    assert len(hits) == expected_len
    for hit, span in zip(hits, expected_positions):
        start, end = span
        assert_result(hit, entities[0], start, end, max_score)
35 changes: 35 additions & 0 deletions presidio-analyzer/tests/test_lt_national_id_number_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import LtNationalIdNumberRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Return a default-constructed LtNationalIdNumberRecognizer (module scope)."""
    return LtNationalIdNumberRecognizer()


@pytest.fixture(scope="module")
def entities():
    """Return the entity names passed to the recognizer in these tests."""
    return ["LT_ASMENS_KODAS"]


@pytest.mark.parametrize(
"text, expected_len, expected_positions",
[
# fmt: off
# valid asmens kodas (control digit matches)
("33309240064", 1, ((0, 11),),),
# invalid asmens kodas candidates
("33309240063", 0, ()),
("99999999999", 0, ()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, please add a test case with surrounding text

# fmt: on
],
)
def test_when_all_lt_numbers_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """Analyze ``text`` and check the number of results and each result's span."""
    hits = recognizer.analyze(text, entities)
    assert len(hits) == expected_len
    for hit, span in zip(hits, expected_positions):
        start, end = span
        assert_result(hit, entities[0], start, end, max_score)
38 changes: 38 additions & 0 deletions presidio-analyzer/tests/test_pl_identity_card_recognizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pytest

from tests import assert_result
from presidio_analyzer.predefined_recognizers import PlIdentityCardRecognizer


@pytest.fixture(scope="module")
def recognizer():
    """Return a default-constructed PlIdentityCardRecognizer (module scope)."""
    return PlIdentityCardRecognizer()


@pytest.fixture(scope="module")
def entities():
    """Return the entity names passed to the recognizer in these tests."""
    return ["PL_IDENTITY_CARD"]


@pytest.mark.parametrize(
"text, expected_len, expected_positions",
[
# fmt: off
# valid identity card scores
# example from: https://www.gov.pl/web/gov/dowod-osobisty-informacje
("ZZC108201", 1, ((0, 9),),),
# invalid identity card scores
("123123456", 0, ()),
("ABCD12345", 0, ()),
("ABC-123456", 0, ()),
("zzc108201", 0, ()),
# fmt: on
],
)
def test_when_all_pl_identity_card_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """Analyze ``text`` and check the number of results and each result's span."""
    hits = recognizer.analyze(text, entities)
    assert len(hits) == expected_len
    for hit, span in zip(hits, expected_positions):
        start, end = span
        assert_result(hit, entities[0], start, end, max_score)