-
Notifications
You must be signed in to change notification settings - Fork 586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add recognizers (et, lt, pl) #1215
base: main
Are you sure you want to change the base?
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from typing import List, Optional | ||
|
||
from presidio_analyzer import Pattern, PatternRecognizer | ||
|
||
|
||
class EtIkRecognizer(PatternRecognizer):
    """
    Recognize Estonian national identification numbers (isikukood).

    Candidates are located with a regex and then confirmed with the
    two-round modulo-11 checksum implemented in ``validate_result``.

    For more information:
    https://en.wikipedia.org/wiki/National_identification_number#Estonia

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "Isikukood",
            # The digit lookarounds keep the pattern from matching an 11-digit
            # slice taken out of a longer run of digits.
            r"(?<![0-9])[1-6][0-9]{2}(0[1-9]|1[012])"
            r"([0][1-9]|[1-2][0-9]|3[0-1])[0-7][0-9]{3}(?![0-9])",
            0.3,
        ),
    ]

    CONTEXT = ["isikukood", "IK"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "et",
        supported_entity: str = "ET_IK",
    ):
        # Fall back to the class-level defaults when nothing (or an empty
        # list) is supplied by the caller.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Verify the check digit of an 11-digit candidate.

        The first ten digits are multiplied by a first set of weights and
        summed; the sum modulo 11 is the expected check digit. When that
        remainder is 10 the computation is repeated with a second set of
        weights, and a remainder of 10 in the second round maps to 0.

        :param pattern_text: Text matched by one of the patterns
        :return: True if the last digit equals the computed check digit,
            False otherwise (including text that is not exactly 11 digits)
        """
        # Custom patterns may hand over text of a different shape; reject it
        # instead of failing with ValueError / IndexError below.
        if len(pattern_text) != 11 or not pattern_text.isdecimal():
            return False

        *payload, check_digit = (int(char) for char in pattern_text)
        first_round = (1, 2, 3, 4, 5, 6, 7, 8, 9, 1)
        second_round = (3, 4, 5, 6, 7, 8, 9, 1, 2, 3)

        remainder = sum(d * w for d, w in zip(payload, first_round)) % 11
        if remainder == 10:
            remainder = sum(d * w for d, w in zip(payload, second_round)) % 11
            if remainder == 10:
                remainder = 0

        return remainder == check_digit
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from typing import List, Optional | ||
|
||
from presidio_analyzer import Pattern, PatternRecognizer | ||
|
||
|
||
class LtNationalIdNumberRecognizer(PatternRecognizer):
    """
    Recognize Lithuanian national identification numbers (asmens kodas).

    Candidates are located with a regex and then confirmed with the
    two-round modulo-11 checksum implemented in ``validate_result``.

    For more information:
    https://en.wikipedia.org/wiki/National_identification_number#Lithuania

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "Asmens Kodas",
            # The digit lookarounds keep the pattern from matching an 11-digit
            # slice taken out of a longer run of digits.
            r"(?<![0-9])[1-6][0-9]{2}(0[1-9]|1[012])"
            r"([0][1-9]|[1-2][0-9]|3[0-1])[0-9]{4}(?![0-9])",
            0.3,
        ),
    ]

    # Context matching works better with unigrams, so the individual words are
    # listed alongside the full phrase.
    CONTEXT = ["asmens", "kodas", "asmens kodas"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "lt",
        supported_entity: str = "LT_ASMENS_KODAS",
    ):
        # Fall back to the class-level defaults when nothing (or an empty
        # list) is supplied by the caller.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Verify the check digit of an 11-digit candidate.

        The first ten digits are multiplied by a first set of weights and
        summed; the sum modulo 11 is the expected check digit. When that
        remainder is 10 the computation is repeated with a second set of
        weights, and a remainder of 10 in the second round maps to 0.

        :param pattern_text: Text matched by one of the patterns
        :return: True if the last digit equals the computed check digit,
            False otherwise (including text that is not exactly 11 digits)
        """
        # Custom patterns may hand over text of a different shape; reject it
        # instead of failing with ValueError / IndexError below.
        if len(pattern_text) != 11 or not pattern_text.isdecimal():
            return False

        *payload, check_digit = (int(char) for char in pattern_text)
        first_round = (1, 2, 3, 4, 5, 6, 7, 8, 9, 1)
        second_round = (3, 4, 5, 6, 7, 8, 9, 1, 2, 3)

        remainder = sum(d * w for d, w in zip(payload, first_round)) % 11
        if remainder == 10:
            remainder = sum(d * w for d, w in zip(payload, second_round)) % 11
            if remainder == 10:
                remainder = 0

        return remainder == check_digit
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from typing import List, Optional | ||
|
||
from presidio_analyzer import Pattern, PatternRecognizer | ||
|
||
|
||
class PlIdentityCardRecognizer(PatternRecognizer):
    """
    Recognize Polish identity card number using regex and checksum.

    A candidate consists of three uppercase letters followed by six digits;
    the first of those digits is a check digit computed over the remaining
    eight characters (see ``validate_result``).

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    """

    PATTERNS = [
        Pattern(
            "Identity Card",
            r"[A-Z]{3}[0-9]{6}",
            0.1,
        ),
    ]

    CONTEXT = ["numer dowodu"]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "pl",
        supported_entity: str = "PL_IDENTITY_CARD",
    ):
        # Fall back to the class-level defaults when nothing (or an empty
        # list) is supplied by the caller.
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
        )

    def validate_result(self, pattern_text: str) -> bool:
        """
        Verify the check digit of an identity card number.

        Spaces are removed first. Letters are converted to numbers
        (``A`` -> 10 ... ``Z`` -> 35), the three letters are weighted 7, 3, 1
        and the five trailing digits 7, 3, 1, 7, 3; the weighted sum modulo 10
        must equal the digit at index 3.

        :param pattern_text: Text matched by one of the patterns
        :return: True if the check digit is correct, False otherwise
            (including text that is not three uppercase letters followed by
            six digits)
        """
        pattern_text = pattern_text.replace(" ", "")

        series = pattern_text[:3]
        numeric_part = pattern_text[3:]
        # Reject anything that is not AAA999999 before doing arithmetic on it.
        # NOTE(review): the base recognizer is believed to apply patterns
        # case-insensitively by default, so lowercase letters can reach this
        # method and would otherwise be mapped to meaningless values - confirm
        # against the targeted presidio version.
        if (
            len(pattern_text) != 9
            or not all("A" <= char <= "Z" for char in series)
            or not numeric_part.isdecimal()
        ):
            return False

        letter_offset = 55  # ord("A") == 65, so "A" maps to 10
        weights_letters = [7, 3, 1]
        weights_digits = [7, 3, 1, 7, 3]

        group_letters = [ord(char) - letter_offset for char in series]
        # Index 3 holds the check digit itself and is excluded from the sum.
        group_digits = [int(char) for char in pattern_text[4:]]

        checksum = sum(
            value * weight for value, weight in zip(group_letters, weights_letters)
        )
        checksum += sum(
            value * weight for value, weight in zip(group_digits, weights_digits)
        )

        return checksum % 10 == int(pattern_text[3])
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import pytest | ||
|
||
from tests import assert_result | ||
from presidio_analyzer.predefined_recognizers import EtIkRecognizer | ||
|
||
|
||
@pytest.fixture(scope="module")
def recognizer():
    """Provide one EtIkRecognizer instance shared by the tests of this module."""
    et_ik_recognizer = EtIkRecognizer()
    return et_ik_recognizer
|
||
|
||
@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names requested from the recognizer."""
    requested_entities = ["ET_IK"]
    return requested_entities
|
||
|
||
@pytest.mark.parametrize(
    "text, expected_len, expected_positions",
    [
        # fmt: off
        # valid isikukood on its own
        ("37102250382", 1, ((0, 11),),),
        # valid isikukood surrounded by other text
        ("Minu isikukood on 37102250382.", 1, ((18, 29),),),
        # invalid isikukood: month 13 / leading digit outside 1-6
        ("37132250382", 0, ()),
        ("99999999999", 0, ()),
        # fmt: on
    ],
)
def test_when_all_et_ik_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """Check the number, span and score of the results found in ``text``."""
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len
    for res, (st_pos, fn_pos) in zip(results, expected_positions):
        assert_result(res, entities[0], st_pos, fn_pos, max_score)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import pytest | ||
|
||
from tests import assert_result | ||
from presidio_analyzer.predefined_recognizers import LtNationalIdNumberRecognizer | ||
|
||
|
||
@pytest.fixture(scope="module")
def recognizer():
    """Provide one LtNationalIdNumberRecognizer shared by this module's tests."""
    lt_recognizer = LtNationalIdNumberRecognizer()
    return lt_recognizer
|
||
|
||
@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names requested from the recognizer."""
    requested_entities = ["LT_ASMENS_KODAS"]
    return requested_entities
|
||
|
||
@pytest.mark.parametrize(
    "text, expected_len, expected_positions",
    [
        # fmt: off
        # valid asmens kodas on its own
        ("33309240064", 1, ((0, 11),),),
        # valid asmens kodas surrounded by other text
        ("Mano asmens kodas yra 33309240064.", 1, ((22, 33),),),
        # invalid asmens kodas: wrong check digit / leading digit outside 1-6
        ("33309240063", 0, ()),
        ("99999999999", 0, ()),
        # fmt: on
    ],
)
def test_when_all_lt_numbers_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """Check the number, span and score of the results found in ``text``."""
    results = recognizer.analyze(text, entities)
    assert len(results) == expected_len
    for res, (st_pos, fn_pos) in zip(results, expected_positions):
        assert_result(res, entities[0], st_pos, fn_pos, max_score)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import pytest | ||
|
||
from tests import assert_result | ||
from presidio_analyzer.predefined_recognizers import PlIdentityCardRecognizer | ||
|
||
|
||
@pytest.fixture(scope="module")
def recognizer():
    """Provide one PlIdentityCardRecognizer shared by this module's tests."""
    pl_recognizer = PlIdentityCardRecognizer()
    return pl_recognizer
|
||
|
||
@pytest.fixture(scope="module")
def entities():
    """Provide the list of entity names requested from the recognizer."""
    requested_entities = ["PL_IDENTITY_CARD"]
    return requested_entities
|
||
|
||
@pytest.mark.parametrize(
    "text, expected_len, expected_positions",
    [
        # fmt: off
        # Accepted: sample number published at
        # https://www.gov.pl/web/gov/dowod-osobisty-informacje
        ("ZZC108201", 1, ((0, 9),),),
        # Rejected: none of these may yield a result
        ("123123456", 0, ()),
        ("ABCD12345", 0, ()),
        ("ABC-123456", 0, ()),
        ("zzc108201", 0, ()),
        # fmt: on
    ],
)
def test_when_all_pl_identity_card_then_succeed(
    text, expected_len, expected_positions, recognizer, entities, max_score
):
    """Analyze ``text`` and compare result count, spans and score."""
    found = recognizer.analyze(text, entities)
    assert len(found) == expected_len
    for result, span in zip(found, expected_positions):
        start, end = span
        assert_result(result, entities[0], start, end, max_score)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we / should we create a base class that implements the logic shared by all of these entities, and have the country-specific implementations inherit from it? What are your thoughts?