Skip to content

Commit

Permalink
fix: detect tnm entities followed by a space
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed Dec 14, 2023
1 parent d43fbd9 commit fb62ae3
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 8 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- Microgram scale is now correctly 1/1000g and inverse meter now 1/100 inverse cm.
- We now isolate some of edsnlp components (trainable pipes that require ml dependencies)
in a new `edsnlp_factories` entry points to prevent spacy from auto-importing them.
- TNM scores followed by a space are now correctly detected.

## v0.10.0

Expand Down
10 changes: 5 additions & 5 deletions edsnlp/pipes/ner/tnm/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@

# Regex fragment for a staging-system/version mention: an optional "(", one of
# uicc/accj/tnm (all-lowercase or all-uppercase, captured as `version`),
# whitespace, an optional "éditions"/"éd." style marker, then a 4- or 2-digit
# year (captured as `version_year`) and an optional ")".
version_pattern = (
r"\(?(?P<version>uicc|accj|tnm|UICC|ACCJ|TNM)"
# NOTE(review): this text comes from a rendered diff; the next two literals
# look like the removed (greedy `\s*`) and added (lazy `\s*?`) versions of the
# same line. Concatenated as written they would require the whitespace/edition
# group twice -- confirm against the real file that only one of them is kept.
r"\s+([éeE]ditions|[éeE]d\.?)?\s*"
r"\s+([éeE]ditions|[éeE]d\.?)?\s*?"
r"(?P<version_year>\d{4}|\d{2})\)?"
)

# Filler allowed between the TNM score and a version mention: 1 to 5
# characters of any kind, newlines included.
spacer = r"(.|\n){1,5}"

# Assemble the full TNM regex piece by piece: an optional look-behind on a
# leading version mention, then the prefix and tumour parts, followed by the
# optional node, metastasis and resection-completeness parts.
tnm_pattern = f"(?<={version_pattern}{spacer})?"
# NOTE(review): this text comes from a rendered diff; the next four lines
# (greedy `\s*` separators) and the four after them (lazy `\s*?` separators)
# look like the removed and added versions of the same statements. Executed as
# written, each component would be appended twice -- confirm against the real
# file that only one group is kept.
tnm_pattern += prefix_pattern + r"\s*" + f"({tumour_pattern})"
tnm_pattern += r"\s*" + f"({node_pattern})?"
tnm_pattern += r"\s*" + f"({metastasis_pattern})?"
tnm_pattern += r"\s*" + f"({resection_completeness})?"
tnm_pattern += prefix_pattern + r"\s*?" + f"({tumour_pattern})"
tnm_pattern += r"\s*?" + f"({node_pattern})?"
tnm_pattern += r"\s*?" + f"({metastasis_pattern})?"
tnm_pattern += r"\s*?" + f"({resection_completeness})?"
# Optional trailing version mention, separated from the score by `spacer`.
tnm_pattern += f"({spacer}{version_pattern})?"
# Require the whole match to start and end on a word boundary or at the
# start/end of the text.
tnm_pattern = r"(?:\b|^)" + tnm_pattern + r"(?:\b|$)"
2 changes: 1 addition & 1 deletion edsnlp/pipes/ner/tnm/tnm.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def __init__(
if isinstance(pattern, str):
pattern = [pattern]

self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="strict")
self.regex_matcher = RegexMatcher(attr=attr, alignment_mode="expand")
self.regex_matcher.add(self.label, pattern)

def set_extensions(self) -> None:
Expand Down
5 changes: 3 additions & 2 deletions tests/pipelines/ner/test_tnm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,20 @@
"TNM: <ent norm=pT1mN1M0>pT1(m)N1 M0</ent>",
"TNM: <ent norm=pT1bN0sn>pT1bN0(sn)</ent>",
"TNM: <ent norm=pT1N1M0>pT1 pN1 M0</ent>",
"TNM: <ent norm=aTxN1M0>aTxN1M0</ent> ",
]


def test_scores(blank_nlp):
    """Run ``eds.tnm`` on every annotated example and compare normalized values.

    For each example, the pipeline must produce as many entities as the
    annotation contains, and the normalized value of each produced entity must
    equal the first modifier value of the matching annotated entity.
    """
    blank_nlp.add_pipe("eds.tnm")

    for annotated in examples:
        raw_text, expected_entities = parse_example(example=annotated)
        predicted = blank_nlp(raw_text).ents

        assert len(expected_entities) == len(predicted)

        # Annotated and predicted entities are compared pairwise, in order.
        for expected, found in zip(expected_entities, predicted):
            assert expected.modifiers[0].value == found._.value.norm()

0 comments on commit fb62ae3

Please sign in to comment.