diff --git a/changelog.md b/changelog.md index 97101019f..2e0342bef 100644 --- a/changelog.md +++ b/changelog.md @@ -1,8 +1,14 @@ # Changelog +## v0.9.2 + +### Changed + +- Fix matchers to skip pipes whose assigned extensions are not required by the matcher during initialization + ## v0.9.1 -## Changed +### Changed - Improve negation patterns - Abstent disorders now set the negation to True when matched as `ABSENT` diff --git a/edsnlp/matchers/phrase.pyx b/edsnlp/matchers/phrase.pyx index 5f4b0d9ee..51fea5558 100644 --- a/edsnlp/matchers/phrase.pyx +++ b/edsnlp/matchers/phrase.pyx @@ -12,8 +12,7 @@ from spacy.tokens.token cimport Token from spacy.typedefs cimport attr_t from spacy.vocab cimport Vocab -from edsnlp.matchers.utils import Patterns - +from edsnlp.matchers.utils import Patterns, normalize_token_attr def get_normalized_variant(doclike) -> str: tokens = [t.text + t.whitespace_ for t in doclike if not t._.excluded] @@ -88,13 +87,15 @@ cdef class EDSPhraseMatcher(PhraseMatcher): if not terms: terms = dict() + matched_attr = normalize_token_attr(self.vocab.strings[self.attr]) + assert matched_attr is not None, "Unsupported attribute for matching" token_pipelines = [ name for name, pipe in nlp.pipeline - if any( - "token" in assign and not assign == "token.is_sent_start" - for assign in nlp.get_pipe_meta(name).assigns - ) + if name not in nlp.disabled and matched_attr in { + normalize_token_attr(ass) + for ass in nlp.get_pipe_meta(name).assigns + } ] with nlp.select_pipes(enable=token_pipelines): for key, expressions in (tqdm( diff --git a/edsnlp/matchers/simstring.py b/edsnlp/matchers/simstring.py index 8a625225c..cd0f31f20 100644 --- a/edsnlp/matchers/simstring.py +++ b/edsnlp/matchers/simstring.py @@ -13,7 +13,7 @@ from spacy.tokens import Doc, Span from tqdm import tqdm -from edsnlp.matchers.utils import ATTRIBUTES, get_text +from edsnlp.matchers.utils import ATTRIBUTES, get_text, normalize_token_attr class SimstringWriter: @@ -129,13 +129,15 @@ 
def build_patterns( self.syn2cuis = None syn2cuis = defaultdict(lambda: []) + + matched_attr = normalize_token_attr(self.vocab.strings[self.attr]) + assert matched_attr is not None, "Unsupported attribute for matching" token_pipelines = [ name for name, pipe in nlp.pipeline - if any( - "token" in assign and not assign == "token.is_sent_start" - for assign in nlp.get_pipe_meta(name).assigns - ) + if name not in nlp.disabled + and matched_attr + in {normalize_token_attr(ass) for ass in nlp.get_pipe_meta(name).assigns} ] with nlp.select_pipes(enable=token_pipelines): with SimstringWriter(self.path) as ss_db: diff --git a/edsnlp/matchers/utils/__init__.py b/edsnlp/matchers/utils/__init__.py index b975d6166..d8888a4a3 100644 --- a/edsnlp/matchers/utils/__init__.py +++ b/edsnlp/matchers/utils/__init__.py @@ -4,6 +4,15 @@ DictOrPattern = Union[Dict[str, ListOrStr], ListOrStr] Patterns = Dict[str, DictOrPattern] + +def normalize_token_attr(attr): + if attr.startswith("doc.") or attr.startswith("span."): + return None + attr = attr.replace("token.", "") + lower = attr.replace("_", "").lower() + return "text" if lower == "orth" else lower + + ATTRIBUTES = { "LOWER": "lower_", "TEXT": "text",