fix: during their init, matchers skip pipes with non-required assigned extensions

percevalw committed Oct 24, 2023
1 parent d6d4c92 commit a2962c4
Showing 4 changed files with 30 additions and 12 deletions.
8 changes: 7 additions & 1 deletion changelog.md
@@ -1,8 +1,14 @@
 # Changelog

+## v0.9.2
+
+### Changed
+
+- Fix matchers to skip pipes with assigned extensions that are not required by the matcher during the initialization
+
 ## v0.9.1

-## Changed
+### Changed

 - Improve negation patterns
 - Abstent disorders now set the negation to True when matched as `ABSENT`
13 changes: 7 additions & 6 deletions edsnlp/matchers/phrase.pyx
@@ -12,8 +12,7 @@ from spacy.tokens.token cimport Token
 from spacy.typedefs cimport attr_t
 from spacy.vocab cimport Vocab

-from edsnlp.matchers.utils import Patterns
-
+from edsnlp.matchers.utils import Patterns, normalize_token_attr

 def get_normalized_variant(doclike) -> str:
     tokens = [t.text + t.whitespace_ for t in doclike if not t._.excluded]
@@ -88,13 +87,15 @@ cdef class EDSPhraseMatcher(PhraseMatcher):
         if not terms:
             terms = dict()

+        matched_attr = normalize_token_attr(self.vocab.strings[self.attr])
+        assert matched_attr is not None, "Unsupported attribute for matching"
         token_pipelines = [
             name
             for name, pipe in nlp.pipeline
-            if any(
-                "token" in assign and not assign == "token.is_sent_start"
-                for assign in nlp.get_pipe_meta(name).assigns
-            )
+            if name not in nlp.disabled and matched_attr in {
+                normalize_token_attr(ass)
+                for ass in nlp.get_pipe_meta(name).assigns
+            }
         ]
         with nlp.select_pipes(enable=token_pipelines):
             for key, expressions in (tqdm(
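The hunk above narrows the pipes that `EDSPhraseMatcher` temporarily enables while building its patterns: instead of every pipe that assigns any token attribute (other than `token.is_sent_start`), it now keeps only the pipes whose declared `assigns` normalize to the matcher's own attribute, and it never re-enables disabled pipes. Below is a minimal sketch, not part of the commit, illustrating the new filter for a matcher built with `attr="NORM"`; the toy components are hypothetical and `normalize_token_attr` is copied from the diff further down.

```python
import spacy
from spacy.language import Language


@Language.component("toy_norm", assigns=["token.norm"])
def toy_norm(doc):
    # stand-in for a normalization pipe: declares that it assigns token.norm
    return doc


@Language.component("toy_ner", assigns=["doc.ents"])
def toy_ner(doc):
    # stand-in for an NER pipe: assigns a doc-level attribute only
    return doc


def normalize_token_attr(attr):
    # copied from edsnlp/matchers/utils/__init__.py as added by this commit
    if attr.startswith("doc.") or attr.startswith("span."):
        return None
    attr = attr.replace("token.", "")
    lower = attr.replace("_", "").lower()
    return "text" if lower == "orth" else lower


nlp = spacy.blank("fr")
nlp.add_pipe("toy_norm")
nlp.add_pipe("toy_ner")

# What a matcher built with attr="NORM" computes during its init
matched_attr = normalize_token_attr("NORM")  # -> "norm"
token_pipelines = [
    name
    for name, pipe in nlp.pipeline
    if name not in nlp.disabled
    and matched_attr
    in {normalize_token_attr(ass) for ass in nlp.get_pipe_meta(name).assigns}
]
print(token_pipelines)  # ['toy_norm'] -- 'toy_ner' assigns no token attribute and is skipped
```

Only `toy_norm` ends up inside the matcher's `nlp.select_pipes(enable=token_pipelines)` block, so pipes that cannot affect the matched attribute no longer run over every term during initialization.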
12 changes: 7 additions & 5 deletions edsnlp/matchers/simstring.py
@@ -13,7 +13,7 @@
 from spacy.tokens import Doc, Span
 from tqdm import tqdm

-from edsnlp.matchers.utils import ATTRIBUTES, get_text
+from edsnlp.matchers.utils import ATTRIBUTES, get_text, normalize_token_attr


 class SimstringWriter:
@@ -129,13 +129,15 @@ def build_patterns(
         self.syn2cuis = None

         syn2cuis = defaultdict(lambda: [])
+
+        matched_attr = normalize_token_attr(self.vocab.strings[self.attr])
+        assert matched_attr is not None, "Unsupported attribute for matching"
         token_pipelines = [
             name
             for name, pipe in nlp.pipeline
-            if any(
-                "token" in assign and not assign == "token.is_sent_start"
-                for assign in nlp.get_pipe_meta(name).assigns
-            )
+            if name not in nlp.disabled
+            and matched_attr
+            in {normalize_token_attr(ass) for ass in nlp.get_pipe_meta(name).assigns}
         ]
         with nlp.select_pipes(enable=token_pipelines):
             with SimstringWriter(self.path) as ss_db:
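`SimstringMatcher.build_patterns` now applies the same selection, and the `name not in nlp.disabled` clause makes explicit that disabled pipes stay out of the selection rather than being silently re-enabled while patterns are built. A short sketch under the same assumptions (hypothetical toy components, helper copied from the diff):

```python
import spacy
from spacy.language import Language


@Language.component("norm_a", assigns=["token.norm"])
def norm_a(doc):
    return doc


@Language.component("norm_b", assigns=["token.norm"])
def norm_b(doc):
    return doc


def normalize_token_attr(attr):
    # copied from edsnlp/matchers/utils/__init__.py as added by this commit
    if attr.startswith("doc.") or attr.startswith("span."):
        return None
    attr = attr.replace("token.", "")
    lower = attr.replace("_", "").lower()
    return "text" if lower == "orth" else lower


nlp = spacy.blank("fr")
nlp.add_pipe("norm_a")
nlp.add_pipe("norm_b")
nlp.disable_pipe("norm_b")  # e.g. disabled by the user before patterns are built

matched_attr = normalize_token_attr("NORM")
token_pipelines = [
    name
    for name, pipe in nlp.pipeline
    if name not in nlp.disabled
    and matched_attr
    in {normalize_token_attr(ass) for ass in nlp.get_pipe_meta(name).assigns}
]
print(token_pipelines)  # ['norm_a'] -- the disabled pipe is left out

with nlp.select_pipes(enable=token_pipelines):
    nlp("insuffisance cardiaque")  # only the selected pipes run here, as in build_patterns
```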
9 changes: 9 additions & 0 deletions edsnlp/matchers/utils/__init__.py
@@ -4,6 +4,15 @@
 DictOrPattern = Union[Dict[str, ListOrStr], ListOrStr]
 Patterns = Dict[str, DictOrPattern]

+
+def normalize_token_attr(attr):
+    if attr.startswith("doc.") or attr.startswith("span."):
+        return None
+    attr = attr.replace("token.", "")
+    lower = attr.replace("_", "").lower()
+    return "text" if lower == "orth" else lower
+
+
 ATTRIBUTES = {
     "LOWER": "lower_",
     "TEXT": "text",
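A few illustrative calls to the new helper (hypothetical inputs; the expected values follow directly from the implementation above):

```python
from edsnlp.matchers.utils import normalize_token_attr

assert normalize_token_attr("token.norm_") == "norm"
assert normalize_token_attr("token.lower_") == "lower"
assert normalize_token_attr("ORTH") == "text"          # orth-like names map to "text"
assert normalize_token_attr("doc.ents") is None        # doc- and span-level assigns are ignored
assert normalize_token_attr("span._.negation") is None
```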
