fix: solve leading/trailing pollution cleaning in get_text
percevalw committed Dec 15, 2023
1 parent 2a8e9d7 commit 1faf768
Showing 5 changed files with 106 additions and 63 deletions.
4 changes: 4 additions & 0 deletions changelog.md
@@ -2,6 +2,10 @@

## Unreleased

### Changed

- Small regex matching performance improvement, up to 1.25x faster (e.g. `eds.measurements`)

### Fixed

- Microgram scale is now correctly 1/1,000,000 g and inverse meter is now 1/100 inverse cm.
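For reference, the unit arithmetic behind that fix, written out as a plain Python sanity check (an editorial note, not code from this commit):

micrograms_in_grams = 1e-6          # 1 µg = 1/1,000,000 g
inverse_meter_in_inverse_cm = 1e-2  # 1 m⁻¹ = 1/100 cm⁻¹
assert micrograms_in_grams == 1 / 1_000_000
assert inverse_meter_in_inverse_cm == 1 / 100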
61 changes: 44 additions & 17 deletions edsnlp/utils/doc_to_text.py
@@ -1,5 +1,5 @@
from functools import lru_cache
from typing import Union
from typing import List, Tuple, Union

import spacy.attrs
from spacy.tokens import Doc, Span
@@ -11,7 +11,28 @@ def aggregate_tokens(
attr: str,
ignore_excluded: bool = False,
ignore_space_tokens: bool = False,
):
) -> Tuple[str, List[int], List[int], bytes]:
"""
Aggregate tokens strings, computed from their `attr` attribute, into a single
string, possibly ignoring excluded tokens (like pollution tokens) and/or space
tokens. This also returns the start and end offsets of each token in the
aggregated string, as well as a bytes array indicating which tokens were kept.
The reason for the bytes array is that it is faster to index, and allows reverse
indexing as well.

Parameters
----------
doc: Doc
attr: str
ignore_excluded: bool
ignore_space_tokens: bool

Returns
-------
Tuple[str, List[int], List[int], bytes]
The aggregated text, the start offsets, the end offsets, and the bytes array
indicating which tokens were kept.
"""
idx_to_strings = doc.vocab.strings
exclude_hash = idx_to_strings["EXCLUDED"]
space_hash = idx_to_strings["SPACE"]
@@ -28,6 +49,7 @@ def aggregate_tokens(
text_parts[i] = idx_to_strings[str_hash] + (" " if space else "")
begins = arr[:, 2].tolist()
ends = (arr[:, 2] + arr[:, 3]).tolist()
keep_list = [True] * len(arr)
else:
if hasattr(spacy.attrs, spacy_attr):
arr = doc.to_array(
@@ -40,11 +62,11 @@
tokens_space = arr[:, 0].tolist()
tokens_tag = arr[:, 1]
tokens_text = arr[:, 2].tolist()
else:
arr = doc.to_array([spacy.attrs.SPACY, spacy.attrs.TAG])
tokens_space = arr[:, 0].tolist()
tokens_tag = arr[:, 1]
tokens_text = [token._.get(spacy_attr) for token in doc]
# else:
# arr = doc.to_array([spacy.attrs.SPACY, spacy.attrs.TAG])
# tokens_space = arr[:, 0].tolist()
# tokens_tag = arr[:, 1]
# tokens_text = [token._.get(spacy_attr) for token in doc]

text_parts = [""] * len(arr)
begins = [0] * len(arr)
@@ -83,7 +105,7 @@ def aggregate_tokens(
text = "".join(text_parts)
if attr == "LOWER":
text = text.lower()
return text, begins, ends
return text, begins, ends, bytes(keep_list)
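The docstring above motivates returning the keep mask as bytes rather than a list: bytes.index and bytes.rindex give fast forward and backward search for the nearest kept token, which is exactly what the span trimming below relies on. A minimal sketch with made-up values (not taken from the commit):

keep = bytes([1, 0, 0, 1, 1, 0])     # 1 = token kept in the aggregated text
first_kept = keep.index(1, 1)        # first kept token at or after position 1 -> 3
last_kept = keep.rindex(1, None, 6)  # last kept token before position 6 -> 4
assert (first_kept, last_kept) == (3, 4)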


def get_text(
@@ -112,19 +134,24 @@ def get_text(
Extracted text.
"""
is_doc = isinstance(doclike, Doc)
text, starts, ends = aggregate_tokens(
text, starts, ends, keep = aggregate_tokens(
doclike if is_doc else doclike.doc,
attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
)
return (
text
if is_doc
else text[starts[doclike[0].i] : ends[doclike[-1].i]]
if len(doclike)
else ""
)
try:
return (
text[
starts[keep.index(1, doclike[0].i)] : ends[
keep.rindex(1, None, doclike[-1].i + 1)
]
]
if len(doclike)
else ""
)
except ValueError:
return ""


def get_char_offsets(
@@ -157,4 +184,4 @@ def get_char_offsets(
attr,
ignore_excluded=ignore_excluded,
ignore_space_tokens=ignore_space_tokens,
)[1:]
)[1:3]
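A note on the [1:3] slice above (a reading of the diff, not new behavior): aggregate_tokens now returns four values, so the slice keeps get_char_offsets' public return value as the (begins, ends) pair and drops the new keep mask. Roughly, reusing the doc built in the sketch above:

from edsnlp.utils.doc_to_text import aggregate_tokens, get_char_offsets

text, begins, ends, keep = aggregate_tokens(
    doc, "NORM", ignore_excluded=True, ignore_space_tokens=True
)
assert get_char_offsets(
    doc, "NORM", ignore_excluded=True, ignore_space_tokens=True
) == (begins, ends)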
40 changes: 1 addition & 39 deletions tests/conftest.py
@@ -3,6 +3,7 @@
import pandas as pd
import pytest
import spacy
from helpers import make_nlp
from pytest import fixture

import edsnlp
@@ -13,45 +14,6 @@ def lang(request):
return request.param


def make_nlp(lang):
if lang == "eds":
model = spacy.blank("eds")
else:
model = edsnlp.blank("fr")

model.add_pipe("eds.normalizer")

model.add_pipe("eds.sentences")
model.add_pipe("eds.sections")

model.add_pipe(
"eds.matcher",
config=dict(
terms=dict(patient="patient"),
attr="NORM",
ignore_excluded=True,
),
)
model.add_pipe(
"eds.matcher",
name="matcher2",
config=dict(
regex=dict(anomalie=r"anomalie"),
),
)

model.add_pipe("eds.hypothesis")
model.add_pipe("eds.negation")
model.add_pipe("eds.family")
model.add_pipe("eds.history")
model.add_pipe("eds.reported_speech")

model.add_pipe("eds.dates")
model.add_pipe("eds.measurements")

return model


@fixture(scope="session")
def nlp(lang):
return make_nlp(lang)
42 changes: 42 additions & 0 deletions tests/helpers.py
@@ -0,0 +1,42 @@
import spacy

import edsnlp


def make_nlp(lang):
if lang == "eds":
model = spacy.blank("eds")
else:
model = edsnlp.blank("fr")

model.add_pipe("eds.normalizer")

model.add_pipe("eds.sentences")
model.add_pipe("eds.sections")

model.add_pipe(
"eds.matcher",
config=dict(
terms=dict(patient="patient"),
attr="NORM",
ignore_excluded=True,
),
)
model.add_pipe(
"eds.matcher",
name="matcher2",
config=dict(
regex=dict(anomalie=r"anomalie"),
),
)

model.add_pipe("eds.hypothesis")
model.add_pipe("eds.negation")
model.add_pipe("eds.family")
model.add_pipe("eds.history")
model.add_pipe("eds.reported_speech")

model.add_pipe("eds.dates")
model.add_pipe("eds.measurements")

return model
22 changes: 15 additions & 7 deletions tests/matchers/test_regex.py
@@ -1,11 +1,11 @@
import re

import pytest
from helpers import make_nlp
from pytest import mark

from edsnlp.matchers.regex import RegexMatcher, create_span
from edsnlp.matchers.utils import get_text
from tests.conftest import make_nlp


def test_regex(doc):
@@ -167,7 +167,7 @@ def test_norm_alignment(blank_nlp):
"pollution",
["==================", "======= ======= =======", "Nnnnnnnnnnnnn nnnnnn nnnnnnnn"],
)
def test_wrong_extraction(
def test_get_text(
blank_nlp,
leading_text: str,
leading_pollution: bool,
@@ -251,7 +251,7 @@ def test_regex_with_space(blank_nlp):


@pytest.fixture(scope="session")
def doc(lang):
def doc2(lang):
blank_nlp = make_nlp(lang)
blank_nlp.add_pipe("eds.pollution")
blank_nlp.add_pipe("eds.spaces")
@@ -272,14 +272,14 @@ def doc(lang):
@mark.parametrize("attr", ["TEXT", "NORM"])
@mark.parametrize("full_doc", [True, False])
def test_create_span(
doc,
doc2,
ignore_excluded: bool,
ignore_space_tokens: bool,
attr: str,
full_doc: bool,
):
sent = list(doc.sents)[1]
doclike = doc if full_doc else sent
sent = list(doc2.sents)[1]
doclike = doc2 if full_doc else sent

matched_text = get_text(
doclike,
@@ -295,7 +295,7 @@ def test_create_span(
or (ignore_space_tokens and t.tag_ == "SPACE")
)
]
filtered_original = doc[clean_tokens[0].i : clean_tokens[-1].i + 1].text
filtered_original = doc2[clean_tokens[0].i : clean_tokens[-1].i + 1].text
for pattern, result, alignment_mode in [
(r"4 / 3", "24 / 30", "expand"),
(r"4 / 3", None, "strict"),
@@ -353,3 +353,11 @@ def test_create_empty_span(blank_nlp):
ignore_space_tokens=True,
)
assert span.start == 5 and span.end == 5


def test_empty_get_text(blank_nlp):
blank_nlp.add_pipe("eds.pollution")
blank_nlp.add_pipe("eds.spaces")
doc = blank_nlp("==================================")
clean = get_text(doc, attr="NORM", ignore_excluded=True, ignore_space_tokens=True)
assert clean == ""
