Skip to content

Commit

Permalink
fix: add unified regex w/ float support and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
percevalw committed Jan 8, 2024
1 parent 03f9ca8 commit 021f0c0
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 10 deletions.
7 changes: 7 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# Changelog

## Unreleased

### Fixed

- Measurements now correctly match "0.X", "0.XX", ... numbers
- Typo in "celsius" measurement unit

## v0.10.2

### Changed
Expand Down
1 change: 1 addition & 0 deletions edsnlp/pipes/misc/measurements/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
measurements=list(patterns.common_measurements.keys()), # noqa: E501
units_config=patterns.units_config,
number_terms=patterns.number_terms,
number_regex=patterns.number_regex,
stopwords=patterns.stopwords,
unit_divisors=patterns.unit_divisors,
ignore_excluded=True,
Expand Down
12 changes: 2 additions & 10 deletions edsnlp/pipes/misc/measurements/measurements.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,7 @@ def __init__(
measurements: Union[str, List[Union[str, MsrConfig]], Dict[str, MsrConfig]] = list(patterns.common_measurements.keys()), # noqa: E501
units_config: Dict[str, UnitConfig] = patterns.units_config,
number_terms: Dict[str, List[str]] = patterns.number_terms,
number_regex: str = patterns.number_regex,
stopwords: List[str] = patterns.stopwords,
unit_divisors: List[str] = patterns.unit_divisors,
ignore_excluded: bool = True,
Expand Down Expand Up @@ -574,16 +575,7 @@ def __init__(
self.unitless_patterns[pattern_name] = {"name": name, **pattern}

# NUMBER PATTERNS
one_plus = "[1-9][0-9]*"
self.regex_matcher.add(
"number",
[
rf"(?<![0-9][.,]?){one_plus}([ ]\d{{3}})*[ ]+(?:[,.][ ]+\d+)?",
rf"(?<![0-9][.,]?){one_plus}([ ]\d{{3}})*(?:[,.]\d+)?",
rf"(?<![0-9][.,]?){one_plus}([ ]/[ ]|/){one_plus}",
r"(?<![0-9][.,]?)00?([,.]\d+)?",
],
)
self.regex_matcher.add("number",[number_regex])
self.number_label_hashes = {nlp.vocab.strings["number"]}
for number, terms in number_terms.items():
self.term_matcher.build_patterns(nlp, {number: terms})
Expand Down
19 changes: 19 additions & 0 deletions edsnlp/pipes/misc/measurements/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,25 @@
"1000": ["mille", "milles"],
}


# Verbose-mode ((?x)) pattern describing a single numeric literal.
#
# Structure, in order:
#   1. a negative lookbehind rejecting a start position that directly follows
#      a digit, or a digit plus one "." / "," separator;
#   2. an integer part: either a lone "0", or a non-zero leading digit followed
#      by more digits and optional space-separated groups of three digits;
#   3. an optional tail, one of: a decimal part whose separator is surrounded
#      by spaces, a decimal part attached directly to the separator, or a
#      fraction ("/" or " / ") followed by another non-zero-leading integer.
#
# Because the pattern is in verbose mode, literal spaces are written as "\ ".
#
# NOTE(review): the lookbehind is variable-width, which the standard-library
# `re` module does not accept -- presumably this is compiled by a matcher
# backed by the third-party `regex` package; confirm before reusing elsewhere.
number_regex = r"""(?x)
# no digit or floating point number prefix before
(?<![0-9][.,]?)
# integer part like 123 or 1 234
(?:
0
|[1-9][0-9]*(?:\ \d{3})*
)
(?:
# floating point surounded by spaces
\ +[,.]\ +\d+
# floating point w/o space
| [,.]\d+
# fractions
| (?:\ /\ |/)[1-9][0-9]*(?:\ \d{3})*
)?"""


units_config = {
# Lengths
"µm": {
Expand Down
18 changes: 18 additions & 0 deletions tests/pipelines/misc/test_measurements.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,21 @@ def test_merge_intersect(blank_nlp, matcher: MeasurementsMatcher):
assert len(doc.spans["measurements"]) == 2
assert [doc.ents[0].text, doc.ents[1].text] == ["2.0cm", "3cm"]
assert [doc.ents[0]._.value.cm, doc.ents[1]._.value.cm] == [2.0, 3]


def test_measurement_snippets(blank_nlp, matcher: MeasurementsMatcher):
    """Run the matcher on short snippets and compare the extracted values.

    Every case pairs an input text with the expected ``str(span._.value)`` of
    each span found in ``doc.spans["measurements"]``, in order.
    """
    cases = [
        ("0.50g", ["0.5 g"]),
        ("0.050g", ["0.05 g"]),
        ("1 m 50", ["1.5 m"]),
        ("1.50 m", ["1.5 m"]),
        ("1,50m", ["1.5 m"]),
        ("2.0cm x 3cm", ["2.0 cm", "3 cm"]),
        ("2 par 1mm", ["2 mm", "1 mm"]),
        ("8, 13 et 15dm", ["8 dm", "13 dm", "15 dm"]),
        ("1 / 50 kg", ["0.02 kg"]),
    ]

    for snippet, expected in cases:
        # Tokenize with the blank pipeline first, then apply the matcher.
        processed = matcher(blank_nlp(snippet))
        found = [str(span._.value) for span in processed.spans["measurements"]]

        assert found == expected

0 comments on commit 021f0c0

Please sign in to comment.