Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(maybe_balance_style_tags): search further for missing tag #239

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ Changes:
Fixes:
- Prefer the other full citation on overlap with nominative reporter
citations #237

- Update `maybe_balance_style_tags` to account for party names and intro words
inside the style tag #231

## Current

Expand Down
48 changes: 29 additions & 19 deletions eyecite/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,28 +254,28 @@ def is_valid_name(name: str) -> bool:


def maybe_balance_style_tags(
start: int, end: int, plain_text: str
start: int, end: int, plain_text: str, tolerance: int = 10
) -> tuple[int, int, str]:
"""Try to include style tags at the edge of the span marked as invalid
"""Try to include missing style tags in the proximity of the found span

In some HTML sources the citations are styled with tags like <i> or <em>
When the citation is found in a stripped-of-tags text, the span may
leave out the opening or closing tag. When this happens and we try to
annotate the HTML, it will render invalid HTML. This happens mostly with
IdCitation, ReferenceCitation, etc.

This function will try to find opening or closing tags immediately
preceding or following the citation span. If it finds them, it will
This function will try to find opening or closing tags preceding or
following the citation span within a `tolerance`. If it finds them, it will
return the new start, end and span. If not, it will return the old ones

:param start: the original start of the span
:param end: the original end of the span
:param plain_text: the text to annotate
:param tolerance: tolerate at most this amount of extra characters
:return: a tuple (new start, new end, new span text)
"""
span_text = plain_text[start:end]
style_tags = ["i", "em", "b"]
tolerance = 5 # tolerate at most this amount of whitespace

for tag in style_tags:
opening_tag = f"<{tag}>"
Expand All @@ -284,24 +284,34 @@ def maybe_balance_style_tags(
has_closing = closing_tag in span_text
if has_opening and not has_closing:
# look for closing tag after the end
extended_end = max(
extended_end = min(
end + len(closing_tag) + tolerance, len(plain_text)
)
if end_match := re.search(
rf"{re.escape(span_text)}\s*{re.escape(closing_tag)}",
plain_text[start:extended_end],
flags=re.MULTILINE,
):
end = start + end_match.end()

# Pick the first closing tag within tolerance
matches = list(
re.finditer(
re.escape(closing_tag),
plain_text[start:extended_end],
flags=re.MULTILINE,
)
)
if matches:
end = start + matches[0].end()

if not has_opening and has_closing:
# look for opening tag before the start
extended_start = min(start - len(opening_tag) - tolerance, 0)
if start_match := re.search(
rf"{re.escape(opening_tag)}\s*{re.escape(span_text)}",
plain_text[extended_start:end],
flags=re.MULTILINE,
):
start = extended_start + start_match.start()
extended_start = max(start - len(opening_tag) - tolerance, 0)

# Pick the last opening tag within tolerance
matches = list(
re.finditer(
re.escape(opening_tag),
plain_text[extended_start:end],
flags=re.MULTILINE,
)
)
if matches:
start = extended_start + matches[-1].start()

return start, end, plain_text[start:end]
35 changes: 30 additions & 5 deletions tests/test_AnnotateTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,36 @@ def lower_annotator(before, text, after):

def test_tag_balancing(self):
"""Test trickier tag balancing cases"""
string = "Something; In <em>Nobelman </em> at 332, 113 S.Ct. 2106 (2010); Something else"
span_text = "Nobelman </em> at 332, 113 S.Ct. 2106 (2010)"
start, end = re.search(re.escape(span_text), string).span()
_, _, balanced = maybe_balance_style_tags(start, end, string)
self.assertTrue(balanced.startswith("<em>"))
pairs = [
(
"Something; In <em>Nobelman </em> at 332, 113 S.Ct. 2106 (2010); Something else",
"Nobelman </em> at 332, 113 S.Ct. 2106 (2010)",
),
(
"it established in <i>State v. Wingler</i> something",
"Wingler</i>",
),
("something <em>See id.</em> at 642", "id.</em> at 642"),
("something <i>AT&T, supra</i> something", "supra</i>"),
]
for full_string, span_text in pairs:
start, end = re.search(re.escape(span_text), full_string).span()
_, _, balanced = maybe_balance_style_tags(start, end, full_string)
self.assertTrue(
re.search("^<(i|em)>", balanced),
f"{balanced} is not the expected output",
)

# test that we don't get style tags beyond the immediate neighboring
# one
counter_examples = [
("<em>see</em><em>id</em>", "id</em>", "<em>id</em>"),
("<em>id.</em>.<em>See</em>", "<em>id.", "<em>id.</em>"),
]
for full_string, span, expected_balanced in counter_examples:
start, end = re.search(re.escape(span), full_string).span()
_, _, balanced = maybe_balance_style_tags(start, end, full_string)
self.assertEqual(balanced, expected_balanced)

def test_long_diff(self):
"""Does diffing work across a long text with many changes?"""
Expand Down
Loading