freelawproject · grossir · Mar 7, 2025
diff --git a/CHANGES.md b/CHANGES.md
@@ -13,7 +13,8 @@ Changes:
 Fixes:
 - Prefer the other full citation on overlap with nominative reporter 
   citations #237
-
+- Update `maybe_balance_style_tags` to account for party names and intro words
+  inside the style tag #231
 
 ## Current
 

diff --git a/eyecite/utils.py b/eyecite/utils.py
@@ -254,28 +254,28 @@ def is_valid_name(name: str) -> bool:
 
 
 def maybe_balance_style_tags(
-    start: int, end: int, plain_text: str
+    start: int, end: int, plain_text: str, tolerance: int = 10
 ) -> tuple[int, int, str]:
-    """Try to include style tags at the edge of the span marked as invalid
+    """Try to include missing style tags in the proximity of the found span
 
     In some HTML sources the citations are styled with tags like <i> or <em>
     When the citation is found in a stripped-of-tags text, the span may
     leave out the opening or closing tag. When this happens and we try to
     annotate the HTML, it will render invalid HTML. This happens mostly with
     IdCitation, ReferenceCitation, etc.
 
-    This function will try to find opening or closing tags inmediately
-    preceding or following the citation span. If it finds them, it will
+    This function will try to find opening or closing tags preceding or
+    following the citation span within a `tolerance`. If it finds them, it will
     return the new start, end and span. If not, it will return the old ones
 
     :param start: the original start of the span
     :param end: the origina end of the span
     :param plain_text: the text to annotate
+    :param tolerance: max number of characters between the span and the tag
     :return: a tuple (new start, new end, new span text)
     """
     span_text = plain_text[start:end]
     style_tags = ["i", "em", "b"]
-    tolerance = 5  # tolerate at most this amount of whitespace
 
     for tag in style_tags:
         opening_tag = f"<{tag}>"
@@ -284,24 +284,34 @@ def maybe_balance_style_tags(
         has_closing = closing_tag in span_text
         if has_opening and not has_closing:
             # look for closing tag after the end
-            extended_end = max(
+            extended_end = min(
                 end + len(closing_tag) + tolerance, len(plain_text)
             )
-            if end_match := re.search(
-                rf"{re.escape(span_text)}\s*{re.escape(closing_tag)}",
-                plain_text[start:extended_end],
-                flags=re.MULTILINE,
-            ):
-                end = start + end_match.end()
+
+            # Pick the first closing tag within tolerance
+            matches = list(
+                re.finditer(
+                    re.escape(closing_tag),
+                    plain_text[start:extended_end],
+                    flags=re.MULTILINE,
+                )
+            )
+            if matches:
+                end = start + matches[0].end()
 
         if not has_opening and has_closing:
             # look for opening tag before the start
-            extended_start = min(start - len(opening_tag) - tolerance, 0)
-            if start_match := re.search(
-                rf"{re.escape(opening_tag)}\s*{re.escape(span_text)}",
-                plain_text[extended_start:end],
-                flags=re.MULTILINE,
-            ):
-                start = extended_start + start_match.start()
+            extended_start = max(start - len(opening_tag) - tolerance, 0)
+
+            # Pick the last opening tag within tolerance
+            matches = list(
+                re.finditer(
+                    re.escape(opening_tag),
+                    plain_text[extended_start:end],
+                    flags=re.MULTILINE,
+                )
+            )
+            if matches:
+                start = extended_start + matches[-1].start()
 
     return start, end, plain_text[start:end]
diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py
@@ -234,11 +234,36 @@ def lower_annotator(before, text, after):
 
     def test_tag_balancing(self):
         """Test trickier tag balancing cases"""
-        string = "Something; In <em>Nobelman </em> at 332, 113 S.Ct. 2106 (2010); Something else"
-        span_text = "Nobelman </em> at 332, 113 S.Ct. 2106 (2010)"
-        start, end = re.search(re.escape(span_text), string).span()
-        _, _, balanced = maybe_balance_style_tags(start, end, string)
-        self.assertTrue(balanced.startswith("<em>"))
+        pairs = [
+            (
+                "Something; In <em>Nobelman </em> at 332, 113 S.Ct. 2106 (2010); Something else",
+                "Nobelman </em> at 332, 113 S.Ct. 2106 (2010)",
+            ),
+            (
+                "it established in  <i>State v. Wingler</i> something",
+                "Wingler</i>",
+            ),
+            ("something <em>See id.</em> at 642", "id.</em> at 642"),
+            ("something <i>AT&T, supra</i> something", "supra</i>"),
+        ]
+        for full_string, span_text in pairs:
+            start, end = re.search(re.escape(span_text), full_string).span()
+            _, _, balanced = maybe_balance_style_tags(start, end, full_string)
+            self.assertTrue(
+                re.search("^<(i|em)>", balanced),
+                f"{balanced} is not the expected output",
+            )
+
+        # test that we don't get style tags beyond the immediate neighboring
+        # one
+        counter_examples = [
+            ("<em>see</em><em>id</em>", "id</em>", "<em>id</em>"),
+            ("<em>id.</em>.<em>See</em>", "<em>id.", "<em>id.</em>"),
+        ]
+        for full_string, span, expected_balanced in counter_examples:
+            start, end = re.search(re.escape(span), full_string).span()
+            _, _, balanced = maybe_balance_style_tags(start, end, full_string)
+            self.assertEqual(balanced, expected_balanced)
 
     def test_long_diff(self):
         """Does diffing work across a long text with many changes?"""