Alternative maybe is text (#717)

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Co-authored-by: James Braza <[email protected]>
Future-House · Dec 2, 2024 · 0130233 · 0130233
1 parent 2212757
commit 0130233
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 3 deletions.
diff --git a/paperqa/docs.py b/paperqa/docs.py
@@ -402,7 +402,8 @@ async def aadd(  # noqa: PLR0912
             or len(texts[0].text) < 10  # noqa: PLR2004
             or (
                 not parse_config.disable_doc_valid_check
-                and not maybe_is_text(texts[0].text)
+                # Use the first few text chunks to avoid potential issues with title page parsing in the first chunk
+                and not maybe_is_text("".join(text.text for text in texts[:5]))
             )
         ):
             raise ValueError(

diff --git a/paperqa/utils.py b/paperqa/utils.py
@@ -52,12 +52,19 @@ def name_in_text(name: str, text: str) -> bool:
 
 
 def maybe_is_text(s: str, thresh: float = 2.5) -> bool:
+    """
+    Calculate the entropy of the string to discard files with excessively repeated symbols.
+
+    PDF parsing sometimes represents horizontal distances between words on title pages
+    and in tables with spaces, which should therefore not be included in this calculation.
+    """
     if not s:
         return False
-    # Calculate the entropy of the string
+
     entropy = 0.0
+    s_wo_spaces = s.replace(" ", "")
     for c in string.printable:
-        p = s.count(c) / len(s)
+        p = s_wo_spaces.count(c) / len(s_wo_spaces)
         if p > 0:
             entropy += -p * math.log2(p)
 

diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -176,6 +176,9 @@ def test_maybe_is_text() -> None:
     bad_text = r.text.encode("latin1", "ignore").decode("utf-16", "ignore")
     assert not maybe_is_text(bad_text)
 
+    # account for possible spaces in the text due to tables or title pages
+    assert maybe_is_text("entry1                    entry2                    entry3")
+
 
 def test_name_in_text() -> None:
     name1 = "FooBar2022"