Skip to content

Commit

Permalink
Alternative maybe is text (#717)
Browse files Browse the repository at this point in the history
Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Co-authored-by: James Braza <[email protected]>
  • Loading branch information
3 people authored Dec 2, 2024
1 parent 2212757 commit 0130233
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 3 deletions.
3 changes: 2 additions & 1 deletion paperqa/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,8 @@ async def aadd( # noqa: PLR0912
or len(texts[0].text) < 10 # noqa: PLR2004
or (
not parse_config.disable_doc_valid_check
and not maybe_is_text(texts[0].text)
# Use the first few text chunks to avoid potential issues with title page parsing in the first chunk
and not maybe_is_text("".join(text.text for text in texts[:5]))
)
):
raise ValueError(
Expand Down
11 changes: 9 additions & 2 deletions paperqa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,19 @@ def name_in_text(name: str, text: str) -> bool:


def maybe_is_text(s: str, thresh: float = 2.5) -> bool:
"""
Calculate the entropy of the string to discard files with excessively repeated symbols.
PDF parsing sometimes represents horizontal distances between words on title pages
and in tables with spaces, which should therefore not be included in this calculation.
"""
if not s:
return False
# Calculate the entropy of the string

entropy = 0.0
s_wo_spaces = s.replace(" ", "")
for c in string.printable:
p = s.count(c) / len(s)
p = s_wo_spaces.count(c) / len(s_wo_spaces)
if p > 0:
entropy += -p * math.log2(p)

Expand Down
3 changes: 3 additions & 0 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,9 @@ def test_maybe_is_text() -> None:
bad_text = r.text.encode("latin1", "ignore").decode("utf-16", "ignore")
assert not maybe_is_text(bad_text)

# account for possible spaces in the text due to tables or title pages
assert maybe_is_text("entry1 entry2 entry3")


def test_name_in_text() -> None:
name1 = "FooBar2022"
Expand Down

0 comments on commit 0130233

Please sign in to comment.