Skip to content

Commit

Permalink
[doc2x] Fix bug where PDFs with too many pages could not be handled
Browse files Browse the repository at this point in the history
  • Loading branch information
Menghuan1918 committed Oct 25, 2024
1 parent 167a1dc commit 748a3d7
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "pdfdeal"
version = "0.4.4"
version = "0.4.5"
authors = [{ name = "Menghuan1918", email = "[email protected]" }]
description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)."
readme = "README.md"
Expand Down
7 changes: 6 additions & 1 deletion src/pdfdeal/doc2x.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,16 @@ async def pdf2file_back(
async def process_file(index, pdf, name):
try:
page_count = get_pdf_page_count(pdf)
except RequestError as e:
results[index] = ("", str(e), False)
logger.warning(f"Failed to get page count for {pdf}: {str(e)}")
return
except Exception as e:
logger.warning(f"Failed to get page count for {pdf}: {str(e)}")
page_count = self.max_pages - 1 #! Assume the worst case
if page_count > self.max_pages:
logger.warning(f"File {pdf} has too many pages, skipping.")
raise ValueError(f"File {pdf} has too many pages.")
results[index] = ("", "File has too many pages", False)

nonlocal total_pages, last_request_time

Expand Down Expand Up @@ -312,6 +316,7 @@ async def convert_file(index, name):
if fail["error"] != "":
print("====================================")
print(f"Failed to convert {fail['path']}: {fail['error']}")
print("====================================")
logger.info(
f"Successfully converted {sum(1 for file in success_files if file)} file(s)."
)
Expand Down

0 comments on commit 748a3d7

Please sign in to comment.