[doc2x] Fix bug where PDFs with too many pages could not be handled

NoEdgeAI · Oct 25, 2024 · 748a3d7 · 748a3d7
1 parent 167a1dc
commit 748a3d7
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 2 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pdfdeal"
-version = "0.4.4"
+version = "0.4.5"
 authors = [{ name = "Menghuan1918", email = "[email protected]" }]
 description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)."
 readme = "README.md"

diff --git a/src/pdfdeal/doc2x.py b/src/pdfdeal/doc2x.py
@@ -181,12 +181,16 @@ async def pdf2file_back(
         async def process_file(index, pdf, name):
             try:
                 page_count = get_pdf_page_count(pdf)
+            except RequestError as e:
+                results[index] = ("", str(e), False)
+                logger.warning(f"Failed to get page count for {pdf}: {str(e)}")
+                return
             except Exception as e:
                 logger.warning(f"Failed to get page count for {pdf}: {str(e)}")
                 page_count = self.max_pages - 1  #! Assume the worst case
             if page_count > self.max_pages:
                 logger.warning(f"File {pdf} has too many pages, skipping.")
-                raise ValueError(f"File {pdf} has too many pages.")
+                results[index] = ("", "File has too many pages", False)
 
             nonlocal total_pages, last_request_time
 
@@ -312,6 +316,7 @@ async def convert_file(index, name):
                     if fail["error"] != "":
                         print("====================================")
                         print(f"Failed to convert {fail['path']}: {fail['error']}")
+                        print("====================================")
         logger.info(
             f"Successfully converted {sum(1 for file in success_files if file)} file(s)."
         )