Unstructured-IO · mackurzawa · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024
diff --git a/README.md b/README.md
@@ -373,6 +373,7 @@ As mentioned above, processing a pdf using `hi_res` is currently a slow operatio
 * `UNSTRUCTURED_PARALLEL_MODE_THREADS` - the number of threads making requests at once, default is `3`.
 * `UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE` - the number of pages to be processed in one request, default is `1`.
 * `UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS` - the number of retry attempts on a retryable error, default is `2`. (i.e. 3 attempts are made in total)
+* `UNSTRUCTURED_PDF_HI_RES_MAX_PAGES` - the maximum number of pages a PDF file can have when processed with the `hi_res` strategy; files with more pages are rejected, default is `300`.
 
 Due to the overhead associated with file splitting, parallel processing mode is only recommended for the `hi_res` strategy. Additionally users of the official [Python client](https://github.com/Unstructured-IO/unstructured-python-client?tab=readme-ov-file#splitting-pdf-by-pages) can enable client-side splitting by setting `split_pdf_page=True`.
 

diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -35,6 +35,7 @@
 
 from prepline_general.api.models.form_params import GeneralFormParams
 from prepline_general.api.filetypes import get_validated_mimetype
+from unstructured.errors import PageCountExceededError
 from unstructured.documents.elements import Element
 from unstructured.partition.auto import partition
 from unstructured.staging.base import (
@@ -328,6 +329,7 @@ def pipeline_api(
 
     if file_content_type == "application/pdf":
         _check_pdf(file)
+    pdf_hi_res_max_pages = int(os.environ.get("UNSTRUCTURED_PDF_HI_RES_MAX_PAGES", 300))
 
     hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
     strategy = _validate_strategy(strategy)
@@ -373,6 +375,7 @@ def pipeline_api(
                         "extract_image_block_types": extract_image_block_types,
                         "extract_image_block_to_payload": extract_image_block_to_payload,
                         "unique_element_ids": unique_element_ids,
+                        "pdf_hi_res_max_pages": pdf_hi_res_max_pages,
                     },
                     default=str,
                 )
@@ -403,6 +406,7 @@ def pipeline_api(
             "extract_image_block_to_payload": extract_image_block_to_payload,
             "unique_element_ids": unique_element_ids,
             "starting_page_number": starting_page_number,
+            "pdf_hi_res_max_pages": pdf_hi_res_max_pages,
         }
 
         if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
@@ -437,6 +441,13 @@ def pipeline_api(
             status_code=500,
             detail=str(e),
         )
+    except PageCountExceededError as e:
+        raise HTTPException(
+            status_code=422,
+            detail=f"{e} Check the split_pdf_page functionality of unstructured_client to send the file "
+            f"in smaller chunks.",
+        )
+
     except ValueError as e:
         if "Invalid file" in e.args[0]:
             raise HTTPException(

diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
@@ -1158,3 +1158,25 @@ def test__set_pdf_infer_table_structure(
         )
         is expected
     )
+
+
+@pytest.mark.parametrize(
+    ("strategy", "test_file", "pdf_hi_res_max_pages", "expect_code"),
+    [
+        ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
+        ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
+        ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 200),
+    ],
+)
+def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, pdf_hi_res_max_pages, expect_code):
+    """Check the status code returned for each strategy under a given hi_res page limit."""
+    monkeypatch.setenv("UNSTRUCTURED_PDF_HI_RES_MAX_PAGES", pdf_hi_res_max_pages)
+    # Open the sample PDF in a context manager so its handle is closed after the request.
+    with open(test_file, "rb") as pdf:
+        response = TestClient(app).post(
+            MAIN_API_ROUTE, files=[("files", (str(test_file), pdf))], data={"strategy": strategy}
+        )
+    assert response.status_code == expect_code