From 4bb2a3de1d8181f7e546fcb74d9e3d9c1a930e42 Mon Sep 17 00:00:00 2001
From: mackurzawa <mac.kurzawa@gmail.com>
Date: Tue, 30 Jul 2024 11:12:45 +0200
Subject: [PATCH 1/5] feat: add UNSTRUCTURED_MAX_PDF_PAGES environment
 variable and handle unstructured's PageCountExceededError

---
 prepline_general/api/general.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
index eb2868624..4c6a17540 100644
--- a/prepline_general/api/general.py
+++ b/prepline_general/api/general.py
@@ -35,6 +35,7 @@
 
 from prepline_general.api.models.form_params import GeneralFormParams
 from prepline_general.api.filetypes import get_validated_mimetype
+from unstructured.errors import PageCountExceededError
 from unstructured.documents.elements import Element
 from unstructured.partition.auto import partition
 from unstructured.staging.base import (
@@ -328,6 +329,7 @@ def pipeline_api(
 
     if file_content_type == "application/pdf":
         _check_pdf(file)
+    max_pages = int(os.environ.get('UNSTRUCTURED_MAX_PDF_PAGES', 300))
 
     hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
     strategy = _validate_strategy(strategy)
@@ -373,6 +375,7 @@ def pipeline_api(
                         "extract_image_block_types": extract_image_block_types,
                         "extract_image_block_to_payload": extract_image_block_to_payload,
                         "unique_element_ids": unique_element_ids,
+                        "max_pages": max_pages,
                     },
                     default=str,
                 )
@@ -403,6 +406,7 @@ def pipeline_api(
             "extract_image_block_to_payload": extract_image_block_to_payload,
             "unique_element_ids": unique_element_ids,
             "starting_page_number": starting_page_number,
+            "max_pages": max_pages,
         }
 
         if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
@@ -437,6 +441,13 @@ def pipeline_api(
             status_code=500,
             detail=str(e),
         )
+    except PageCountExceededError as e:
+        raise HTTPException(
+            status_code=422,
+            detail=f"{e} Check the split_pdf_page functionality of unstructured_client to send the file "
+                   f"in smaller chunks."
+        )
+
     except ValueError as e:
         if "Invalid file" in e.args[0]:
             raise HTTPException(

From 695dcd77682301b25bf583b3063cd9dda1c6eab8 Mon Sep 17 00:00:00 2001
From: mackurzawa <mac.kurzawa@gmail.com>
Date: Tue, 30 Jul 2024 11:13:56 +0200
Subject: [PATCH 2/5] test: UNSTRUCTURED_MAX_PDF_PAGES, handle
 PageCountExceededError

---
 test_general/api/test_app.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
index f2fa3211b..2daacaec6 100644
--- a/test_general/api/test_app.py
+++ b/test_general/api/test_app.py
@@ -1158,3 +1158,24 @@ def test__set_pdf_infer_table_structure(
         )
         is expected
     )
+
+@pytest.mark.parametrize(
+    ("strategy", "test_file", "max_pages", "expect_code"),
+    [
+        ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
+        ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
+        ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 200),
+    ],
+)
+def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, max_pages, expect_code):
+    monkeypatch.setenv("UNSTRUCTURED_MAX_PDF_PAGES", max_pages)
+    client = TestClient(app)
+    response = client.post(
+        MAIN_API_ROUTE,
+        files=[("files", (str(test_file), open(test_file, "rb")))],
+        data={"strategy": strategy}
+    )
+    assert response.status_code == expect_code

From df1d8971397f00037b8766e0543bf399f22cbbba Mon Sep 17 00:00:00 2001
From: mackurzawa <mac.kurzawa@gmail.com>
Date: Tue, 30 Jul 2024 11:15:03 +0200
Subject: [PATCH 3/5] docs: update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 21c876bbf..56f5422a5 100644
--- a/README.md
+++ b/README.md
@@ -373,6 +373,7 @@ As mentioned above, processing a pdf using `hi_res` is currently a slow operatio
 * `UNSTRUCTURED_PARALLEL_MODE_THREADS` - the number of threads making requests at once, default is `3`.
 * `UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE` - the number of pages to be processed in one request, default is `1`.
 * `UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS` - the number of retry attempts on a retryable error, default is `2`. (i.e. 3 attempts are made in total)
+* `UNSTRUCTURED_MAX_PDF_PAGES` - the maximum number of pages in pdf file that will not be rejected in `hi_res` strategy, default is `300`.
 
 Due to the overhead associated with file splitting, parallel processing mode is only recommended for the `hi_res` strategy. Additionally users of the official [Python client](https://github.com/Unstructured-IO/unstructured-python-client?tab=readme-ov-file#splitting-pdf-by-pages) can enable client-side splitting by setting `split_pdf_page=True`.
 

From 64fb572a95b9858b21d53c60d23c50d575b367da Mon Sep 17 00:00:00 2001
From: mackurzawa <mac.kurzawa@gmail.com>
Date: Tue, 30 Jul 2024 12:15:18 +0200
Subject: [PATCH 4/5] fix: adjust code to meet linter requirements

---
 prepline_general/api/general.py | 4 ++--
 test_general/api/test_app.py    | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
index 4c6a17540..a0ff66cdf 100644
--- a/prepline_general/api/general.py
+++ b/prepline_general/api/general.py
@@ -329,7 +329,7 @@ def pipeline_api(
 
     if file_content_type == "application/pdf":
         _check_pdf(file)
-    max_pages = int(os.environ.get('UNSTRUCTURED_MAX_PDF_PAGES', 300))
+    max_pages = int(os.environ.get("UNSTRUCTURED_MAX_PDF_PAGES", 300))
 
     hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
     strategy = _validate_strategy(strategy)
@@ -445,7 +445,7 @@ def pipeline_api(
         raise HTTPException(
             status_code=422,
             detail=f"{e} Check the split_pdf_page functionality of unstructured_client to send the file "
-                   f"in smaller chunks."
+            f"in smaller chunks.",
         )
 
     except ValueError as e:
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
index 2daacaec6..ebc264449 100644
--- a/test_general/api/test_app.py
+++ b/test_general/api/test_app.py
@@ -1159,6 +1159,7 @@ def test__set_pdf_infer_table_structure(
         is expected
     )
 
+
 @pytest.mark.parametrize(
     ("strategy", "test_file", "max_pages", "expect_code"),
     [
@@ -1176,6 +1177,6 @@ def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, max_pages, expect
     response = client.post(
         MAIN_API_ROUTE,
         files=[("files", (str(test_file), open(test_file, "rb")))],
-        data={"strategy": strategy}
+        data={"strategy": strategy},
     )
     assert response.status_code == expect_code

From 5429ef3bf9cc537876fdc00c4bb3eb24a5863b00 Mon Sep 17 00:00:00 2001
From: mackurzawa <mac.kurzawa@gmail.com>
Date: Tue, 30 Jul 2024 14:48:37 +0200
Subject: [PATCH 5/5] refactor: rename variable max_pages to
 pdf_hi_res_max_pages for clarity

---
 README.md                       | 2 +-
 prepline_general/api/general.py | 6 +++---
 test_general/api/test_app.py    | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 56f5422a5..3e1bc2c4e 100644
--- a/README.md
+++ b/README.md
@@ -373,7 +373,7 @@ As mentioned above, processing a pdf using `hi_res` is currently a slow operatio
 * `UNSTRUCTURED_PARALLEL_MODE_THREADS` - the number of threads making requests at once, default is `3`.
 * `UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE` - the number of pages to be processed in one request, default is `1`.
 * `UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS` - the number of retry attempts on a retryable error, default is `2`. (i.e. 3 attempts are made in total)
-* `UNSTRUCTURED_MAX_PDF_PAGES` - the maximum number of pages in pdf file that will not be rejected in `hi_res` strategy, default is `300`.
+* `UNSTRUCTURED_PDF_HI_RES_MAX_PAGES` - the maximum number of pages a PDF file may contain when processed with the `hi_res` strategy (larger files are rejected), default is `300`.
 
 Due to the overhead associated with file splitting, parallel processing mode is only recommended for the `hi_res` strategy. Additionally users of the official [Python client](https://github.com/Unstructured-IO/unstructured-python-client?tab=readme-ov-file#splitting-pdf-by-pages) can enable client-side splitting by setting `split_pdf_page=True`.
 
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
index a0ff66cdf..b54a42e14 100644
--- a/prepline_general/api/general.py
+++ b/prepline_general/api/general.py
@@ -329,7 +329,7 @@ def pipeline_api(
 
     if file_content_type == "application/pdf":
         _check_pdf(file)
-    max_pages = int(os.environ.get("UNSTRUCTURED_MAX_PDF_PAGES", 300))
+    pdf_hi_res_max_pages = int(os.environ.get("UNSTRUCTURED_PDF_HI_RES_MAX_PAGES", 300))
 
     hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
     strategy = _validate_strategy(strategy)
@@ -375,7 +375,7 @@ def pipeline_api(
                         "extract_image_block_types": extract_image_block_types,
                         "extract_image_block_to_payload": extract_image_block_to_payload,
                         "unique_element_ids": unique_element_ids,
-                        "max_pages": max_pages,
+                        "pdf_hi_res_max_pages": pdf_hi_res_max_pages,
                     },
                     default=str,
                 )
@@ -406,7 +406,7 @@ def pipeline_api(
             "extract_image_block_to_payload": extract_image_block_to_payload,
             "unique_element_ids": unique_element_ids,
             "starting_page_number": starting_page_number,
-            "max_pages": max_pages,
+            "pdf_hi_res_max_pages": pdf_hi_res_max_pages,
         }
 
         if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
index ebc264449..e33f6265e 100644
--- a/test_general/api/test_app.py
+++ b/test_general/api/test_app.py
@@ -1161,7 +1161,7 @@ def test__set_pdf_infer_table_structure(
 
 
 @pytest.mark.parametrize(
-    ("strategy", "test_file", "max_pages", "expect_code"),
+    ("strategy", "test_file", "pdf_hi_res_max_pages", "expect_code"),
     [
         ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
         ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
@@ -1171,8 +1171,8 @@ def test__set_pdf_infer_table_structure(
         ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 200),
     ],
 )
-def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, max_pages, expect_code):
-    monkeypatch.setenv("UNSTRUCTURED_MAX_PDF_PAGES", max_pages)
+def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, pdf_hi_res_max_pages, expect_code):
+    monkeypatch.setenv("UNSTRUCTURED_PDF_HI_RES_MAX_PAGES", pdf_hi_res_max_pages)
     client = TestClient(app)
     response = client.post(
         MAIN_API_ROUTE,