From 4bb2a3de1d8181f7e546fcb74d9e3d9c1a930e42 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Tue, 30 Jul 2024 11:12:45 +0200 Subject: [PATCH 1/5] feat: add UNSTRUCTURED_MAX_PDF_PAGES environment variable. Add handling PageCountExceededError unstructured error --- prepline_general/api/general.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index eb2868624..4c6a17540 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -35,6 +35,7 @@ from prepline_general.api.models.form_params import GeneralFormParams from prepline_general.api.filetypes import get_validated_mimetype +from unstructured.errors import PageCountExceededError from unstructured.documents.elements import Element from unstructured.partition.auto import partition from unstructured.staging.base import ( @@ -328,6 +329,7 @@ def pipeline_api( if file_content_type == "application/pdf": _check_pdf(file) + max_pages = int(os.environ.get('UNSTRUCTURED_MAX_PDF_PAGES', 300)) hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates) strategy = _validate_strategy(strategy) @@ -373,6 +375,7 @@ def pipeline_api( "extract_image_block_types": extract_image_block_types, "extract_image_block_to_payload": extract_image_block_to_payload, "unique_element_ids": unique_element_ids, + "max_pages": max_pages, }, default=str, ) @@ -403,6 +406,7 @@ def pipeline_api( "extract_image_block_to_payload": extract_image_block_to_payload, "unique_element_ids": unique_element_ids, "starting_page_number": starting_page_number, + "max_pages": max_pages, } if file_content_type == "application/pdf" and pdf_parallel_mode_enabled: @@ -437,6 +441,13 @@ def pipeline_api( status_code=500, detail=str(e), ) + except PageCountExceededError as e: + raise HTTPException( + status_code=422, + detail=f"{e} Check the split_pdf_page functionality of unstructured_client to send the file " + f"in smaller chunks." 
+ ) + except ValueError as e: if "Invalid file" in e.args[0]: raise HTTPException( From 695dcd77682301b25bf583b3063cd9dda1c6eab8 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Tue, 30 Jul 2024 11:13:56 +0200 Subject: [PATCH 2/5] test: UNSTRUCTURED_MAX_PDF_PAGES, handle PageCountExceededError --- test_general/api/test_app.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index f2fa3211b..2daacaec6 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -1158,3 +1158,24 @@ def test__set_pdf_infer_table_structure( ) is expected ) + +@pytest.mark.parametrize( + ("strategy", "test_file", "max_pages", "expect_code"), + [ + ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200), + ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422), + ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200), + ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422), + ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200), + ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 200), + ], +) +def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, max_pages, expect_code): + monkeypatch.setenv("UNSTRUCTURED_MAX_PDF_PAGES", max_pages) + client = TestClient(app) + response = client.post( + MAIN_API_ROUTE, + files=[("files", (str(test_file), open(test_file, "rb")))], + data={"strategy": strategy} + ) + assert response.status_code == expect_code From df1d8971397f00037b8766e0543bf399f22cbbba Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Tue, 30 Jul 2024 11:15:03 +0200 Subject: [PATCH 3/5] docs: update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 21c876bbf..56f5422a5 100644 --- a/README.md +++ b/README.md @@ -373,6 +373,7 @@ As mentioned above, processing a pdf using `hi_res` is currently a slow 
operatio * `UNSTRUCTURED_PARALLEL_MODE_THREADS` - the number of threads making requests at once, default is `3`. * `UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE` - the number of pages to be processed in one request, default is `1`. * `UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS` - the number of retry attempts on a retryable error, default is `2`. (i.e. 3 attempts are made in total) +* `UNSTRUCTURED_MAX_PDF_PAGES` - the maximum number of pages in pdf file that will not be rejected in `hi_res` strategy, default is `300`. Due to the overhead associated with file splitting, parallel processing mode is only recommended for the `hi_res` strategy. Additionally users of the official [Python client](https://github.com/Unstructured-IO/unstructured-python-client?tab=readme-ov-file#splitting-pdf-by-pages) can enable client-side splitting by setting `split_pdf_page=True`. From 64fb572a95b9858b21d53c60d23c50d575b367da Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Tue, 30 Jul 2024 12:15:18 +0200 Subject: [PATCH 4/5] fix: adjust code to meet linter requirements --- prepline_general/api/general.py | 4 ++-- test_general/api/test_app.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 4c6a17540..a0ff66cdf 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -329,7 +329,7 @@ def pipeline_api( if file_content_type == "application/pdf": _check_pdf(file) - max_pages = int(os.environ.get('UNSTRUCTURED_MAX_PDF_PAGES', 300)) + max_pages = int(os.environ.get("UNSTRUCTURED_MAX_PDF_PAGES", 300)) hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates) strategy = _validate_strategy(strategy) @@ -445,7 +445,7 @@ def pipeline_api( raise HTTPException( status_code=422, detail=f"{e} Check the split_pdf_page functionality of unstructured_client to send the file " - f"in smaller chunks." 
+ f"in smaller chunks.", ) except ValueError as e: diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index 2daacaec6..ebc264449 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -1159,6 +1159,7 @@ def test__set_pdf_infer_table_structure( is expected ) + @pytest.mark.parametrize( ("strategy", "test_file", "max_pages", "expect_code"), [ @@ -1176,6 +1177,6 @@ def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, max_pages, expect response = client.post( MAIN_API_ROUTE, files=[("files", (str(test_file), open(test_file, "rb")))], - data={"strategy": strategy} + data={"strategy": strategy}, ) assert response.status_code == expect_code From 5429ef3bf9cc537876fdc00c4bb3eb24a5863b00 Mon Sep 17 00:00:00 2001 From: mackurzawa Date: Tue, 30 Jul 2024 14:48:37 +0200 Subject: [PATCH 5/5] refactor: rename variable max_pages to pdf_hi_res_max_pages for clarity --- README.md | 2 +- prepline_general/api/general.py | 6 +++--- test_general/api/test_app.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 56f5422a5..3e1bc2c4e 100644 --- a/README.md +++ b/README.md @@ -373,7 +373,7 @@ As mentioned above, processing a pdf using `hi_res` is currently a slow operatio * `UNSTRUCTURED_PARALLEL_MODE_THREADS` - the number of threads making requests at once, default is `3`. * `UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE` - the number of pages to be processed in one request, default is `1`. * `UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS` - the number of retry attempts on a retryable error, default is `2`. (i.e. 3 attempts are made in total) -* `UNSTRUCTURED_MAX_PDF_PAGES` - the maximum number of pages in pdf file that will not be rejected in `hi_res` strategy, default is `300`. +* `UNSTRUCTURED_PDF_HI_RES_MAX_PAGES` - the maximum number of pages a PDF file can have and still be processed with the `hi_res` strategy (larger files are rejected), default is `300`.
Due to the overhead associated with file splitting, parallel processing mode is only recommended for the `hi_res` strategy. Additionally users of the official [Python client](https://github.com/Unstructured-IO/unstructured-python-client?tab=readme-ov-file#splitting-pdf-by-pages) can enable client-side splitting by setting `split_pdf_page=True`. diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index a0ff66cdf..b54a42e14 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -329,7 +329,7 @@ def pipeline_api( if file_content_type == "application/pdf": _check_pdf(file) - max_pages = int(os.environ.get("UNSTRUCTURED_MAX_PDF_PAGES", 300)) + pdf_hi_res_max_pages = int(os.environ.get("UNSTRUCTURED_PDF_HI_RES_MAX_PAGES", 300)) hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates) strategy = _validate_strategy(strategy) @@ -375,7 +375,7 @@ def pipeline_api( "extract_image_block_types": extract_image_block_types, "extract_image_block_to_payload": extract_image_block_to_payload, "unique_element_ids": unique_element_ids, - "max_pages": max_pages, + "pdf_hi_res_max_pages": pdf_hi_res_max_pages, }, default=str, ) @@ -406,7 +406,7 @@ def pipeline_api( "extract_image_block_to_payload": extract_image_block_to_payload, "unique_element_ids": unique_element_ids, "starting_page_number": starting_page_number, - "max_pages": max_pages, + "pdf_hi_res_max_pages": pdf_hi_res_max_pages, } if file_content_type == "application/pdf" and pdf_parallel_mode_enabled: diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index ebc264449..e33f6265e 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -1161,7 +1161,7 @@ def test__set_pdf_infer_table_structure( @pytest.mark.parametrize( - ("strategy", "test_file", "max_pages", "expect_code"), + ("strategy", "test_file", "pdf_hi_res_max_pages", "expect_code"), [ ("hi_res", Path("sample-docs") / 
"DA-1p-with-duplicate-pages.pdf", "300", 200), ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422), @@ -1171,8 +1171,8 @@ def test__set_pdf_infer_table_structure( ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 200), ], ) -def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, max_pages, expect_code): - monkeypatch.setenv("UNSTRUCTURED_MAX_PDF_PAGES", max_pages) +def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, pdf_hi_res_max_pages, expect_code): + monkeypatch.setenv("UNSTRUCTURED_PDF_HI_RES_MAX_PAGES", pdf_hi_res_max_pages) client = TestClient(app) response = client.post( MAIN_API_ROUTE,