From a9ff1e70b2cba42b1422541b4a08b94e5cfc0cf7 Mon Sep 17 00:00:00 2001
From: Yao You <theyaoyou@gmail.com>
Date: Wed, 29 Jan 2025 06:11:17 -0600
Subject: [PATCH 1/2] Fix/fix ocr region to elements bug (#3891)

This PR fixes a bug in `build_layout_elements_from_ocr_regions` where
texts are joined in the wrong order.

The bug is due to incorrect masking of the `ocr_regions` after some of them
have already been selected for one of the final groups: the old
`mask[mask][i] = False` assigned into a temporary copy produced by boolean
indexing, so `mask` itself was never updated. The fix uses a simpler method:
the same indices that add the regions to a final group are used to mask
them, so they are not considered again.

## Testing

This PR adds a unit test aimed specifically at this bug. Without the
fix the test fails.
Additionally, any PDF file with repeated text has the potential to
trigger this bug. For example, create a simple PDF using the test text

```python
"LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\nLayoutParser for Deep Learning"
```
and partition it in `ocr_only` mode on the main branch; this hits the bug and
outputs text where the position of the second "LayoutParser" is incorrect:
```python
[
    'LayoutParser:',
    'A Unified Toolkit for Deep Learning Based Document Image',
    'for Deep Learning LayoutParser',
]
```
---
 CHANGELOG.md                                  |  3 +-
 .../pdf_image/test_inference_utils.py         | 76 +++++++++++++++++++
 unstructured/__version__.py                   |  2 +-
 .../partition/pdf_image/inference_utils.py    |  2 +-
 4 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8869fbe9f7..8cc86c7b52 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.17-dev1
+## 0.16.17-dev2
 
 ### Enhancements
 - **Refactoring the VoyageAI integration** to use voyageai package directly, allowing extra features.
@@ -6,6 +6,7 @@
 ### Features
 
 ### Fixes
+- **Fix a bug where `build_layout_elements_from_ocr_regions` joins texts in the wrong order**.
 
 ## 0.16.16
 
diff --git a/test_unstructured/partition/pdf_image/test_inference_utils.py b/test_unstructured/partition/pdf_image/test_inference_utils.py
index 02897c6819..28fbdd6f42 100644
--- a/test_unstructured/partition/pdf_image/test_inference_utils.py
+++ b/test_unstructured/partition/pdf_image/test_inference_utils.py
@@ -91,3 +91,79 @@ def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedd
             ),
         ]
     )
+
+
+def test_build_layout_elements_from_ocr_regions_with_repeated_texts(mock_embedded_text_regions):
+    mock_embedded_text_regions.extend(
+        [
+            LayoutElement.from_coords(
+                x1=453.00277777777774,
+                y1=417.319341111111,
+                x2=711.5338541666665,
+                y2=458.28571222222206,
+                text="LayoutParser",
+                type=ElementType.UNCATEGORIZED_TEXT,
+            ),
+            LayoutElement.from_coords(
+                x1=453.00277777777774,
+                y1=468.319341111111,
+                x2=711.5338541666665,
+                y2=478.28571222222206,
+                text="for",
+                type=ElementType.UNCATEGORIZED_TEXT,
+            ),
+            LayoutElement.from_coords(
+                x1=453.00277777777774,
+                y1=488.319341111111,
+                x2=711.5338541666665,
+                y2=500.28571222222206,
+                text="Deep",
+                type=ElementType.UNCATEGORIZED_TEXT,
+            ),
+            LayoutElement.from_coords(
+                x1=453.00277777777774,
+                y1=510.319341111111,
+                x2=711.5338541666665,
+                y2=550.28571222222206,
+                text="Learning",
+                type=ElementType.UNCATEGORIZED_TEXT,
+            ),
+        ]
+    )
+    text = (
+        "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\n"
+        "LayoutParser for Deep Learning"
+    )
+    elements = build_layout_elements_from_ocr_regions(
+        TextRegions.from_list(mock_embedded_text_regions),
+        text,
+        group_by_ocr_text=True,
+    )
+    assert elements == LayoutElements.from_list(
+        [
+            LayoutElement.from_coords(
+                x1=453.00277777777774,
+                y1=317.319341111111,
+                x2=711.5338541666665,
+                y2=358.28571222222206,
+                text="LayoutParser:",
+                type=ElementType.UNCATEGORIZED_TEXT,
+            ),
+            LayoutElement.from_coords(
+                x1=437.83888888888885,
+                y1=317.319341111111,
+                x2=1256.334784222222,
+                y2=406.9837855555556,
+                text="A Unified Toolkit for Deep Learning Based Document Image",
+                type=ElementType.UNCATEGORIZED_TEXT,
+            ),
+            LayoutElement.from_coords(
+                x1=453.00277777777774,
+                y1=417.319341111111,
+                x2=711.5338541666665,
+                y2=550.28571222222206,
+                text="LayoutParser for Deep Learning",
+                type=ElementType.UNCATEGORIZED_TEXT,
+            ),
+        ]
+    )
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 48ac8bc24b..0739850088 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.17-dev1"  # pragma: no cover
+__version__ = "0.16.17-dev2"  # pragma: no cover
diff --git a/unstructured/partition/pdf_image/inference_utils.py b/unstructured/partition/pdf_image/inference_utils.py
index 2f27b2c32b..fde81b2068 100644
--- a/unstructured/partition/pdf_image/inference_utils.py
+++ b/unstructured/partition/pdf_image/inference_utils.py
@@ -62,12 +62,12 @@ def build_layout_elements_from_ocr_regions(
                     break
                 if text in words:
                     regions.append(indices[mask][i])
-                    mask[mask][i] = False
                     words.remove(text)
 
             if not regions:
                 continue
 
+            mask[regions] = False
             grouped_regions.append(ocr_regions.slice(regions))
     else:
         grouped_regions = partition_groups_from_regions(ocr_regions)

From 55debafa8f89f42df5f5e35a686a4b4059e1221e Mon Sep 17 00:00:00 2001
From: cragwolfe <crag@unstructured.io>
Date: Wed, 29 Jan 2025 04:49:49 -0800
Subject: [PATCH 2/2] release: 0.16.17 (#3892)

Co-authored-by: Yao You <yao@unstructured.io>
---
 CHANGELOG.md                | 2 +-
 unstructured/__version__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8cc86c7b52..4b1b078c15 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.17-dev2
+## 0.16.17
 
 ### Enhancements
 - **Refactoring the VoyageAI integration** to use voyageai package directly, allowing extra features.
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 0739850088..d649086bcd 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.17-dev2"  # pragma: no cover
+__version__ = "0.16.17"  # pragma: no cover