diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dee1aa8b9..a35dca5a18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,13 @@ -## 0.16.17-dev1 +## 0.16.18-dev1 + +### Enhancements + +### Features + +### Fixes +- **Correct deprecated `ruff` invocation in `make tidy`**. This will future-proof it or avoid surprises if someone happens to upgrade Ruff. + +## 0.16.17 ### Enhancements - **Refactoring the VoyageAI integration** to use voyageai package directly, allowing extra features. @@ -6,7 +15,7 @@ ### Features ### Fixes -- **Correct deprecated `ruff` invocation in `make tidy`**. This will future-proof it or avoid surprises if someone happens to upgrade Ruff. +- **Fix a bug where `build_layout_elements_from_ocr_regions` incorrectly joins texts in the wrong order**. ## 0.16.16 diff --git a/test_unstructured/partition/pdf_image/test_inference_utils.py b/test_unstructured/partition/pdf_image/test_inference_utils.py index 02897c6819..28fbdd6f42 100644 --- a/test_unstructured/partition/pdf_image/test_inference_utils.py +++ b/test_unstructured/partition/pdf_image/test_inference_utils.py @@ -91,3 +91,79 @@ def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedd ), ] ) + + +def test_build_layout_elements_from_ocr_regions_with_repeated_texts(mock_embedded_text_regions): + mock_embedded_text_regions.extend( + [ + LayoutElement.from_coords( + x1=453.00277777777774, + y1=417.319341111111, + x2=711.5338541666665, + y2=458.28571222222206, + text="LayoutParser", + type=ElementType.UNCATEGORIZED_TEXT, + ), + LayoutElement.from_coords( + x1=453.00277777777774, + y1=468.319341111111, + x2=711.5338541666665, + y2=478.28571222222206, + text="for", + type=ElementType.UNCATEGORIZED_TEXT, + ), + LayoutElement.from_coords( + x1=453.00277777777774, + y1=488.319341111111, + x2=711.5338541666665, + y2=500.28571222222206, + text="Deep", + type=ElementType.UNCATEGORIZED_TEXT, + ), + LayoutElement.from_coords( + x1=453.00277777777774, + y1=510.319341111111, + 
x2=711.5338541666665, + y2=550.28571222222206, + text="Learning", + type=ElementType.UNCATEGORIZED_TEXT, + ), + ] + ) + text = ( + "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\n" + "LayoutParser for Deep Learning" + ) + elements = build_layout_elements_from_ocr_regions( + TextRegions.from_list(mock_embedded_text_regions), + text, + group_by_ocr_text=True, + ) + assert elements == LayoutElements.from_list( + [ + LayoutElement.from_coords( + x1=453.00277777777774, + y1=317.319341111111, + x2=711.5338541666665, + y2=358.28571222222206, + text="LayoutParser:", + type=ElementType.UNCATEGORIZED_TEXT, + ), + LayoutElement.from_coords( + x1=437.83888888888885, + y1=317.319341111111, + x2=1256.334784222222, + y2=406.9837855555556, + text="A Unified Toolkit for Deep Learning Based Document Image", + type=ElementType.UNCATEGORIZED_TEXT, + ), + LayoutElement.from_coords( + x1=453.00277777777774, + y1=417.319341111111, + x2=711.5338541666665, + y2=550.28571222222206, + text="LayoutParser for Deep Learning", + type=ElementType.UNCATEGORIZED_TEXT, + ), + ] + ) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 48ac8bc24b..d649086bcd 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.17-dev1" # pragma: no cover +__version__ = "0.16.17" # pragma: no cover diff --git a/unstructured/partition/pdf_image/inference_utils.py b/unstructured/partition/pdf_image/inference_utils.py index 2f27b2c32b..fde81b2068 100644 --- a/unstructured/partition/pdf_image/inference_utils.py +++ b/unstructured/partition/pdf_image/inference_utils.py @@ -62,12 +62,12 @@ def build_layout_elements_from_ocr_regions( break if text in words: regions.append(indices[mask][i]) - mask[mask][i] = False words.remove(text) if not regions: continue + mask[regions] = False grouped_regions.append(ocr_regions.slice(regions)) else: grouped_regions = partition_groups_from_regions(ocr_regions)