Skip to content

Commit

Permalink
Merge branch 'main' into update_ruff_invocation
Browse files Browse the repository at this point in the history
  • Loading branch information
cragwolfe authored Jan 29, 2025
2 parents b765adb + 55debaf commit 76162ab
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 4 deletions.
13 changes: 11 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
## 0.16.17-dev1
## 0.16.18-dev1

### Enhancements

### Features

### Fixes
- **Correct deprecated `ruff` invocation in `make tidy`**. This will future-proof it or avoid surprises if someone happens to upgrade Ruff.

## 0.16.17

### Enhancements
- **Refactoring the VoyageAI integration** to use voyageai package directly, allowing extra features.

### Features

### Fixes
- **Correct deprecated `ruff` invocation in `make tidy`**. This will future-proof it or avoid surprises if someone happens to upgrade Ruff.
- **Fix a bug where `build_layout_elements_from_ocr_regions` incorrectly joins texts in wrong order**.

## 0.16.16

Expand Down
76 changes: 76 additions & 0 deletions test_unstructured/partition/pdf_image/test_inference_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,79 @@ def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedd
),
]
)


def test_build_layout_elements_from_ocr_regions_with_repeated_texts(mock_embedded_text_regions):
    """Group OCR regions by OCR text when some words occur more than once.

    Four vertically stacked regions ("LayoutParser", "for", "Deep",
    "Learning") are appended to the fixture regions, and the OCR text ends
    with the paragraph "LayoutParser for Deep Learning" after two earlier
    paragraphs. The expected result holds one element per paragraph, and the
    last element's box spans exactly the four appended regions.
    """
    text_type = ElementType.UNCATEGORIZED_TEXT
    # The appended regions all share the same horizontal extent.
    left = 453.00277777777774
    right = 711.5338541666665

    # (y1, y2, text) for each region stacked below the fixture's regions.
    # Presumably the fixture already provides the regions for the first two
    # paragraphs; its definition is not visible here.
    stacked_words = [
        (417.319341111111, 458.28571222222206, "LayoutParser"),
        (468.319341111111, 478.28571222222206, "for"),
        (488.319341111111, 500.28571222222206, "Deep"),
        (510.319341111111, 550.28571222222206, "Learning"),
    ]
    extra_regions = [
        LayoutElement.from_coords(
            x1=left,
            y1=top,
            x2=right,
            y2=bottom,
            text=word,
            type=text_type,
        )
        for top, bottom, word in stacked_words
    ]
    mock_embedded_text_regions.extend(extra_regions)

    text = (
        "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image\n\n"
        "LayoutParser for Deep Learning"
    )
    ocr_regions = TextRegions.from_list(mock_embedded_text_regions)
    elements = build_layout_elements_from_ocr_regions(
        ocr_regions,
        text,
        group_by_ocr_text=True,
    )

    first_paragraph = LayoutElement.from_coords(
        x1=left,
        y1=317.319341111111,
        x2=right,
        y2=358.28571222222206,
        text="LayoutParser:",
        type=text_type,
    )
    second_paragraph = LayoutElement.from_coords(
        x1=437.83888888888885,
        y1=317.319341111111,
        x2=1256.334784222222,
        y2=406.9837855555556,
        text="A Unified Toolkit for Deep Learning Based Document Image",
        type=text_type,
    )
    # Runs from the top of the first appended region to the bottom of the last.
    repeated_paragraph = LayoutElement.from_coords(
        x1=left,
        y1=417.319341111111,
        x2=right,
        y2=550.28571222222206,
        text="LayoutParser for Deep Learning",
        type=text_type,
    )
    expected = LayoutElements.from_list(
        [first_paragraph, second_paragraph, repeated_paragraph]
    )
    assert elements == expected
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# NOTE(review): both assignments come from a one-line diff hunk ("@@ -1 +1 @@");
# presumably the first is the removed line and the second its replacement — confirm.
__version__ = "0.16.17-dev1" # pragma: no cover
__version__ = "0.16.17" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/partition/pdf_image/inference_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,12 @@ def build_layout_elements_from_ocr_regions(
break
if text in words:
regions.append(indices[mask][i])
mask[mask][i] = False
words.remove(text)

if not regions:
continue

mask[regions] = False
grouped_regions.append(ocr_regions.slice(regions))
else:
grouped_regions = partition_groups_from_regions(ocr_regions)
Expand Down

0 comments on commit 76162ab

Please sign in to comment.