From ba61808ab806de4818622fc08682b79b8ba0ed7a Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 2 Jan 2025 14:01:54 -0500 Subject: [PATCH] cleanup changes --- marker/converters/pdf.py | 4 +- .../processors/llm/llm_image_description.py | 65 ------------------- 2 files changed, 1 insertion(+), 68 deletions(-) diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py index 23fc9545..bfe9d327 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -1,7 +1,4 @@ import os - -from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor - os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning import inspect @@ -24,6 +21,7 @@ from marker.processors.llm.llm_form import LLMFormProcessor from marker.processors.llm.llm_table import LLMTableProcessor from marker.processors.llm.llm_text import LLMTextProcessor +from marker.processors.llm.llm_image_description import LLMImageDescriptionProcessor from marker.processors.ignoretext import IgnoreTextProcessor from marker.processors.line_numbers import LineNumbersProcessor from marker.processors.list import ListProcessor diff --git a/marker/processors/llm/llm_image_description.py b/marker/processors/llm/llm_image_description.py index 4ff314cb..837e5fcf 100644 --- a/marker/processors/llm/llm_image_description.py +++ b/marker/processors/llm/llm_image_description.py @@ -1,17 +1,11 @@ -from tabled.schema import SpanTableCell - from marker.processors.llm import BaseLLMProcessor -from bs4 import BeautifulSoup -from typing import List from google.ai.generativelanguage_v1beta.types import content -from tabled.formats import html_format from marker.schema import BlockTypes from marker.schema.blocks import Block from marker.schema.document import Document from marker.schema.groups.page import PageGroup -from marker.schema.polygon import PolygonBox class LLMImageDescriptionProcessor(BaseLLMProcessor): @@ -66,62 +60,3 @@ def process_rewriting(self, document: Document, page: PageGroup, block: Block): return block.description = image_description - - - def parse_html_table(self, html_text: str, block: Block) -> List[SpanTableCell]: - soup = BeautifulSoup(html_text, 'html.parser') - table = soup.find('table') - - # Initialize grid - rows = table.find_all('tr') - cells = [] - max_cols = max(len(row.find_all(['td', 'th'])) for row in rows) - if max_cols == 0: - return [] - - grid = [[True] * max_cols for _ in range(len(rows))] - - for i, row in enumerate(rows): - cur_col = 0 - row_cells = row.find_all(['td', 'th']) - for j, cell in enumerate(row_cells): - while cur_col < max_cols and not grid[i][cur_col]: - cur_col += 1 - - if cur_col >= max_cols: - print("Table parsing warning: too many columns found") - break - - cell_text = cell.text.strip() - rowspan = min(int(cell.get('rowspan', 1)), len(rows) - i) - colspan = min(int(cell.get('colspan', 1)), max_cols - cur_col) - cell_rows = list(range(i, i + rowspan)) - cell_cols = list(range(cur_col, cur_col + colspan)) - - if colspan == 0 or rowspan == 0: - print("Table parsing warning: invalid colspan or rowspan") - continue - - for r in cell_rows: - for c in cell_cols: - grid[r][c] = False - - cell_bbox = [ - block.polygon.bbox[0] + cur_col, - block.polygon.bbox[1] + i, - block.polygon.bbox[0] + cur_col + colspan, - block.polygon.bbox[1] + i + rowspan - ] - cell_polygon = PolygonBox.from_bbox(cell_bbox) - - cell_obj = SpanTableCell( - text=cell_text, - row_ids=cell_rows, - col_ids=cell_cols, - bbox=cell_polygon.bbox - ) - cells.append(cell_obj) - cur_col += colspan - - - return cells