diff --git a/README.md b/README.md index 6d76032..8eb7e44 100644 --- a/README.md +++ b/README.md @@ -143,7 +143,7 @@ Request #3 => page_2_markdown + page_3_image ### Installation: -- Install **poppler-utils** on the system, it should be available in path variable +- Install **poppler-utils** and **tesseract** on the system; both should be available on the system PATH - Install py-zerox: ```sh pip install py-zerox @@ -243,6 +243,7 @@ async def zerox( concurrency: int = 10, file_path: Optional[str] = "", maintain_format: bool = False, + bounding_box: bool = False, model: str = "gpt-4o-mini", output_dir: Optional[str] = None, temp_dir: Optional[str] = None, @@ -263,6 +264,8 @@ Parameters The path to the PDF file to process. Defaults to an empty string. - **maintain_format** (bool, optional): Whether to maintain the format from the previous page. Defaults to False. +- **bounding_box** (bool, optional): + Whether to return the normalized bounding box of each section identified in the page. Defaults to False. - **model** (str, optional): The model to use for generating completions. Defaults to "gpt-4o-mini". Refer to LiteLLM Providers for the correct model name, as it may differ depending on the provider. 
@@ -332,6 +335,36 @@ ZeroxOutput( 'boolean type, `false` is the default, and the default char value is `\\0`, the null-terminating ' + 'character (zero in the ASCII table).', content_length=2333, + sections=[ + Section( + content='| Type | Description | Wrapper Class |\n|---------|-------------------------------------------|---------------|\n| byte | 8-bit signed 2s complement integer | Byte |\n| short | 16-bit signed 2s complement integer | Short |\n| int | 32-bit signed 2s complement integer | Integer |\n| long | 64-bit signed 2s complement integer | Long |\n| float | 32-bit IEEE 754 floating point number | Float |\n| double | 64-bit floating point number | Double |\n| boolean | may be set to `true` or `false` | Boolean |\n| char | 16-bit Unicode (UTF-16) character | Character |\n\n**Table 26.2.: Primitive types in Java** ', + bounding_box=(0.16198125836680052, 0.07765151515151515, 0.5863453815261044, 0.19602272727272727) + ), + Section( + content='\n\n### 26.3.1. Declaration & Assignment ', + bounding_box=(0.08433734939759036, 0.30303030303030304, 0.37751004016064255, 0.01231060606060606) + ), + Section( + content='\n\nJava is a statically typed language meaning that all variables must be declared before you can use them or refer to them. In addition, when declaring a variable, you must specify both its type and its identifier. For example:\n\n```java\nint numUnits; \ndouble costPerUnit; \nchar firstInitial; \nboolean isStudent; \n``` ', + bounding_box=(0.08299866131191433, 0.3446969696969697, 0.749665327978581, 0.13541666666666666) + ), + Section( + content="\n\nEach declaration specifies the variable's type followed by the identifier and ending with a semicolon. The identifier rules are fairly standard: a name can consist of lowercase and uppercase alphabetic characters, numbers, and underscores but may not begin with a numeric character. We adopt the modern camelCasing naming convention for variables in our code. 
In general, variables **must** be assigned a value before you can use them in an expression. You do not have to immediately assign a value when you declare them (though it is good practice), but some value must be assigned before they can be used or the compiler will issue an error.² ", + bounding_box=(0.08299866131191433, 0.5501893939393939, 0.751004016064257, 0.1571969696969697) + ), + Section( + content='\n\nThe assignment operator is a single equal sign, `=` and is a right-to-left assignment. That is, the variable that we wish to assign the value to appears on the left-hand-side while the value (literal, variable or expression) is on the right-hand-side. Using our variables from before, we can assign them values: ', + bounding_box=(0.08299866131191433, 0.6534090909090909, 0.7483266398929049, 0.10795454545454546) + ), + Section( + content='\n\n²Instance variables, that is variables declared as part of an object do have default values. For objects, the default is `null`, for all numeric types, zero is the default value. For the `boolean` type, `false` is the default, and the default `char` value is `\\0`, the null-terminating character (zero in the ASCII table). 
', + bounding_box=(0.08299866131191433, 0.6998106060606061, 0.749665327978581, 0.13825757575757575) + ), + Section( + content='', + bounding_box=(0.7054886211512718, 0.048295454545454544, 0.03480589022757698, 0.00946969696969697) + ) + ], page=1 ) ] diff --git a/py_zerox/pyzerox/constants/messages.py b/py_zerox/pyzerox/constants/messages.py index e3ca826..318678e 100644 --- a/py_zerox/pyzerox/constants/messages.py +++ b/py_zerox/pyzerox/constants/messages.py @@ -54,3 +54,11 @@ class Messages: FAILED_TO_SAVE_FILE = """Failed to save file to local drive""" FAILED_TO_PROCESS_IMAGE = """Failed to process image""" + + FAILED_TO_PERFORM_OCR = """ + Failed to perform OCR on image: {0} + """ + + FAILED_TO_FIND_BOUNDING_BOX = """ + Failed to find bounding box for the section: {0} + """ diff --git a/py_zerox/pyzerox/constants/patterns.py b/py_zerox/pyzerox/constants/patterns.py index 6be1a77..b4506ca 100644 --- a/py_zerox/pyzerox/constants/patterns.py +++ b/py_zerox/pyzerox/constants/patterns.py @@ -4,3 +4,9 @@ class Patterns: MATCH_MARKDOWN_BLOCKS = r"^```[a-z]*\n([\s\S]*?)\n```$" MATCH_CODE_BLOCKS = r"^```\n([\s\S]*?)\n```$" + + +class MarkdownConstants: + """A class to hold constants related to Markdown formatting.""" + + SECTION_DELIMITER = "" diff --git a/py_zerox/pyzerox/constants/prompts.py b/py_zerox/pyzerox/constants/prompts.py index 1b00f61..901bae8 100644 --- a/py_zerox/pyzerox/constants/prompts.py +++ b/py_zerox/pyzerox/constants/prompts.py @@ -5,4 +5,9 @@ class Prompts: Convert the following PDF page to markdown. Return only the markdown with no explanation text. Do not exclude any content from the page. - """ \ No newline at end of file + """ + + SEGMENT_MARKDOWN_SYSTEM_PROMPT = """ + For each section (eg: headings, tables, footers, etc.), add a comment "section" at the end of that section in markdown. + Ensure as much content as possible is formatted using markdown where applicable. 
+ """ diff --git a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py index ffe251d..6cdce88 100644 --- a/py_zerox/pyzerox/core/types.py +++ b/py_zerox/pyzerox/core/types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict, Any, Union, Iterable +from typing import List, Optional, Dict, Any, Tuple, Union, Iterable from dataclasses import dataclass, field @@ -12,6 +12,7 @@ class ZeroxArgs: cleanup: bool = True concurrency: int = 10 maintain_format: bool = False + bounding_box: bool = False model: str = "gpt-4o-mini", output_dir: Optional[str] = None temp_dir: Optional[str] = None @@ -19,6 +20,17 @@ class ZeroxArgs: select_pages: Optional[Union[int, Iterable[int]]] = None kwargs: Dict[str, Any] = field(default_factory=dict) + +@dataclass +class Section: + """ + Dataclass to represent a section of content within a page. + """ + + content: str + bounding_box: Tuple[float, float, float, float] + + @dataclass class Page: """ @@ -27,6 +39,7 @@ class Page: content: str content_length: int + sections: List[Section] page: int diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py index e7505bc..7da1ab5 100644 --- a/py_zerox/pyzerox/core/zerox.py +++ b/py_zerox/pyzerox/core/zerox.py @@ -19,7 +19,7 @@ from ..errors import FileUnavailable from ..constants.messages import Messages from ..models import litellmmodel -from .types import Page, ZeroxOutput +from .types import Page, Section, ZeroxOutput async def zerox( @@ -27,6 +27,7 @@ async def zerox( concurrency: int = 10, file_path: Optional[str] = "", maintain_format: bool = False, + bounding_box: bool = False, model: str = "gpt-4o-mini", output_dir: Optional[str] = None, temp_dir: Optional[str] = None, @@ -46,6 +47,8 @@ async def zerox( :type file_path: str, optional :param maintain_format: Whether to maintain the format from the previous page, defaults to False :type maintain_format: bool, optional + :param bounding_box: Whether to include bounding box information in the output. 
+ :type bounding_box: bool, optional :param model: The model to use for generating completions, defaults to "gpt-4o-mini". Note - Refer: https://docs.litellm.ai/docs/providers to pass correct model name as according to provider it might be different from actual name. :type model: str, optional :param output_dir: The directory to save the markdown output, defaults to None @@ -61,17 +64,17 @@ async def zerox( :return: The markdown content generated by the model. """ - input_token_count = 0 output_token_count = 0 prior_page = "" aggregated_markdown: List[str] = [] + sections_list: List[List[Section]] = [] start_time = datetime.now() - + # File Path Validators if not file_path: raise FileUnavailable() - + # Create an instance of the litellm model interface vision_model = litellmmodel(model=model,**kwargs) @@ -84,11 +87,12 @@ async def zerox( warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING) # If select_pages is a single integer, convert it to a list for consistency - if isinstance(select_pages, int): - select_pages = [select_pages] - - # Sort the pages to maintain consistency - select_pages = sorted(select_pages) + if select_pages: + if isinstance(select_pages, int): + select_pages = [select_pages] + else: + # Sort the pages to maintain consistency + select_pages = sorted(list(select_pages)) # Ensure the output directory exists if output_dir: @@ -100,7 +104,6 @@ async def zerox( await async_shutil.rmtree(temp_dir) await async_os.makedirs(temp_dir, exist_ok=True) - # Create a temporary directory to store the PDF and images with tempfile.TemporaryDirectory() as temp_dir_: @@ -115,10 +118,10 @@ async def zerox( local_path = await download_file(file_path=file_path, temp_dir=temp_directory) if not local_path: raise FileUnavailable() - + raw_file_name = os.path.splitext(os.path.basename(local_path))[0] file_name = "".join(c.lower() if c.isalnum() else "_" for c in raw_file_name) - + # create a subset pdf in temp dir with only the requested pages if select_pages 
is provided if select_pages is not None: subset_pdf_create_kwargs = {"original_pdf_path":local_path, "select_pages":select_pages, @@ -131,29 +134,37 @@ async def zerox( if maintain_format: for image in images: - result, input_token_count, output_token_count, prior_page = await process_page( - image, - vision_model, - temp_directory, - input_token_count, - output_token_count, - prior_page, + result, input_token_count, output_token_count, prior_page, sections = ( + await process_page( + image, + vision_model, + temp_directory, + bounding_box, + input_token_count, + output_token_count, + prior_page, + ) ) if result: aggregated_markdown.append(result) + sections_list.append(sections) else: results = await process_pages_in_batches( images, concurrency, vision_model, temp_directory, + bounding_box, input_token_count, output_token_count, prior_page, ) - aggregated_markdown = [result[0] for result in results if isinstance(result[0], str)] + for result in results: + if isinstance(result[0], str): + aggregated_markdown.append(result[0]) + sections_list.append(result[-1]) ## add token usage input_token_count += sum([result[1] for result in results]) @@ -177,15 +188,25 @@ async def zerox( if select_pages is not None: # Map aggregated markdown to the selected pages formatted_pages = [ - Page(content=content, page=select_pages[i], content_length=len(content)) - for i, content in enumerate(aggregated_markdown) - ] + Page( + content=content, + page=select_pages[i], + content_length=len(content), + sections=sections_list[i], + ) + for i, content in enumerate(aggregated_markdown) + ] else: # Default behavior when no select_pages is provided formatted_pages = [ - Page(content=content, page=i + 1, content_length=len(content)) - for i, content in enumerate(aggregated_markdown) - ] + Page( + content=content, + page=i + 1, + content_length=len(content), + sections=sections_list[i], + ) + for i, content in enumerate(aggregated_markdown) + ] return ZeroxOutput( 
completion_time=completion_time, @@ -193,4 +214,4 @@ async def zerox( input_tokens=input_token_count, output_tokens=output_token_count, pages=formatted_pages, - ) \ No newline at end of file + ) diff --git a/py_zerox/pyzerox/models/modellitellm.py b/py_zerox/pyzerox/models/modellitellm.py index bda4828..b236896 100644 --- a/py_zerox/pyzerox/models/modellitellm.py +++ b/py_zerox/pyzerox/models/modellitellm.py @@ -13,6 +13,7 @@ from ..processor.image import encode_image_to_base64 DEFAULT_SYSTEM_PROMPT = Prompts.DEFAULT_SYSTEM_PROMPT +SEGMENT_MARKDOWN_SYSTEM_PROMPT = Prompts.SEGMENT_MARKDOWN_SYSTEM_PROMPT class litellmmodel(BaseModel): @@ -69,12 +70,12 @@ def validate_access(self) -> None: """Validates access to the model -> if environment variables are set correctly with correct values.""" if not litellm.check_valid_key(model=self.model,api_key=None): raise ModelAccessError(extra_info={"model": self.model}) - async def completion( self, image_path: str, maintain_format: bool, + bounding_box: bool, prior_page: str, ) -> CompletionResponse: """LitellM completion for image to markdown conversion. @@ -91,6 +92,7 @@ async def completion( messages = await self._prepare_messages( image_path=image_path, maintain_format=maintain_format, + bounding_box=bounding_box, prior_page=prior_page, ) @@ -112,6 +114,7 @@ async def _prepare_messages( self, image_path: str, maintain_format: bool, + bounding_box: bool, prior_page: str, ) -> List[Dict[str, Any]]: """Prepares the messages to send to the LiteLLM Completion API. @@ -131,6 +134,14 @@ async def _prepare_messages( }, ] + if bounding_box: + messages.append( + { + "role": "system", + "content": SEGMENT_MARKDOWN_SYSTEM_PROMPT + } + ) + # If content has already been generated, add it to context. # This helps maintain the same format across pages. 
if maintain_format and prior_page: diff --git a/py_zerox/pyzerox/processor/bounding_box.py b/py_zerox/pyzerox/processor/bounding_box.py new file mode 100644 index 0000000..3fb10fd --- /dev/null +++ b/py_zerox/pyzerox/processor/bounding_box.py @@ -0,0 +1,199 @@ +from typing import Dict, List, Tuple + +import Levenshtein + +from py_zerox.pyzerox.constants.messages import Messages + + +async def find_substring_with_minimum_edit_distance( + content_string: str, pattern: str +) -> Tuple[str, int]: + """ + Find the substring within the given content string + that has the minimum edit distance to the specified pattern. + + Args: + content_string (str): The string in which to search for the substring. + pattern (str): The pattern to compare against. + + Returns: + Tuple[str, int]: A tuple containing the best matching substring + and its starting index in the content string. + If no substring is found, the starting index + will be -1. + """ + content_length = len(content_string) + pattern_length = len(pattern) + min_distance = float("inf") + best_substring: str = "" + best_substring_start_index: int = -1 + + for i in range(content_length - pattern_length + 1): + substring = content_string[i : i + pattern_length] + distance = Levenshtein.distance(substring, pattern) + if distance < min_distance: + min_distance = distance + best_substring = substring + best_substring_start_index = i + + return best_substring, best_substring_start_index + + +async def find_substring_indices_from_ocr_data( + content_list: List[str], substring: str, substring_start_index: int +) -> Tuple[int, int]: + """ + Find the indices of the first and last strings in a list + of strings that contain a specified substring starting from a given index. + + Args: + content_list (List[str]): A list of strings to search through. + substring (str): The substring to find within the content list. 
+ substring_start_index (int): The starting index in the combined string + representation of the content list from + which to search for the substring. + + Returns: + Tuple[int, int]: A tuple containing the index of the first string + and the index of the last string that contains + the substring. Returns (-1, -1) if the substring + is not found within the specified range. + """ + substring_length = len(substring) + current_length: int = 0 + first_string_index: int = -1 + last_string_index: int = -1 + + for index, string in enumerate(content_list): + string_length = len(string) + 1 + if current_length <= substring_start_index + 1 < current_length + string_length: + first_string_index = index + if ( + current_length + <= substring_start_index + substring_length - 1 + < current_length + string_length + ): + last_string_index = index + current_length += string_length + + return first_string_index, last_string_index + + +async def calculate_bounding_box( + ocr_data: Dict[str, list], first_string_index: int, last_string_index: int +) -> Tuple[float, float, float, float]: + """ + Calculate the bounding box coordinates that encompasses a set of strings based on OCR data. + + Args: + ocr_data (Dict[str, list]): A dictionary containing lists of + 'left_list', 'top_list', 'width_list', and 'height_list' + representing the OCR data for the strings. + first_string_index (int): The index of the first string to consider. + last_string_index (int): The index of the last string to consider. + + Returns: + Tuple[float, float, float, float]: A tuple containing the coordinates + of the bounding box in the format (left, top, width, height). 
+ """ + leftmost_string_x = ocr_data["left_list"][first_string_index] + rightmost_string_x = 0 + topmost_string_y = ocr_data["top_list"][first_string_index] + bottommost_string_y = 0 + rightmost_string_width = ocr_data["width_list"][first_string_index] + bottommost_string_height = ocr_data["height_list"][first_string_index] + + for i in range(first_string_index + 1, last_string_index + 1): + if ocr_data["left_list"][i] < leftmost_string_x: + leftmost_string_x = ocr_data["left_list"][i] + if ocr_data["top_list"][i] < topmost_string_y: + topmost_string_y = ocr_data["top_list"][i] + if ocr_data["left_list"][i] > rightmost_string_x: + rightmost_string_x = ocr_data["left_list"][i] + rightmost_string_width = ocr_data["width_list"][i] + if ocr_data["top_list"][i] > bottommost_string_y: + bottommost_string_y = ocr_data["top_list"][i] + bottommost_string_height = ocr_data["height_list"][i] + + width = 0 + height = 0 + if rightmost_string_x > leftmost_string_x: + width = rightmost_string_x - leftmost_string_x + if bottommost_string_y > topmost_string_y: + height = bottommost_string_y - topmost_string_y + max_width = width + rightmost_string_width + max_height = height + bottommost_string_height + + return leftmost_string_x, topmost_string_y, max_width, max_height + + +async def normalize_bounding_box( + left: float, + top: float, + width: float, + height: float, + image_dimensions: Tuple[float, float], +) -> Tuple[float, float, float, float]: + """ + Normalize the bounding box coordinates and dimensions based on the image dimensions. + + Args: + left (float): The x-coordinate of the top-left corner of the bounding box. + top (float): The y-coordinate of the top-left corner of the bounding box. + width (float): The width of the bounding box. + height (float): The height of the bounding box. + image_dimensions (Tuple[float, float]): A tuple containing the width and height of the image. 
+ + Returns: + Tuple[float, float, float, float]: A tuple containing the normalized left, top, width, and height of the bounding box. + """ + normalized_left = left / image_dimensions[0] + normalized_top = top / image_dimensions[1] + normalized_width = width / image_dimensions[0] + normalized_height = height / image_dimensions[1] + + return normalized_left, normalized_top, normalized_width, normalized_height + + +async def find_bounding_box( + ocr_data: Dict[str, list], string_to_compare: str +) -> Tuple[float, float, float, float]: + """ + Find the bounding box coordinates for a given string within the OCR data. + + Args: + ocr_data (Dict[str, list]): A dictionary containing OCR data. + string_to_compare (str): The string for which the bounding box needs to be found. + + Returns: + Tuple[float, float, float, float]: A tuple containing the bounding box coordinates in the format (left, top, width, height). + """ + try: + text_content = " ".join(ocr_data["text_list"]) + substring, substring_start_index = ( + await find_substring_with_minimum_edit_distance( + content_string=text_content, pattern=string_to_compare + ) + ) + first_string_index, last_string_index = ( + await find_substring_indices_from_ocr_data( + content_list=ocr_data["text_list"], + substring=substring, + substring_start_index=substring_start_index, + ) + ) + left, top, width, height = await calculate_bounding_box( + ocr_data=ocr_data, + first_string_index=first_string_index, + last_string_index=last_string_index, + ) + left, top, width, height = await normalize_bounding_box( + left=left, + top=top, + width=width, + height=height, + image_dimensions=ocr_data["dimensions"], + ) + return left, top, width, height + except Exception as err: + raise Exception(Messages.FAILED_TO_FIND_BOUNDING_BOX.format(err)) diff --git a/py_zerox/pyzerox/processor/ocr.py b/py_zerox/pyzerox/processor/ocr.py new file mode 100644 index 0000000..de305e3 --- /dev/null +++ b/py_zerox/pyzerox/processor/ocr.py @@ -0,0 +1,82 @@ 
+from typing import Dict +from PIL import Image +import pytesseract + +from py_zerox.pyzerox.constants.messages import Messages + + +def enhance_image_for_ocr(image: Image) -> Image: + """ + Enhances the given image for Optical Character Recognition. + Converts the image to grayscale. + + Args: + image (Image): The input image to be enhanced. + + Returns: + Image: The enhanced grayscale image ready for OCR processing. + """ + image = image.convert("L") + return image + + +async def _clean_ocr_text(data: Dict[str, list]) -> Dict[str, list]: + """ + Processes the input data dictionary containing OCR results, + filtering out entries with low confidence scores or empty text. + + Args: + data (dict): A dictionary containing OCR results: + - 'text': A list of recognized text strings. + - 'conf': A list of confidence scores corresponding to each text. + - 'left': A list of x-coordinates for the text bounding boxes. + - 'top': A list of y-coordinates for the text bounding boxes. + - 'width': A list of widths for the text bounding boxes. + - 'height': A list of heights for the text bounding boxes. + + Returns: + dict: A dictionary containing filtered lists of text and attributes: + - 'text_list': A list of valid text strings. + - 'left_list': A list of x-coordinates for the text bounding boxes. + - 'top_list': A list of y-coordinates for the text bounding boxes. + - 'width_list': A list of widths for the text bounding boxes. + - 'height_list': A list of heights for the text bounding boxes. 
+ """ + data_lists = { + "text_list": [], + "left_list": [], + "top_list": [], + "width_list": [], + "height_list": [], + } + + for i in range(len(data["text"])): + if int(data["conf"][i]) > 0 and data["text"][i].strip(): + data_lists["text_list"].append(data["text"][i]) + data_lists["left_list"].append(data["left"][i]) + data_lists["top_list"].append(data["top"][i]) + data_lists["width_list"].append(data["width"][i]) + data_lists["height_list"].append(data["height"][i]) + + return data_lists + + +async def perform_image_ocr(image_path: str) -> Dict[str, list]: + """ + Perform OCR on the specified image. + + Args: + image_path (str): The file path to the image. + + Returns: + A dictionary containing the cleaned OCR text data and attributes. + """ + try: + image = Image.open(image_path) + image = enhance_image_for_ocr(image=image) + data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT) + cleaned_data = await _clean_ocr_text(data=data) + cleaned_data["dimensions"] = image.size + return cleaned_data + except Exception as err: + raise Exception(Messages.FAILED_TO_PERFORM_OCR.format(err)) diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py index c3b3fa6..585a43c 100644 --- a/py_zerox/pyzerox/processor/pdf.py +++ b/py_zerox/pyzerox/processor/pdf.py @@ -4,9 +4,14 @@ from typing import List, Optional, Tuple from pdf2image import convert_from_path +from py_zerox.pyzerox.constants.patterns import MarkdownConstants +from py_zerox.pyzerox.core.types import Section +from py_zerox.pyzerox.processor.bounding_box import find_bounding_box +from py_zerox.pyzerox.processor.ocr import perform_image_ocr + # Package Imports from .image import save_image -from .text import format_markdown +from .text import format_markdown, remove_markdown from ..constants import PDFConversionDefaultOptions, Messages from ..models import litellmmodel @@ -37,13 +42,17 @@ async def process_page( image: str, model: litellmmodel, temp_directory: str = "", 
+ bounding_box: bool = False, input_token_count: int = 0, output_token_count: int = 0, prior_page: str = "", semaphore: Optional[asyncio.Semaphore] = None, -) -> Tuple[str, int, int, str]: +) -> Tuple[str, int, int, str, Optional[List[Section]]]: """Process a single page of a PDF""" + markdown_sections: List[str] = [] + sections: Optional[List[Section]] = None + # If semaphore is provided, acquire it before processing the page if semaphore: async with semaphore: @@ -51,6 +60,7 @@ async def process_page( image, model, temp_directory, + bounding_box, input_token_count, output_token_count, prior_page, @@ -63,15 +73,42 @@ async def process_page( completion = await model.completion( image_path=image_path, maintain_format=True, + bounding_box=bounding_box, prior_page=prior_page, ) formatted_markdown = format_markdown(completion.content) + + if bounding_box: + sections = [] + ocr_data = await perform_image_ocr(image_path=image_path) + + markdown_sections = formatted_markdown.split( + MarkdownConstants.SECTION_DELIMITER + ) + for markdown_section in markdown_sections: + text_section = remove_markdown(markdown_section) + bounding_box_coords = await find_bounding_box( + ocr_data=ocr_data, string_to_compare=text_section + ) + section = Section( + content=markdown_section, bounding_box=bounding_box_coords + ) + sections.append(section) + + formatted_markdown = "".join(markdown_sections) + input_token_count += completion.input_tokens output_token_count += completion.output_tokens prior_page = formatted_markdown - return formatted_markdown, input_token_count, output_token_count, prior_page + return ( + formatted_markdown, + input_token_count, + output_token_count, + prior_page, + sections, + ) except Exception as error: logging.error(f"{Messages.FAILED_TO_PROCESS_IMAGE} Error:{error}") @@ -83,6 +120,7 @@ async def process_pages_in_batches( concurrency: int, model: litellmmodel, temp_directory: str = "", + bounding_box: bool = False, input_token_count: int = 0, 
output_token_count: int = 0, prior_page: str = "", @@ -96,6 +134,7 @@ async def process_pages_in_batches( image, model, temp_directory, + bounding_box, input_token_count, output_token_count, prior_page, diff --git a/py_zerox/pyzerox/processor/text.py b/py_zerox/pyzerox/processor/text.py index 9b8fbe5..8927eb1 100644 --- a/py_zerox/pyzerox/processor/text.py +++ b/py_zerox/pyzerox/processor/text.py @@ -1,4 +1,6 @@ import re +from bs4 import BeautifulSoup +import markdown # Package imports from ..constants.patterns import Patterns @@ -10,3 +12,27 @@ def format_markdown(text: str) -> str: formatted_markdown = re.sub(Patterns.MATCH_MARKDOWN_BLOCKS, r"\1", text) formatted_markdown = re.sub(Patterns.MATCH_CODE_BLOCKS, r"\1", formatted_markdown) return formatted_markdown + + +def remove_markdown(content: str) -> str: + """ + Converts a Markdown formatted string to plain text. + + Args: + content (str): A string containing Markdown formatted text. + + Returns: + str: A plain text representation of the input Markdown content. 
+ """ + html = markdown.markdown(content) + + parsed_html = BeautifulSoup(html, "html.parser") + content_text = parsed_html.get_text() + + content_text = re.sub(r"-+", "", content_text) + content_text = re.sub(r"\|", "", content_text) + content_text = re.sub(r"\n+", "\n", content_text) + content_text = re.sub(r"\s+", " ", content_text) + content_text = content_text.strip() + + return content_text diff --git a/py_zerox/scripts/pre_install.py b/py_zerox/scripts/pre_install.py index ed22a61..717205a 100644 --- a/py_zerox/scripts/pre_install.py +++ b/py_zerox/scripts/pre_install.py @@ -23,23 +23,54 @@ def install_package(command, package_name): raise RuntimeError(f"Failed to install {package_name}: {e}") +def check_and_install_poppler(): + """Check for the installation of Poppler and install if not present.""" + + try: + run_command("pdftoppm -h") + except RuntimeError: + if platform.system() == "Darwin": # macOS + install_package("brew install poppler", "Poppler") + elif platform.system() == "Linux": # Linux + install_package( + "sudo apt-get update && sudo apt-get install -y poppler-utils", + "Poppler", + ) + else: + raise RuntimeError( + "Please install Poppler manually from https://poppler.freedesktop.org/" + ) + + +def check_and_install_tesseract(): + """Check for the installation of Tesseract and install if not present.""" + try: + run_command("tesseract --version") + except RuntimeError: + if platform.system() == "Darwin": # macOS + install_package("brew install tesseract", "Tesseract") + elif platform.system() == "Linux": # Linux + install_package( + "sudo apt-get update && sudo apt-get install -y tesseract-ocr", + "Tesseract", + ) + elif platform.system() == "Windows": # Windows + print( + "Please download and install Tesseract from the official GitHub repository: https://github.com/UB-Mannheim/tesseract/wiki" + ) + print( + "Make sure to add the Tesseract installation path to your system's PATH environment variable." 
+ ) + else: + raise RuntimeError( + "Please install Tesseract manually from the official website." + ) + + def check_and_install(): try: - # Check and install Poppler - try: - run_command("pdftoppm -h") - except RuntimeError: - if platform.system() == "Darwin": # macOS - install_package("brew install poppler", "Poppler") - elif platform.system() == "Linux": # Linux - install_package( - "sudo apt-get update && sudo apt-get install -y poppler-utils", - "Poppler", - ) - else: - raise RuntimeError( - "Please install Poppler manually from https://poppler.freedesktop.org/" - ) + check_and_install_poppler() + check_and_install_tesseract() except RuntimeError as err: print(f"Error during installation: {err}", file=sys.stderr) diff --git a/pyproject.toml b/pyproject.toml index b34a194..a2280e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,11 @@ pdf2image = "^1.17.0" litellm = "^1.44.15" aioshutil = "^1.5" pypdf2 = "^3.0.1" +pytesseract = "^0.3.13" +pillow = "^10.4.0" +python-Levenshtein = "^0.25.1" +Markdown = "^3.7" +beautifulsoup4 = "^4.12.3" [tool.poetry.scripts] pre-install = "py_zerox.scripts.pre_install:check_and_install" diff --git a/setup.cfg b/setup.cfg index f8bbc0f..20c6776 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ version = 0.0.5 description = ocr documents using vision models from all popular providers like OpenAI, Azure OpenAI, Anthropic, AWS Bedrock etc long_description = file: README.md long_description_content_type = text/markdown -author = wizenheimer, pradhyumna85 +author = wizenheimer, pradhyumna85, getwithashish license = MIT license_file = LICENSE classifiers = @@ -24,6 +24,11 @@ install_requires = litellm>=1.44.15 aioshutil>=1.5 PyPDF2>=3.0.1 + pytesseract>=0.3.13 + pillow>=10.4.0 + python-Levenshtein>=0.25.1 + Markdown>=3.7 + beautifulsoup4>=4.12.3 [options.packages.find] where = py_zerox.pyzerox