getomni-ai · getwithashish · Sep 20, 2024 · Sep 20, 2024 · Sep 20, 2024 · Sep 20, 2024
diff --git a/README.md b/README.md
@@ -143,7 +143,7 @@ Request #3 => page_2_markdown + page_3_image
 
 ### Installation:
 
-- Install **poppler-utils** on the system, it should be available in path variable
+- Install **poppler-utils** and **tesseract** on the system; both should be available on the system `PATH`
 - Install py-zerox:
 ```sh
 pip install py-zerox
@@ -243,6 +243,7 @@ async def zerox(
     concurrency: int = 10,
     file_path: Optional[str] = "",
     maintain_format: bool = False,
+    bounding_box: bool = False,
     model: str = "gpt-4o-mini",
     output_dir: Optional[str] = None,
     temp_dir: Optional[str] = None,
@@ -263,6 +264,8 @@ Parameters
   The path to the PDF file to process. Defaults to an empty string.
 - **maintain_format** (bool, optional):
   Whether to maintain the format from the previous page. Defaults to False.
+- **bounding_box** (bool, optional):
+  Whether to return the normalized bounding box of each section identified on the page. Defaults to False.
 - **model** (str, optional):
   The model to use for generating completions. Defaults to "gpt-4o-mini".
   Refer to LiteLLM Providers for the correct model name, as it may differ depending on the provider.
@@ -332,6 +335,36 @@ ZeroxOutput(
                     'boolean type, `false` is the default, and the default char value is `\\0`, the null-terminating ' +
                     'character (zero in the ASCII table).',
             content_length=2333,
+            sections=[
+              Section(
+                content='| Type    | Description                               | Wrapper Class |\n|---------|-------------------------------------------|---------------|\n| byte    | 8-bit signed 2s complement integer        | Byte          |\n| short   | 16-bit signed 2s complement integer       | Short         |\n| int     | 32-bit signed 2s complement integer       | Integer       |\n| long    | 64-bit signed 2s complement integer       | Long          |\n| float   | 32-bit IEEE 754 floating point number     | Float         |\n| double  | 64-bit floating point number               | Double        |\n| boolean | may be set to `true` or `false`          | Boolean       |\n| char    | 16-bit Unicode (UTF-16) character        | Character     |\n\n**Table 26.2.: Primitive types in Java**  ',
+                bounding_box=(0.16198125836680052, 0.07765151515151515, 0.5863453815261044, 0.19602272727272727)
+              ),
+              Section(
+                content='\n\n### 26.3.1. Declaration & Assignment  ',
+                bounding_box=(0.08433734939759036, 0.30303030303030304, 0.37751004016064255, 0.01231060606060606)
+              ),
+              Section(
+                content='\n\nJava is a statically typed language meaning that all variables must be declared before you can use them or refer to them. In addition, when declaring a variable, you must specify both its type and its identifier. For example:\n\n```java\nint numUnits;  \ndouble costPerUnit;  \nchar firstInitial;  \nboolean isStudent;  \n```  ',
+                bounding_box=(0.08299866131191433, 0.3446969696969697, 0.749665327978581, 0.13541666666666666)
+              ),
+              Section(
+                  content="\n\nEach declaration specifies the variable's type followed by the identifier and ending with a semicolon. The identifier rules are fairly standard: a name can consist of lowercase and uppercase alphabetic characters, numbers, and underscores but may not begin with a numeric character. We adopt the modern camelCasing naming convention for variables in our code. In general, variables **must** be assigned a value before you can use them in an expression. You do not have to immediately assign a value when you declare them (though it is good practice), but some value must be assigned before they can be used or the compiler will issue an error.²  ",
+                  bounding_box=(0.08299866131191433, 0.5501893939393939, 0.751004016064257, 0.1571969696969697)
+              ),
+              Section(
+                content='\n\nThe assignment operator is a single equal sign, `=` and is a right-to-left assignment. That is, the variable that we wish to assign the value to appears on the left-hand-side while the value (literal, variable or expression) is on the right-hand-side. Using our variables from before, we can assign them values:  ',
+                bounding_box=(0.08299866131191433, 0.6534090909090909, 0.7483266398929049, 0.10795454545454546)
+              ),
+              Section(
+                content='\n\n²Instance variables, that is variables declared as part of an object do have default values. For objects, the default is `null`, for all numeric types, zero is the default value. For the `boolean` type, `false` is the default, and the default `char` value is `\\0`, the null-terminating character (zero in the ASCII table).  ',
+                bounding_box=(0.08299866131191433, 0.6998106060606061, 0.749665327978581, 0.13825757575757575)
+              ),
+              Section(
+                content='',
+                bounding_box=(0.7054886211512718, 0.048295454545454544, 0.03480589022757698, 0.00946969696969697)
+              )
+            ],
             page=1
         )
     ]

diff --git a/py_zerox/pyzerox/constants/messages.py b/py_zerox/pyzerox/constants/messages.py
@@ -54,3 +54,11 @@ class Messages:
     FAILED_TO_SAVE_FILE = """Failed to save file to local drive"""
 
     FAILED_TO_PROCESS_IMAGE = """Failed to process image"""
+
+    FAILED_TO_PERFORM_OCR = """
+    Failed to perform OCR on image: {0}
+    """
+
+    FAILED_TO_FIND_BOUNDING_BOX = """
+    Failed to find bounding box for the section: {0}
+    """
diff --git a/py_zerox/pyzerox/constants/patterns.py b/py_zerox/pyzerox/constants/patterns.py
@@ -4,3 +4,9 @@ class Patterns:
     MATCH_MARKDOWN_BLOCKS = r"^```[a-z]*\n([\s\S]*?)\n```$"
 
     MATCH_CODE_BLOCKS = r"^```\n([\s\S]*?)\n```$"
+
+
+class MarkdownConstants:
+    """A class to hold constants related to Markdown formatting."""
+
+    SECTION_DELIMITER = "<!-- section -->"
diff --git a/py_zerox/pyzerox/constants/prompts.py b/py_zerox/pyzerox/constants/prompts.py
@@ -5,4 +5,9 @@ class Prompts:
     Convert the following PDF page to markdown.
     Return only the markdown with no explanation text.
     Do not exclude any content from the page.
-    """
+    """
+
+    SEGMENT_MARKDOWN_SYSTEM_PROMPT = """
+    For each section (eg: headings, tables, footers, etc.), add a comment "section" at the end of that section in markdown.
+    Ensure as much content as possible is formatted using markdown where applicable.
+    """
diff --git a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Any, Union, Iterable
+from typing import List, Optional, Dict, Any, Tuple, Union, Iterable
 from dataclasses import dataclass, field
 
 
@@ -12,13 +12,25 @@ class ZeroxArgs:
     cleanup: bool = True
     concurrency: int = 10
     maintain_format: bool = False
+    bounding_box: bool = False
     model: str = "gpt-4o-mini",
     output_dir: Optional[str] = None
     temp_dir: Optional[str] = None
     custom_system_prompt: Optional[str] = None
     select_pages: Optional[Union[int, Iterable[int]]] = None
     kwargs: Dict[str, Any] = field(default_factory=dict)
 
+
+@dataclass
+class Section:
+    """
+    Dataclass to represent a section of content within a page.
+    """
+
+    content: str
+    bounding_box: Tuple[float, float, float, float]
+
+
 @dataclass
 class Page:
     """
@@ -27,6 +39,7 @@ class Page:
 
     content: str
     content_length: int
+    sections: List[Section]
     page: int
 
 

diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py
@@ -19,14 +19,15 @@
 from ..errors import FileUnavailable
 from ..constants.messages import Messages
 from ..models import litellmmodel
-from .types import Page, ZeroxOutput
+from .types import Page, Section, ZeroxOutput
 
 
 async def zerox(
     cleanup: bool = True,
     concurrency: int = 10,
     file_path: Optional[str] = "",
     maintain_format: bool = False,
+    bounding_box: bool = False,
     model: str = "gpt-4o-mini",
     output_dir: Optional[str] = None,
     temp_dir: Optional[str] = None,
@@ -46,6 +47,8 @@ async def zerox(
     :type file_path: str, optional
     :param maintain_format: Whether to maintain the format from the previous page, defaults to False
     :type maintain_format: bool, optional
+    :param bounding_box: Whether to include bounding box information for each identified section in the output, defaults to False
+    :type bounding_box: bool, optional
     :param model: The model to use for generating completions, defaults to "gpt-4o-mini". Note - Refer: https://docs.litellm.ai/docs/providers to pass correct model name as according to provider it might be different from actual name.
     :type model: str, optional
     :param output_dir: The directory to save the markdown output, defaults to None
@@ -61,17 +64,17 @@ async def zerox(
     :return: The markdown content generated by the model.
     """
 
-
     input_token_count = 0
     output_token_count = 0
     prior_page = ""
     aggregated_markdown: List[str] = []
+    sections_list: List[List[Section]] = []
     start_time = datetime.now()
-    
+
     # File Path Validators
     if not file_path:
         raise FileUnavailable()
-    
+
     # Create an instance of the litellm model interface
     vision_model = litellmmodel(model=model,**kwargs)
 
@@ -84,11 +87,12 @@ async def zerox(
         warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING)
 
     # If select_pages is a single integer, convert it to a list for consistency
-    if isinstance(select_pages, int):
-        select_pages = [select_pages]
-
-    # Sort the pages to maintain consistency
-    select_pages = sorted(select_pages)
+    if select_pages:
+        if isinstance(select_pages, int):
+            select_pages = [select_pages]
+        else:
+            # Sort the pages to maintain consistency
+            select_pages = sorted(list(select_pages))
 
     # Ensure the output directory exists
     if output_dir:
@@ -100,7 +104,6 @@ async def zerox(
             await async_shutil.rmtree(temp_dir)
         await async_os.makedirs(temp_dir, exist_ok=True)
 
-
     # Create a temporary directory to store the PDF and images
     with tempfile.TemporaryDirectory() as temp_dir_:
 
@@ -115,10 +118,10 @@ async def zerox(
         local_path = await download_file(file_path=file_path, temp_dir=temp_directory)
         if not local_path:
             raise FileUnavailable()
-        
+
         raw_file_name = os.path.splitext(os.path.basename(local_path))[0]
         file_name = "".join(c.lower() if c.isalnum() else "_" for c in raw_file_name)
-        
+
         # create a subset pdf in temp dir with only the requested pages if select_pages is provided
         if select_pages is not None:
             subset_pdf_create_kwargs = {"original_pdf_path":local_path, "select_pages":select_pages, 
@@ -131,29 +134,37 @@ async def zerox(
 
         if maintain_format:
             for image in images:
-                result, input_token_count, output_token_count, prior_page = await process_page(
-                    image,
-                    vision_model,
-                    temp_directory,
-                    input_token_count,
-                    output_token_count,
-                    prior_page,
+                result, input_token_count, output_token_count, prior_page, sections = (
+                    await process_page(
+                        image,
+                        vision_model,
+                        temp_directory,
+                        bounding_box,
+                        input_token_count,
+                        output_token_count,
+                        prior_page,
+                    )
                 )
 
                 if result:
                     aggregated_markdown.append(result)
+                    sections_list.append(sections)
         else:
             results = await process_pages_in_batches(
                 images,
                 concurrency,
                 vision_model,
                 temp_directory,
+                bounding_box,
                 input_token_count,
                 output_token_count,
                 prior_page,
             )
 
-            aggregated_markdown = [result[0] for result in results if isinstance(result[0], str)]
+            for result in results:
+                if isinstance(result[0], str):
+                    aggregated_markdown.append(result[0])
+                    sections_list.append(result[-1])
 
             ## add token usage
             input_token_count += sum([result[1] for result in results])
@@ -177,20 +188,30 @@ async def zerox(
         if select_pages is not None:
             # Map aggregated markdown to the selected pages
             formatted_pages = [
-                        Page(content=content, page=select_pages[i], content_length=len(content))
-                        for i, content in enumerate(aggregated_markdown)
-                    ]
+                Page(
+                    content=content,
+                    page=select_pages[i],
+                    content_length=len(content),
+                    sections=sections_list[i],
+                )
+                for i, content in enumerate(aggregated_markdown)
+            ]
         else:
             # Default behavior when no select_pages is provided
             formatted_pages = [
-                        Page(content=content, page=i + 1, content_length=len(content))
-                        for i, content in enumerate(aggregated_markdown)
-                    ]
+                Page(
+                    content=content,
+                    page=i + 1,
+                    content_length=len(content),
+                    sections=sections_list[i],
+                )
+                for i, content in enumerate(aggregated_markdown)
+            ]
 
         return ZeroxOutput(
             completion_time=completion_time,
             file_name=file_name,
             input_tokens=input_token_count,
             output_tokens=output_token_count,
             pages=formatted_pages,
-        )
+        )
diff --git a/py_zerox/pyzerox/models/modellitellm.py b/py_zerox/pyzerox/models/modellitellm.py
@@ -13,6 +13,7 @@
 from ..processor.image import encode_image_to_base64
 
 DEFAULT_SYSTEM_PROMPT = Prompts.DEFAULT_SYSTEM_PROMPT
+SEGMENT_MARKDOWN_SYSTEM_PROMPT = Prompts.SEGMENT_MARKDOWN_SYSTEM_PROMPT
 
 
 class litellmmodel(BaseModel):
@@ -69,12 +70,12 @@ def validate_access(self) -> None:
         """Validates access to the model -> if environment variables are set correctly with correct values."""
         if not litellm.check_valid_key(model=self.model,api_key=None):
             raise ModelAccessError(extra_info={"model": self.model})
-
 
     async def completion(
         self,
         image_path: str,
         maintain_format: bool,
+        bounding_box: bool,
         prior_page: str,
     ) -> CompletionResponse:
         """LitellM completion for image to markdown conversion.
@@ -91,6 +92,7 @@ async def completion(
         messages = await self._prepare_messages(
             image_path=image_path,
             maintain_format=maintain_format,
+            bounding_box=bounding_box,
             prior_page=prior_page,
         )
 
@@ -112,6 +114,7 @@ async def _prepare_messages(
         self,
         image_path: str,
         maintain_format: bool,
+        bounding_box: bool,
         prior_page: str,
     ) -> List[Dict[str, Any]]:
         """Prepares the messages to send to the LiteLLM Completion API.
@@ -131,6 +134,14 @@ async def _prepare_messages(
             },
         ]
 
+        if bounding_box:
+            messages.append(
+                {
+                    "role": "system",
+                    "content": SEGMENT_MARKDOWN_SYSTEM_PROMPT
+                }
+            )
+
         # If content has already been generated, add it to context.
         # This helps maintain the same format across pages.
         if maintain_format and prior_page: