From db381b5e7d421c0cf0df659eb7b5699fe3d53051 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:35:07 +0530
Subject: [PATCH 01/24] feat: Add prompt for getting segmented markdown

---
 py_zerox/pyzerox/constants/prompts.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/py_zerox/pyzerox/constants/prompts.py b/py_zerox/pyzerox/constants/prompts.py
index 1b00f61..d3c9c63 100644
--- a/py_zerox/pyzerox/constants/prompts.py
+++ b/py_zerox/pyzerox/constants/prompts.py
@@ -5,4 +5,9 @@ class Prompts:
     Convert the following PDF page to markdown.
     Return only the markdown with no explanation text.
     Do not exclude any content from the page.
-    """
\ No newline at end of file
+    """
+
+    BOUNDING_BOX_SYSTEM_PROMPT = """
+    For each section (eg: headings, tables, footers, etc.), add a comment "section" at the end of that section in markdown.
+    Ensure as much content as possible is formatted using markdown where applicable.
+    """

From b27294cc1fb78d781eac348c0b7846e9ae303eac Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:38:43 +0530
Subject: [PATCH 02/24] refactor: Rename the prompt for segmenting markdown

---
 py_zerox/pyzerox/constants/prompts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py_zerox/pyzerox/constants/prompts.py b/py_zerox/pyzerox/constants/prompts.py
index d3c9c63..901bae8 100644
--- a/py_zerox/pyzerox/constants/prompts.py
+++ b/py_zerox/pyzerox/constants/prompts.py
@@ -7,7 +7,7 @@ class Prompts:
     Do not exclude any content from the page.
     """
 
-    BOUNDING_BOX_SYSTEM_PROMPT = """
+    SEGMENT_MARKDOWN_SYSTEM_PROMPT = """
     For each section (eg: headings, tables, footers, etc.), add a comment "section" at the end of that section in markdown.
     Ensure as much content as possible is formatted using markdown where applicable.
     """

From 2a0e133cc37e5372c23ba4f0348fb1b3465c2d2f Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:39:36 +0530
Subject: [PATCH 03/24] feat: Append system prompt for getting segmented
 markdown, if bounding_box param is true

---
 py_zerox/pyzerox/models/modellitellm.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/py_zerox/pyzerox/models/modellitellm.py b/py_zerox/pyzerox/models/modellitellm.py
index bda4828..b236896 100644
--- a/py_zerox/pyzerox/models/modellitellm.py
+++ b/py_zerox/pyzerox/models/modellitellm.py
@@ -13,6 +13,7 @@
 from ..processor.image import encode_image_to_base64
 
 DEFAULT_SYSTEM_PROMPT = Prompts.DEFAULT_SYSTEM_PROMPT
+SEGMENT_MARKDOWN_SYSTEM_PROMPT = Prompts.SEGMENT_MARKDOWN_SYSTEM_PROMPT
 
 
 class litellmmodel(BaseModel):
@@ -69,12 +70,12 @@ def validate_access(self) -> None:
         """Validates access to the model -> if environment variables are set correctly with correct values."""
         if not litellm.check_valid_key(model=self.model,api_key=None):
             raise ModelAccessError(extra_info={"model": self.model})
-        
 
     async def completion(
         self,
         image_path: str,
         maintain_format: bool,
+        bounding_box: bool,
         prior_page: str,
     ) -> CompletionResponse:
         """LitellM completion for image to markdown conversion.
@@ -91,6 +92,7 @@ async def completion(
         messages = await self._prepare_messages(
             image_path=image_path,
             maintain_format=maintain_format,
+            bounding_box=bounding_box,
             prior_page=prior_page,
         )
 
@@ -112,6 +114,7 @@ async def _prepare_messages(
         self,
         image_path: str,
         maintain_format: bool,
+        bounding_box: bool,
         prior_page: str,
     ) -> List[Dict[str, Any]]:
         """Prepares the messages to send to the LiteLLM Completion API.
@@ -131,6 +134,14 @@ async def _prepare_messages(
             },
         ]
 
+        if bounding_box:
+            messages.append(
+                {
+                    "role": "system",
+                    "content": SEGMENT_MARKDOWN_SYSTEM_PROMPT
+                }
+            )
+
         # If content has already been generated, add it to context.
         # This helps maintain the same format across pages.
         if maintain_format and prior_page:

From 1916b1681cec689f079f02ee027575c46e0c71ac Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:42:16 +0530
Subject: [PATCH 04/24] feat: Clean OCR data from image

---
 py_zerox/pyzerox/processor/ocr.py | 62 +++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 py_zerox/pyzerox/processor/ocr.py

diff --git a/py_zerox/pyzerox/processor/ocr.py b/py_zerox/pyzerox/processor/ocr.py
new file mode 100644
index 0000000..a3b797d
--- /dev/null
+++ b/py_zerox/pyzerox/processor/ocr.py
@@ -0,0 +1,62 @@
+from typing import Dict
+from PIL import Image
+import pytesseract
+
+from py_zerox.pyzerox.constants.messages import Messages
+
+
+def enhance_image_for_ocr(image: Image) -> Image:
+    """
+    Enhances the given image for Optical Character Recognition.
+    Converts the image to grayscale.
+
+    Args:
+        image (Image): The input image to be enhanced.
+
+    Returns:
+        Image: The enhanced grayscale image ready for OCR processing.
+    """
+    image = image.convert("L")
+    return image
+
+
+async def _clean_ocr_text(data: Dict[str, list]) -> Dict[str, list]:
+    """
+    Processes the input data dictionary containing OCR results,
+    filtering out entries with low confidence scores or empty text.
+
+    Args:
+        data (dict): A dictionary containing OCR results:
+            - 'text': A list of recognized text strings.
+            - 'conf': A list of confidence scores corresponding to each text.
+            - 'left': A list of x-coordinates for the text bounding boxes.
+            - 'top': A list of y-coordinates for the text bounding boxes.
+            - 'width': A list of widths for the text bounding boxes.
+            - 'height': A list of heights for the text bounding boxes.
+
+    Returns:
+        dict: A dictionary containing filtered lists of text and attributes:
+            - 'text_list': A list of valid text strings.
+            - 'left_list': A list of x-coordinates for the text bounding boxes.
+            - 'top_list': A list of y-coordinates for the text bounding boxes.
+            - 'width_list': A list of widths for the text bounding boxes.
+            - 'height_list': A list of heights for the text bounding boxes.
+    """
+    data_lists = {
+        "text_list": [],
+        "left_list": [],
+        "top_list": [],
+        "width_list": [],
+        "height_list": [],
+    }
+
+    for i in range(len(data["text"])):
+        if int(data["conf"][i]) > 0 and data["text"][i].strip():
+            data_lists["text_list"].append(data["text"][i])
+            data_lists["left_list"].append(data["left"][i])
+            data_lists["top_list"].append(data["top"][i])
+            data_lists["width_list"].append(data["width"][i])
+            data_lists["height_list"].append(data["height"][i])
+
+    return data_lists
+

From 6fe805a934579dba44f8105f7c218f5ddff4f873 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:42:57 +0530
Subject: [PATCH 05/24] feat: Get OCR data from the image

---
 py_zerox/pyzerox/processor/ocr.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/py_zerox/pyzerox/processor/ocr.py b/py_zerox/pyzerox/processor/ocr.py
index a3b797d..f5b6257 100644
--- a/py_zerox/pyzerox/processor/ocr.py
+++ b/py_zerox/pyzerox/processor/ocr.py
@@ -60,3 +60,22 @@ async def _clean_ocr_text(data: Dict[str, list]) -> Dict[str, list]:
 
     return data_lists
 
+
+async def perform_image_ocr(image_path: str) -> Dict[str, list]:
+    """
+    Perform OCR on the specified image.
+
+    Args:
+        image_path (str): The file path to the image.
+
+    Returns:
+        A dictionary containing the cleaned OCR text data and attributes.
+    """
+    try:
+        image = Image.open(image_path)
+        image = enhance_image_for_ocr(image=image)
+        data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
+        cleaned_data = await _clean_ocr_text(data=data)
+        return cleaned_data
+    except Exception as err:
+        raise Exception(Messages.FAILED_TO_PERFORM_OCR.format(err))

From 75e9230bc90fe9e0c52d8f0f9d9be8d077bf123c Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:45:32 +0530
Subject: [PATCH 06/24] feat: Find matching substring from OCR data using
 Levenshtein distance

---
 py_zerox/pyzerox/processor/bounding_box.py | 41 ++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 py_zerox/pyzerox/processor/bounding_box.py

diff --git a/py_zerox/pyzerox/processor/bounding_box.py b/py_zerox/pyzerox/processor/bounding_box.py
new file mode 100644
index 0000000..22fea0b
--- /dev/null
+++ b/py_zerox/pyzerox/processor/bounding_box.py
@@ -0,0 +1,41 @@
+from typing import Dict, List, Tuple
+
+import Levenshtein
+
+from py_zerox.pyzerox.constants.messages import Messages
+
+
+async def find_substring_with_minimum_edit_distance(
+    content_string: str, pattern: str
+) -> Tuple[str, int]:
+    """
+    Find the substring within the given content string
+    that has the minimum edit distance to the specified pattern.
+
+    Args:
+        content_string (str): The string in which to search for the substring.
+        pattern (str): The pattern to compare against.
+
+    Returns:
+        Tuple[str, int]: A tuple containing the best matching substring
+                         and its starting index in the content string.
+                         If no substring is found, the starting index
+                         will be -1.
+    """
+    content_length = len(content_string)
+    pattern_length = len(pattern)
+    min_distance = float("inf")
+    best_substring: str = ""
+    best_substring_start_index: int = -1
+
+    for i in range(content_length - pattern_length + 1):
+        substring = content_string[i : i + pattern_length]
+        distance = Levenshtein.distance(substring, pattern)
+        if distance < min_distance:
+            min_distance = distance
+            best_substring = substring
+            best_substring_start_index = i
+
+    return best_substring, best_substring_start_index
+
+

From a965cd1b3ced36afd2391bbaaff405038add4514 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:48:26 +0530
Subject: [PATCH 07/24] feat: Calculate bounding box enclosing the matched
 substring

---
 py_zerox/pyzerox/processor/bounding_box.py | 127 +++++++++++++++++++++
 1 file changed, 127 insertions(+)

diff --git a/py_zerox/pyzerox/processor/bounding_box.py b/py_zerox/pyzerox/processor/bounding_box.py
index 22fea0b..f06c156 100644
--- a/py_zerox/pyzerox/processor/bounding_box.py
+++ b/py_zerox/pyzerox/processor/bounding_box.py
@@ -39,3 +39,130 @@ async def find_substring_with_minimum_edit_distance(
     return best_substring, best_substring_start_index
 
 
+async def find_substring_indices_from_ocr_data(
+    content_list: List[str], substring: str, substring_start_index: int
+) -> Tuple[int, int]:
+    """
+    Find the indices of the first and last strings in a list
+    of strings that contain a specified substring starting from a given index.
+
+    Args:
+        content_list (List[str]): A list of strings to search through.
+        substring (str): The substring to find within the content list.
+        substring_start_index (int): The starting index in the combined string
+                                      representation of the content list from
+                                      which to search for the substring.
+
+    Returns:
+        Tuple[int, int]: A tuple containing the index of the first string
+                         and the index of the last string that contains
+                         the substring. Returns (-1, -1) if the substring
+                         is not found within the specified range.
+    """
+    substring_length = len(substring)
+    current_length: int = 0
+    first_string_index: int = -1
+    last_string_index: int = -1
+
+    for index, string in enumerate(content_list):
+        string_length = len(string) + 1
+        if current_length <= substring_start_index + 1 < current_length + string_length:
+            first_string_index = index
+        if (
+            current_length
+            <= substring_start_index + substring_length - 1
+            < current_length + string_length
+        ):
+            last_string_index = index
+        current_length += string_length
+
+    return first_string_index, last_string_index
+
+
+async def calculate_bounding_box(
+    ocr_data: Dict[str, list], first_string_index: int, last_string_index: int
+) -> Tuple[float, float, float, float]:
+    """
+    Calculate the bounding box coordinates that encompasses a set of strings based on OCR data.
+
+    Args:
+        ocr_data (Dict[str, list]): A dictionary containing lists of
+            'left_list', 'top_list', 'width_list', and 'height_list'
+            representing the OCR data for the strings.
+        first_string_index (int): The index of the first string to consider.
+        last_string_index (int): The index of the last string to consider.
+
+    Returns:
+        Tuple[float, float, float, float]: A tuple containing the coordinates
+            of the bounding box in the format (left, top, width, height).
+    """
+    leftmost_string_x = ocr_data["left_list"][first_string_index]
+    rightmost_string_x = 0
+    topmost_string_y = ocr_data["top_list"][first_string_index]
+    bottommost_string_y = 0
+    rightmost_string_width = ocr_data["width_list"][first_string_index]
+    bottommost_string_height = ocr_data["height_list"][first_string_index]
+
+    for i in range(first_string_index + 1, last_string_index + 1):
+        if ocr_data["left_list"][i] < leftmost_string_x:
+            leftmost_string_x = ocr_data["left_list"][i]
+        if ocr_data["top_list"][i] < topmost_string_y:
+            topmost_string_y = ocr_data["top_list"][i]
+        if ocr_data["left_list"][i] > rightmost_string_x:
+            rightmost_string_x = ocr_data["left_list"][i]
+            rightmost_string_width = ocr_data["width_list"][i]
+        if ocr_data["top_list"][i] > bottommost_string_y:
+            bottommost_string_y = ocr_data["top_list"][i]
+            bottommost_string_height = ocr_data["height_list"][i]
+
+    width = 0
+    height = 0
+    if rightmost_string_x > leftmost_string_x:
+        width = rightmost_string_x - leftmost_string_x
+    if bottommost_string_y > topmost_string_y:
+        height = bottommost_string_y - topmost_string_y
+    max_width = width + rightmost_string_width
+    max_height = height + bottommost_string_height
+
+    return leftmost_string_x, topmost_string_y, max_width, max_height
+
+
+async def find_bounding_box(
+    ocr_data: Dict[str, list], string_to_compare: str
+) -> Tuple[float, float, float, float]:
+    """
+    Find the bounding box coordinates for a given string within the OCR data.
+
+    Args:
+        ocr_data (Dict[str, list]): A dictionary containing OCR data.
+        string_to_compare (str): The string for which the bounding box needs to be found.
+
+    Returns:
+        List[float]: A list containing the bounding box coordinates in the
+            format [left, top, width, height].
+    """
+    try:
+        text_content = " ".join(ocr_data["text_list"])
+        substring, substring_start_index = (
+            await find_substring_with_minimum_edit_distance(
+                content_string=text_content, pattern=string_to_compare
+            )
+        )
+        first_string_index, last_string_index = (
+            await find_substring_indices_from_ocr_data(
+                content_list=ocr_data["text_list"],
+                substring=substring,
+                substring_start_index=substring_start_index,
+            )
+        )
+        left, top, width, height = await calculate_bounding_box(
+            ocr_data=ocr_data,
+            first_string_index=first_string_index,
+            last_string_index=last_string_index,
+        )
+        return left, top, width, height
+    except Exception as err:
+        raise Exception(Messages.FAILED_TO_FIND_BOUNDING_BOX.format(err))
+
+
+# TODO Normalize the coords

From c01af47d373e0b5793375a7ee3e301882128eec1 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:49:39 +0530
Subject: [PATCH 08/24] feat: Specify the section delimiter used in the
 markdown

---
 py_zerox/pyzerox/constants/patterns.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/py_zerox/pyzerox/constants/patterns.py b/py_zerox/pyzerox/constants/patterns.py
index 6be1a77..b4506ca 100644
--- a/py_zerox/pyzerox/constants/patterns.py
+++ b/py_zerox/pyzerox/constants/patterns.py
@@ -4,3 +4,9 @@ class Patterns:
     MATCH_MARKDOWN_BLOCKS = r"^```[a-z]*\n([\s\S]*?)\n```$"
 
     MATCH_CODE_BLOCKS = r"^```\n([\s\S]*?)\n```$"
+
+
+class MarkdownConstants:
+    """A class to hold constants related to Markdown formatting."""
+
+    SECTION_DELIMITER = "<!-- section -->"

From c8dce48840dbf9cc1b8c05eb973efe8b8b799a07 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:51:26 +0530
Subject: [PATCH 09/24] feat: Add error messages for OCR and Bounding Box

---
 py_zerox/pyzerox/constants/messages.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/py_zerox/pyzerox/constants/messages.py b/py_zerox/pyzerox/constants/messages.py
index e3ca826..318678e 100644
--- a/py_zerox/pyzerox/constants/messages.py
+++ b/py_zerox/pyzerox/constants/messages.py
@@ -54,3 +54,11 @@ class Messages:
     FAILED_TO_SAVE_FILE = """Failed to save file to local drive"""
 
     FAILED_TO_PROCESS_IMAGE = """Failed to process image"""
+
+    FAILED_TO_PERFORM_OCR = """
+    Failed to perform OCR on image: {0}
+    """
+
+    FAILED_TO_FIND_BOUNDING_BOX = """
+    Failed to find bounding box for the section: {0}
+    """

From 2821f8623cbf8dd34ad1af1f00fa9bb8a3136bb5 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:54:51 +0530
Subject: [PATCH 10/24] feat: Add Section to include the various sections in
 the markdown, along with their bounding boxes

---
 py_zerox/pyzerox/core/types.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/py_zerox/pyzerox/core/types.py b/py_zerox/pyzerox/core/types.py
index ffe251d..6cdce88 100644
--- a/py_zerox/pyzerox/core/types.py
+++ b/py_zerox/pyzerox/core/types.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Any, Union, Iterable
+from typing import List, Optional, Dict, Any, Tuple, Union, Iterable
 from dataclasses import dataclass, field
 
 
@@ -12,6 +12,7 @@ class ZeroxArgs:
     cleanup: bool = True
     concurrency: int = 10
     maintain_format: bool = False
+    bounding_box: bool = False
     model: str = "gpt-4o-mini",
     output_dir: Optional[str] = None
     temp_dir: Optional[str] = None
@@ -19,6 +20,17 @@ class ZeroxArgs:
     select_pages: Optional[Union[int, Iterable[int]]] = None
     kwargs: Dict[str, Any] = field(default_factory=dict)
 
+
+@dataclass
+class Section:
+    """
+    Dataclass to represent a section of content within a page.
+    """
+
+    content: str
+    bounding_box: Tuple[float, float, float, float]
+
+
 @dataclass
 class Page:
     """
@@ -27,6 +39,7 @@ class Page:
 
     content: str
     content_length: int
+    sections: List[Section]
     page: int
 
 

From 57fdcc617eb3065bbc30d3f5e086e76c57425f26 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:56:04 +0530
Subject: [PATCH 11/24] feat: Functionality to remove markdown format from the
 specified text

---
 py_zerox/pyzerox/processor/text.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/py_zerox/pyzerox/processor/text.py b/py_zerox/pyzerox/processor/text.py
index 9b8fbe5..8927eb1 100644
--- a/py_zerox/pyzerox/processor/text.py
+++ b/py_zerox/pyzerox/processor/text.py
@@ -1,4 +1,6 @@
 import re
+from bs4 import BeautifulSoup
+import markdown
 
 # Package imports
 from ..constants.patterns import Patterns
@@ -10,3 +12,27 @@ def format_markdown(text: str) -> str:
     formatted_markdown = re.sub(Patterns.MATCH_MARKDOWN_BLOCKS, r"\1", text)
     formatted_markdown = re.sub(Patterns.MATCH_CODE_BLOCKS, r"\1", formatted_markdown)
     return formatted_markdown
+
+
+def remove_markdown(content: str) -> str:
+    """
+    Converts a Markdown formatted string to plain text.
+
+    Args:
+        content (str): A string containing Markdown formatted text.
+
+    Returns:
+        str: A plain text representation of the input Markdown content.
+    """
+    html = markdown.markdown(content)
+
+    parsed_html = BeautifulSoup(html, "html.parser")
+    content_text = parsed_html.get_text()
+
+    content_text = re.sub(r"-+", "", content_text)
+    content_text = re.sub(r"\|", "", content_text)
+    content_text = re.sub(r"\n+", "\n", content_text)
+    content_text = re.sub(r"\s+", " ", content_text)
+    content_text = content_text.strip()
+
+    return content_text

From 4944d07701d60bbd28f6595fd586d02bc869d7eb Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 12:58:46 +0530
Subject: [PATCH 12/24] feat: Perform OCR and find the bounding box for each
 section, if bounding_box param is set to True

---
 py_zerox/pyzerox/processor/pdf.py | 43 +++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py
index c3b3fa6..efa12e8 100644
--- a/py_zerox/pyzerox/processor/pdf.py
+++ b/py_zerox/pyzerox/processor/pdf.py
@@ -4,9 +4,14 @@
 from typing import List, Optional, Tuple
 from pdf2image import convert_from_path
 
+from py_zerox.pyzerox.constants.patterns import MarkdownConstants
+from py_zerox.pyzerox.core.types import Section
+from py_zerox.pyzerox.processor.bounding_box import find_bounding_box
+from py_zerox.pyzerox.processor.ocr import perform_image_ocr
+
 # Package Imports
 from .image import save_image
-from .text import format_markdown
+from .text import format_markdown, remove_markdown
 from ..constants import PDFConversionDefaultOptions, Messages
 from ..models import litellmmodel
 
@@ -37,6 +42,7 @@ async def process_page(
     image: str,
     model: litellmmodel,
     temp_directory: str = "",
+    bounding_box: bool = False,
     input_token_count: int = 0,
     output_token_count: int = 0,
     prior_page: str = "",
@@ -44,6 +50,9 @@ async def process_page(
 ) -> Tuple[str, int, int, str]:
     """Process a single page of a PDF"""
 
+    markdown_sections: List[str] = []
+    sections: Optional[List[Section]] = None
+
     # If semaphore is provided, acquire it before processing the page
     if semaphore:
         async with semaphore:
@@ -51,6 +60,7 @@ async def process_page(
                 image,
                 model,
                 temp_directory,
+                bounding_box,
                 input_token_count,
                 output_token_count,
                 prior_page,
@@ -63,15 +73,42 @@ async def process_page(
         completion = await model.completion(
             image_path=image_path,
             maintain_format=True,
+            bounding_box=bounding_box,
             prior_page=prior_page,
         )
 
         formatted_markdown = format_markdown(completion.content)
+
+        if bounding_box:
+            sections = []
+            ocr_data = await perform_image_ocr(image_path=image_path)
+
+            markdown_sections = formatted_markdown.split(
+                MarkdownConstants.SECTION_DELIMITER
+            )
+            for markdown_section in markdown_sections:
+                text_section = remove_markdown(markdown_section)
+                bounding_box_coords = await find_bounding_box(
+                    ocr_data=ocr_data, string_to_compare=text_section
+                )
+                section = Section(
+                    content=markdown_section, bounding_box=bounding_box_coords
+                )
+                sections.append(section)
+
+            formatted_markdown = "".join(markdown_sections)
+
         input_token_count += completion.input_tokens
         output_token_count += completion.output_tokens
         prior_page = formatted_markdown
 
-        return formatted_markdown, input_token_count, output_token_count, prior_page
+        return (
+            formatted_markdown,
+            input_token_count,
+            output_token_count,
+            prior_page,
+            sections,
+        )
 
     except Exception as error:
         logging.error(f"{Messages.FAILED_TO_PROCESS_IMAGE} Error:{error}")
@@ -83,6 +120,7 @@ async def process_pages_in_batches(
     concurrency: int,
     model: litellmmodel,
     temp_directory: str = "",
+    bounding_box: bool = False,
     input_token_count: int = 0,
     output_token_count: int = 0,
     prior_page: str = "",
@@ -96,6 +134,7 @@ async def process_pages_in_batches(
             image,
             model,
             temp_directory,
+            bounding_box,
             input_token_count,
             output_token_count,
             prior_page,

From edc09efe85c4a12240609b7bbe9ab4ffd27b119c Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 13:00:54 +0530
Subject: [PATCH 13/24] feat: Add sections to the Page, if the bounding_box
 param is set to True

---
 py_zerox/pyzerox/core/zerox.py | 75 ++++++++++++++++++++++------------
 1 file changed, 48 insertions(+), 27 deletions(-)

diff --git a/py_zerox/pyzerox/core/zerox.py b/py_zerox/pyzerox/core/zerox.py
index e7505bc..7da1ab5 100644
--- a/py_zerox/pyzerox/core/zerox.py
+++ b/py_zerox/pyzerox/core/zerox.py
@@ -19,7 +19,7 @@
 from ..errors import FileUnavailable
 from ..constants.messages import Messages
 from ..models import litellmmodel
-from .types import Page, ZeroxOutput
+from .types import Page, Section, ZeroxOutput
 
 
 async def zerox(
@@ -27,6 +27,7 @@ async def zerox(
     concurrency: int = 10,
     file_path: Optional[str] = "",
     maintain_format: bool = False,
+    bounding_box: bool = False,
     model: str = "gpt-4o-mini",
     output_dir: Optional[str] = None,
     temp_dir: Optional[str] = None,
@@ -46,6 +47,8 @@ async def zerox(
     :type file_path: str, optional
     :param maintain_format: Whether to maintain the format from the previous page, defaults to False
     :type maintain_format: bool, optional
+    :param bounding_box: Whether to include bounding box information in the output.
+    :type bounding_box: bool, optional
     :param model: The model to use for generating completions, defaults to "gpt-4o-mini". Note - Refer: https://docs.litellm.ai/docs/providers to pass correct model name as according to provider it might be different from actual name.
     :type model: str, optional
     :param output_dir: The directory to save the markdown output, defaults to None
@@ -61,17 +64,17 @@ async def zerox(
     :return: The markdown content generated by the model.
     """
 
-
     input_token_count = 0
     output_token_count = 0
     prior_page = ""
     aggregated_markdown: List[str] = []
+    sections_list: List[List[Section]] = []
     start_time = datetime.now()
-    
+
     # File Path Validators
     if not file_path:
         raise FileUnavailable()
-    
+
     # Create an instance of the litellm model interface
     vision_model = litellmmodel(model=model,**kwargs)
 
@@ -84,11 +87,12 @@ async def zerox(
         warnings.warn(Messages.MAINTAIN_FORMAT_SELECTED_PAGES_WARNING)
 
     # If select_pages is a single integer, convert it to a list for consistency
-    if isinstance(select_pages, int):
-        select_pages = [select_pages]
-    
-    # Sort the pages to maintain consistency
-    select_pages = sorted(select_pages)
+    if select_pages:
+        if isinstance(select_pages, int):
+            select_pages = [select_pages]
+        else:
+            # Sort the pages to maintain consistency
+            select_pages = sorted(list(select_pages))
 
     # Ensure the output directory exists
     if output_dir:
@@ -100,7 +104,6 @@ async def zerox(
             await async_shutil.rmtree(temp_dir)
         await async_os.makedirs(temp_dir, exist_ok=True)
 
-
     # Create a temporary directory to store the PDF and images
     with tempfile.TemporaryDirectory() as temp_dir_:
 
@@ -115,10 +118,10 @@ async def zerox(
         local_path = await download_file(file_path=file_path, temp_dir=temp_directory)
         if not local_path:
             raise FileUnavailable()
-        
+
         raw_file_name = os.path.splitext(os.path.basename(local_path))[0]
         file_name = "".join(c.lower() if c.isalnum() else "_" for c in raw_file_name)
-        
+
         # create a subset pdf in temp dir with only the requested pages if select_pages is provided
         if select_pages is not None:
             subset_pdf_create_kwargs = {"original_pdf_path":local_path, "select_pages":select_pages, 
@@ -131,29 +134,37 @@ async def zerox(
 
         if maintain_format:
             for image in images:
-                result, input_token_count, output_token_count, prior_page = await process_page(
-                    image,
-                    vision_model,
-                    temp_directory,
-                    input_token_count,
-                    output_token_count,
-                    prior_page,
+                result, input_token_count, output_token_count, prior_page, sections = (
+                    await process_page(
+                        image,
+                        vision_model,
+                        temp_directory,
+                        bounding_box,
+                        input_token_count,
+                        output_token_count,
+                        prior_page,
+                    )
                 )
 
                 if result:
                     aggregated_markdown.append(result)
+                    sections_list.append(sections)
         else:
             results = await process_pages_in_batches(
                 images,
                 concurrency,
                 vision_model,
                 temp_directory,
+                bounding_box,
                 input_token_count,
                 output_token_count,
                 prior_page,
             )
 
-            aggregated_markdown = [result[0] for result in results if isinstance(result[0], str)]
+            for result in results:
+                if isinstance(result[0], str):
+                    aggregated_markdown.append(result[0])
+                    sections_list.append(result[-1])
 
             ## add token usage
             input_token_count += sum([result[1] for result in results])
@@ -177,15 +188,25 @@ async def zerox(
         if select_pages is not None:
             # Map aggregated markdown to the selected pages
             formatted_pages = [
-                        Page(content=content, page=select_pages[i], content_length=len(content))
-                        for i, content in enumerate(aggregated_markdown)
-                    ]
+                Page(
+                    content=content,
+                    page=select_pages[i],
+                    content_length=len(content),
+                    sections=sections_list[i],
+                )
+                for i, content in enumerate(aggregated_markdown)
+            ]
         else:
             # Default behavior when no select_pages is provided
             formatted_pages = [
-                        Page(content=content, page=i + 1, content_length=len(content))
-                        for i, content in enumerate(aggregated_markdown)
-                    ]
+                Page(
+                    content=content,
+                    page=i + 1,
+                    content_length=len(content),
+                    sections=sections_list[i],
+                )
+                for i, content in enumerate(aggregated_markdown)
+            ]
 
         return ZeroxOutput(
             completion_time=completion_time,
@@ -193,4 +214,4 @@ async def zerox(
             input_tokens=input_token_count,
             output_tokens=output_token_count,
             pages=formatted_pages,
-        )
\ No newline at end of file
+        )

From 327dc89650486d2c2c38911c0559a02091043b99 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 13:01:35 +0530
Subject: [PATCH 14/24] chore: Add dependencies in pyproject.toml

---
 pyproject.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index b34a194..a2280e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,11 @@ pdf2image = "^1.17.0"
 litellm = "^1.44.15"
 aioshutil = "^1.5"
 pypdf2 = "^3.0.1"
+pytesseract = "^0.3.13"
+pillow = "^10.4.0"
+python-Levenshtein = "^0.25.1"
+Markdown = "^3.7"
+beautifulsoup4 = "^4.12.3"
 
 [tool.poetry.scripts]
 pre-install = "py_zerox.scripts.pre_install:check_and_install"

From 93b8bab456283a7ba5ea7d6669f54700c6341137 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 18:59:59 +0530
Subject: [PATCH 15/24] docs: Update README.md with bounding_box details

---
 README.md | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/README.md b/README.md
index 6d76032..55136d8 100644
--- a/README.md
+++ b/README.md
@@ -263,6 +263,8 @@ Parameters
   The path to the PDF file to process. Defaults to an empty string.
 - **maintain_format** (bool, optional):
   Whether to maintain the format from the previous page. Defaults to False.
+- **bounding_box** (bool, optional):
+  Whether to return the bounding box of the identified sections in the page.
 - **model** (str, optional):
   The model to use for generating completions. Defaults to "gpt-4o-mini".
   Refer to LiteLLM Providers for the correct model name, as it may differ depending on the provider.
@@ -332,6 +334,32 @@ ZeroxOutput(
                     'boolean type, `false` is the default, and the default char value is `\\0`, the null-terminating ' +
                     'character (zero in the ASCII table).',
             content_length=2333,
+            sections=[
+              Section(
+                content='| Type    | Description                             | Wrapper Class |\n|---------|-----------------------------------------|---------------|\n| byte    | 8-bit signed 2s complement integer      | Byte          |\n| short   | 16-bit signed 2s complement integer     | Short         |\n| int     | 32-bit signed 2s complement integer     | Integer       |\n| long    | 64-bit signed 2s complement integer     | Long          |\n| float   | 32-bit IEEE 754 floating point number   | Float         |\n| double  | 64-bit floating po16-bit Unicode (UTF-16) character      | Character     |\n\nTable 26.2.: Primitive types in Java  ',
+                bounding_box=[121, 82, 438, 207]
+              ),
+              Section(
+                content='\n\n## 26.3.1. Declaration & Assignment  ',
+                bounding_box=[63, 320, 282, 13]
+              ),
+              Section(
+                content="\n\nJava is a statically typed language meaning that all variables must be declared before you can use them or refer to them. In addition, when declaring a variable, you must specify both its type and its identifier. For example:\n\n```java\nint numUnits;  \ndouble costPerUnit;  \nchar firstInitial;  \nboolean isStudent;  \n```\n\nEach declaration specifies the variable's type followed by the identifier and ending with a semicolon. The identifier rules are fairly standard: a name can consist of lowercase and uppercase alphabetic characters, numbers, and underscores but may not begin with a numeric character. We adopt the modern camelCasing naming convention for variables in our code. In general, variables **must** be assigned a value before you can use them in an expression. You do not have to immediately assign a value when you declare them (though it is good practice), but some value must be assigned before they can be used or the compiler will issue an error.  ",
+                bounding_box=[62, 364, 561, 383]
+              ),
+              Section(
+                content='\n\nThe assignment operator is a single equal sign, `=` and is a right-to-left assignment. That is, the variable that we wish to assign the value to appears on the left-hand-side while the value (literal, variable or expression) is on the right-hand-side. Using our variables from before, we can assign them values:  ', 
+                bounding_box=[62, 690, 559, 114]
+              ),
+              Section(
+                content='\n\n2Instance variables, that is variables declared as part of an object do have default values. For objects, the default is `null`, for all numeric types, zero is the default value. For the `boolean` type, `false` is the default, and the default `char` value is `\\0`, the null-terminating character (zero in the ASCII table).  ',
+                bounding_box=[62, 739, 560, 146]
+              ),
+              Section(
+                content='',
+                bounding_box=[527, 51, 26, 10]
+              )
+            ],
             page=1
         )
     ]

From 72e245e5d40d26045c748bcbe616c42e9225b84e Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 19:04:08 +0530
Subject: [PATCH 16/24] docs: Update README.md by adding `bounding_box` param

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 55136d8..60edc8e 100644
--- a/README.md
+++ b/README.md
@@ -243,6 +243,7 @@ async def zerox(
     concurrency: int = 10,
     file_path: Optional[str] = "",
     maintain_format: bool = False,
+    bounding_box: bool = False,
     model: str = "gpt-4o-mini",
     output_dir: Optional[str] = None,
     temp_dir: Optional[str] = None,

From 75e42e964dda82eb50c1202bbee17b19ce2407d0 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 19:36:57 +0530
Subject: [PATCH 17/24] build: Script for pre-installing Tesseract

---
 py_zerox/scripts/pre_install.py | 61 +++++++++++++++++++++++++--------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/py_zerox/scripts/pre_install.py b/py_zerox/scripts/pre_install.py
index ed22a61..717205a 100644
--- a/py_zerox/scripts/pre_install.py
+++ b/py_zerox/scripts/pre_install.py
@@ -23,23 +23,54 @@ def install_package(command, package_name):
         raise RuntimeError(f"Failed to install {package_name}: {e}")
 
 
+def check_and_install_poppler():
+    """Check for the installation of Poppler and install if not present."""
+
+    try:
+        run_command("pdftoppm -h")
+    except RuntimeError:
+        if platform.system() == "Darwin":  # macOS
+            install_package("brew install poppler", "Poppler")
+        elif platform.system() == "Linux":  # Linux
+            install_package(
+                "sudo apt-get update && sudo apt-get install -y poppler-utils",
+                "Poppler",
+            )
+        else:
+            raise RuntimeError(
+                "Please install Poppler manually from https://poppler.freedesktop.org/"
+            )
+
+
+def check_and_install_tesseract():
+    """Check for the installation of Tesseract and install if not present."""
+    try:
+        run_command("tesseract --version")
+    except RuntimeError:
+        if platform.system() == "Darwin":  # macOS
+            install_package("brew install tesseract", "Tesseract")
+        elif platform.system() == "Linux":  # Linux
+            install_package(
+                "sudo apt-get update && sudo apt-get install -y tesseract-ocr",
+                "Tesseract",
+            )
+        elif platform.system() == "Windows":  # Windows
+            print(
+                "Please download and install Tesseract from the official GitHub repository: https://github.com/UB-Mannheim/tesseract/wiki"
+            )
+            print(
+                "Make sure to add the Tesseract installation path to your system's PATH environment variable."
+            )
+        else:
+            raise RuntimeError(
+                "Please install Tesseract manually from the official website."
+            )
+
+
 def check_and_install():
     try:
-        # Check and install Poppler
-        try:
-            run_command("pdftoppm -h")
-        except RuntimeError:
-            if platform.system() == "Darwin":  # macOS
-                install_package("brew install poppler", "Poppler")
-            elif platform.system() == "Linux":  # Linux
-                install_package(
-                    "sudo apt-get update && sudo apt-get install -y poppler-utils",
-                    "Poppler",
-                )
-            else:
-                raise RuntimeError(
-                    "Please install Poppler manually from https://poppler.freedesktop.org/"
-                )
+        check_and_install_poppler()
+        check_and_install_tesseract()
 
     except RuntimeError as err:
         print(f"Error during installation: {err}", file=sys.stderr)

From 94bbee631c8761cbcd05a254bed582ca7055b19b Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 19:38:23 +0530
Subject: [PATCH 18/24] build: Update package metadata for py-zerox

---
 setup.cfg | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index f8bbc0f..20c6776 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -4,7 +4,7 @@ version = 0.0.5
 description = ocr documents using vision models from all popular providers like OpenAI, Azure OpenAI, Anthropic, AWS Bedrock etc
 long_description = file: README.md
 long_description_content_type = text/markdown
-author = wizenheimer, pradhyumna85
+author = wizenheimer, pradhyumna85, getwithashish
 license = MIT
 license_file = LICENSE
 classifiers =
@@ -24,6 +24,11 @@ install_requires =
     litellm>=1.44.15
     aioshutil>=1.5
     PyPDF2>=3.0.1
+    pytesseract>=0.3.13
+    pillow>=10.4.0
+    python-Levenshtein>=0.25.1
+    Markdown>=3.7
+    beautifulsoup4>=4.12.3
 
 [options.packages.find]
 where = py_zerox.pyzerox

From 5403b342119b16163cf12ac87889691e8439e5e4 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 19:40:09 +0530
Subject: [PATCH 19/24] docs: Update README.md to install Tesseract

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 60edc8e..7cf5254 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ Request #3 => page_2_markdown + page_3_image
 
 ### Installation:
 
-- Install **poppler-utils** on the system, it should be available in path variable
+- Install **poppler-utils** and **tesseract** on the system; both must be available on the system `PATH`
 - Install py-zerox:
 ```sh
 pip install py-zerox

From cd57ff094a2b99ab8e5a5515d4b9cd3f2a788527 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 21:28:57 +0530
Subject: [PATCH 20/24] feat: Normalize the bounding box coordinates

---
 py_zerox/pyzerox/processor/bounding_box.py | 38 ++++++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/py_zerox/pyzerox/processor/bounding_box.py b/py_zerox/pyzerox/processor/bounding_box.py
index f06c156..604bac0 100644
--- a/py_zerox/pyzerox/processor/bounding_box.py
+++ b/py_zerox/pyzerox/processor/bounding_box.py
@@ -127,6 +127,34 @@ async def calculate_bounding_box(
     return leftmost_string_x, topmost_string_y, max_width, max_height
 
 
+async def normalize_bounding_box(
+    left: float,
+    top: float,
+    width: float,
+    height: float,
+    image_dimensions: Tuple[float, float],
+) -> Tuple[float, float, float, float]:
+    """
+    Normalize the bounding box coordinates and dimensions based on the image dimensions.
+
+    Args:
+        left (float): The x-coordinate of the top-left corner of the bounding box.
+        top (float): The y-coordinate of the top-left corner of the bounding box.
+        width (float): The width of the bounding box.
+        height (float): The height of the bounding box.
+        image_dimensions (Tuple[float, float]): A tuple containing the width and height of the image.
+
+    Returns:
+        Tuple[float, float, float, float]: A tuple containing the normalized left, top, width, and height of the bounding box.
+    """
+    normalized_left = left / image_dimensions[0]
+    normalized_top = top / image_dimensions[1]
+    normalized_width = width / image_dimensions[0]
+    normalized_height = height / image_dimensions[1]
+
+    return normalized_left, normalized_top, normalized_width, normalized_height
+
+
 async def find_bounding_box(
     ocr_data: Dict[str, list], string_to_compare: str
 ) -> Tuple[float, float, float, float]:
@@ -160,9 +188,13 @@ async def find_bounding_box(
             first_string_index=first_string_index,
             last_string_index=last_string_index,
         )
+        left, top, width, height = await normalize_bounding_box(
+            left=left,
+            top=top,
+            width=width,
+            height=height,
+            image_dimensions=ocr_data["dimensions"],
+        )
         return left, top, width, height
     except Exception as err:
         raise Exception(Messages.FAILED_TO_FIND_BOUNDING_BOX.format(err))
-
-
-# TODO Normalize the coords

From 558513792b5c0de0a1c82791876c4fe41efafab6 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 21:30:10 +0530
Subject: [PATCH 21/24] feat: Include image dimensions in the OCR data

---
 py_zerox/pyzerox/processor/ocr.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/py_zerox/pyzerox/processor/ocr.py b/py_zerox/pyzerox/processor/ocr.py
index f5b6257..de305e3 100644
--- a/py_zerox/pyzerox/processor/ocr.py
+++ b/py_zerox/pyzerox/processor/ocr.py
@@ -76,6 +76,7 @@ async def perform_image_ocr(image_path: str) -> Dict[str, list]:
         image = enhance_image_for_ocr(image=image)
         data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
         cleaned_data = await _clean_ocr_text(data=data)
+        cleaned_data["dimensions"] = image.size
         return cleaned_data
     except Exception as err:
         raise Exception(Messages.FAILED_TO_PERFORM_OCR.format(err))

From ac5e0dea45232fa96cc7ea1287724f2ffe01e7fc Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 21:32:38 +0530
Subject: [PATCH 22/24] docs: Update docstring in bounding_box.py

---
 py_zerox/pyzerox/processor/bounding_box.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/py_zerox/pyzerox/processor/bounding_box.py b/py_zerox/pyzerox/processor/bounding_box.py
index 604bac0..3fb10fd 100644
--- a/py_zerox/pyzerox/processor/bounding_box.py
+++ b/py_zerox/pyzerox/processor/bounding_box.py
@@ -166,8 +166,7 @@ async def find_bounding_box(
         string_to_compare (str): The string for which the bounding box needs to be found.
 
     Returns:
-        List[float]: A list containing the bounding box coordinates in the
-            format [left, top, width, height].
+        Tuple[float, float, float, float]: A tuple containing the bounding box coordinates in the format (left, top, width, height).
     """
     try:
         text_content = " ".join(ocr_data["text_list"])

From ab90e2fbdb0a304270b5adc679f750ff4efbaf6b Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Fri, 20 Sep 2024 23:12:46 +0530
Subject: [PATCH 23/24] docs: Update README.md for normalized bounding box

---
 README.md | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 7cf5254..8eb7e44 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ Parameters
 - **maintain_format** (bool, optional):
   Whether to maintain the format from the previous page. Defaults to False.
 - **bounding_box** (bool, optional):
-  Whether to return the bounding box of the identified sections in the page.
+  Whether to return the normalized bounding box of the identified sections in the page. Defaults to False.
 - **model** (str, optional):
   The model to use for generating completions. Defaults to "gpt-4o-mini".
   Refer to LiteLLM Providers for the correct model name, as it may differ depending on the provider.
@@ -337,28 +337,32 @@ ZeroxOutput(
             content_length=2333,
             sections=[
               Section(
-                content='| Type    | Description                             | Wrapper Class |\n|---------|-----------------------------------------|---------------|\n| byte    | 8-bit signed 2s complement integer      | Byte          |\n| short   | 16-bit signed 2s complement integer     | Short         |\n| int     | 32-bit signed 2s complement integer     | Integer       |\n| long    | 64-bit signed 2s complement integer     | Long          |\n| float   | 32-bit IEEE 754 floating point number   | Float         |\n| double  | 64-bit floating po16-bit Unicode (UTF-16) character      | Character     |\n\nTable 26.2.: Primitive types in Java  ',
-                bounding_box=[121, 82, 438, 207]
+                content='| Type    | Description                               | Wrapper Class |\n|---------|-------------------------------------------|---------------|\n| byte    | 8-bit signed 2s complement integer        | Byte          |\n| short   | 16-bit signed 2s complement integer       | Short         |\n| int     | 32-bit signed 2s complement integer       | Integer       |\n| long    | 64-bit signed 2s complement integer       | Long          |\n| float   | 32-bit IEEE 754 floating point number     | Float         |\n| double  | 64-bit floating point number               | Double        |\n| boolean | may be set to `true` or `false`          | Boolean       |\n| char    | 16-bit Unicode (UTF-16) character        | Character     |\n\n**Table 26.2.: Primitive types in Java**  ',
+                bounding_box=(0.16198125836680052, 0.07765151515151515, 0.5863453815261044, 0.19602272727272727)
               ),
               Section(
-                content='\n\n## 26.3.1. Declaration & Assignment  ',
-                bounding_box=[63, 320, 282, 13]
+                content='\n\n### 26.3.1. Declaration & Assignment  ',
+                bounding_box=(0.08433734939759036, 0.30303030303030304, 0.37751004016064255, 0.01231060606060606)
               ),
               Section(
-                content="\n\nJava is a statically typed language meaning that all variables must be declared before you can use them or refer to them. In addition, when declaring a variable, you must specify both its type and its identifier. For example:\n\n```java\nint numUnits;  \ndouble costPerUnit;  \nchar firstInitial;  \nboolean isStudent;  \n```\n\nEach declaration specifies the variable's type followed by the identifier and ending with a semicolon. The identifier rules are fairly standard: a name can consist of lowercase and uppercase alphabetic characters, numbers, and underscores but may not begin with a numeric character. We adopt the modern camelCasing naming convention for variables in our code. In general, variables **must** be assigned a value before you can use them in an expression. You do not have to immediately assign a value when you declare them (though it is good practice), but some value must be assigned before they can be used or the compiler will issue an error.  ",
-                bounding_box=[62, 364, 561, 383]
+                content='\n\nJava is a statically typed language meaning that all variables must be declared before you can use them or refer to them. In addition, when declaring a variable, you must specify both its type and its identifier. For example:\n\n```java\nint numUnits;  \ndouble costPerUnit;  \nchar firstInitial;  \nboolean isStudent;  \n```  ',
+                bounding_box=(0.08299866131191433, 0.3446969696969697, 0.749665327978581, 0.13541666666666666)
               ),
               Section(
-                content='\n\nThe assignment operator is a single equal sign, `=` and is a right-to-left assignment. That is, the variable that we wish to assign the value to appears on the left-hand-side while the value (literal, variable or expression) is on the right-hand-side. Using our variables from before, we can assign them values:  ', 
-                bounding_box=[62, 690, 559, 114]
+                content="\n\nEach declaration specifies the variable's type followed by the identifier and ending with a semicolon. The identifier rules are fairly standard: a name can consist of lowercase and uppercase alphabetic characters, numbers, and underscores but may not begin with a numeric character. We adopt the modern camelCasing naming convention for variables in our code. In general, variables **must** be assigned a value before you can use them in an expression. You do not have to immediately assign a value when you declare them (though it is good practice), but some value must be assigned before they can be used or the compiler will issue an error.²  ",
+                bounding_box=(0.08299866131191433, 0.5501893939393939, 0.751004016064257, 0.1571969696969697)
               ),
               Section(
-                content='\n\n2Instance variables, that is variables declared as part of an object do have default values. For objects, the default is `null`, for all numeric types, zero is the default value. For the `boolean` type, `false` is the default, and the default `char` value is `\\0`, the null-terminating character (zero in the ASCII table).  ',
-                bounding_box=[62, 739, 560, 146]
+                content='\n\nThe assignment operator is a single equal sign, `=` and is a right-to-left assignment. That is, the variable that we wish to assign the value to appears on the left-hand-side while the value (literal, variable or expression) is on the right-hand-side. Using our variables from before, we can assign them values:  ',
+                bounding_box=(0.08299866131191433, 0.6534090909090909, 0.7483266398929049, 0.10795454545454546)
+              ),
+              Section(
+                content='\n\n²Instance variables, that is variables declared as part of an object do have default values. For objects, the default is `null`, for all numeric types, zero is the default value. For the `boolean` type, `false` is the default, and the default `char` value is `\\0`, the null-terminating character (zero in the ASCII table).  ',
+                bounding_box=(0.08299866131191433, 0.6998106060606061, 0.749665327978581, 0.13825757575757575)
               ),
               Section(
                 content='',
-                bounding_box=[527, 51, 26, 10]
+                bounding_box=(0.7054886211512718, 0.048295454545454544, 0.03480589022757698, 0.00946969696969697)
               )
             ],
             page=1

From be1a0999d61a6c54df7cd0dd5c4782ade92f8d09 Mon Sep 17 00:00:00 2001
From: getwithashish <ashishsamtgeorge@mca.ajce.in>
Date: Tue, 24 Sep 2024 16:37:38 +0530
Subject: [PATCH 24/24] refactor: Specify the correct return type of
 process_page() method

---
 py_zerox/pyzerox/processor/pdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py
index efa12e8..585a43c 100644
--- a/py_zerox/pyzerox/processor/pdf.py
+++ b/py_zerox/pyzerox/processor/pdf.py
@@ -47,7 +47,7 @@ async def process_page(
     output_token_count: int = 0,
     prior_page: str = "",
     semaphore: Optional[asyncio.Semaphore] = None,
-) -> Tuple[str, int, int, str]:
+) -> Tuple[str, int, int, str, Optional[List[Section]]]:
     """Process a single page of a PDF"""
 
     markdown_sections: List[str] = []