Skip to content

Commit

Permalink
Merge pull request #404 from VikParuchuri/dev-mose/blockquote-processor
Browse files Browse the repository at this point in the history
Add Blockquote Processor
  • Loading branch information
iammosespaulr authored Dec 2, 2024
2 parents 4a983a5 + 0d8e615 commit bcb9330
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 11 deletions.
16 changes: 9 additions & 7 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from marker.builders.ocr import OcrBuilder
from marker.builders.structure import StructureBuilder
from marker.converters import BaseConverter
from marker.processors.blockquote import BlockquoteProcessor
from marker.processors.code import CodeProcessor
from marker.processors.debug import DebugProcessor
from marker.processors.document_toc import DocumentTOCProcessor
Expand Down Expand Up @@ -53,17 +54,18 @@ def __init__(self, artifact_dict: Dict[str, Any], processor_list: List[str] | No
processor_list = strings_to_classes(processor_list)
else:
processor_list = [
FootnoteProcessor,
PageHeaderProcessor,
EquationProcessor,
TableProcessor,
SectionHeaderProcessor,
TextProcessor,
ListProcessor,
BlockquoteProcessor,
CodeProcessor,
DocumentTOCProcessor,
EquationProcessor,
FootnoteProcessor,
IgnoreTextProcessor,
LineNumbersProcessor,
ListProcessor,
PageHeaderProcessor,
SectionHeaderProcessor,
TableProcessor,
TextProcessor,
DebugProcessor,
]

Expand Down
48 changes: 48 additions & 0 deletions marker/processors/blockquote.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document

class BlockquoteProcessor(BaseProcessor):
    """
    A processor for tagging blockquotes.

    For each text block with at least two structure entries, the block that
    follows it on the page is compared geometrically against it and its
    ``blockquote`` / ``blockquote_level`` attributes are set accordingly.
    """
    block_types = (BlockTypes.Text, BlockTypes.TextInlineMath)
    # The thresholds below are fractions of the current block's polygon width
    # (they are multiplied by ``block.polygon.width`` before being compared).
    min_x_indent = 0.05  # minimum extra left offset that counts as an indent
    x_start_tolerance = 0.01  # max left-edge difference still considered aligned
    x_end_tolerance = 0.01  # max right-edge difference still considered aligned

    def __init__(self, config):
        super().__init__(config)

    def __call__(self, document: Document) -> None:
        """
        Tag blockquote blocks in ``document`` in place.

        A following block is flagged as a blockquote when either:
          * the current block is already a blockquote and the following block
            has matching left and right edges (continuation at the same
            level) or is indented and starts below it (one level deeper); or
          * the current block is not a blockquote, the following block has at
            least two structure entries, and it is indented and starts below
            the current block (a new level-1 quote).

        Blocks that end up not flagged keep ``blockquote_level`` at 0 so the
        level never disagrees with the ``blockquote`` flag.
        """
        for page in document.pages:
            for block in page.contained_blocks(document, self.block_types):
                if block.structure is None:
                    continue

                if len(block.structure) < 2:
                    continue

                next_block = page.get_next_block(block)
                if next_block is None:
                    continue
                if next_block.block_type not in self.block_types:
                    continue
                if next_block.structure is None:
                    continue
                if next_block.ignore_for_output:
                    continue

                width = block.polygon.width
                matching_x_end = (
                    abs(next_block.polygon.x_end - block.polygon.x_end)
                    < self.x_end_tolerance * width
                )
                matching_x_start = (
                    abs(next_block.polygon.x_start - block.polygon.x_start)
                    < self.x_start_tolerance * width
                )
                x_indent = (
                    next_block.polygon.x_start
                    > block.polygon.x_start + (self.min_x_indent * width)
                )
                y_indent = next_block.polygon.y_start > block.polygon.y_end
                indented_below = x_indent and y_indent

                if block.block_type in self.block_types and block.blockquote:
                    # Continue the existing quote, or nest one level deeper
                    # when the following block is indented further.
                    is_blockquote = (matching_x_end and matching_x_start) or indented_below
                    level = block.blockquote_level
                    if indented_below:
                        level += 1
                else:
                    # A quote can only start here if the following block is
                    # indented below a non-quoted block.
                    is_blockquote = len(next_block.structure) >= 2 and indented_below
                    level = 1

                next_block.blockquote = is_blockquote
                # Only keep a non-zero level on blocks that are actually
                # quoted; previously every following block was stamped with a
                # level even when ``blockquote`` was False.
                next_block.blockquote_level = level if is_blockquote else 0
2 changes: 0 additions & 2 deletions marker/processors/list.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import math

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
from marker.schema.document import Document
Expand Down
11 changes: 10 additions & 1 deletion marker/schema/blocks/inlinemath.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
class InlineMath(Block):
block_type: BlockTypes = BlockTypes.TextInlineMath
has_continuation: bool = False
blockquote: bool = False
blockquote_level: int = 0

def assemble_html(self, child_blocks, parent_structure):
if self.ignore_for_output:
Expand All @@ -16,4 +18,11 @@ def assemble_html(self, child_blocks, parent_structure):
class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
return f"<p{class_attr}>{template}</p>"

if self.blockquote:
# Add indentation for blockquote levels
blockquote_prefix = "<blockquote>" * self.blockquote_level
blockquote_suffix = "</blockquote>" * self.blockquote_level
return f"{blockquote_prefix}<p{class_attr}>{template}</p>{blockquote_suffix}"
else:
return f"<p{class_attr}>{template}</p>"
10 changes: 9 additions & 1 deletion marker/schema/blocks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
class Text(Block):
block_type: BlockTypes = BlockTypes.Text
has_continuation: bool = False
blockquote: bool = False
blockquote_level: int = 0

def assemble_html(self, child_blocks, parent_structure):
if self.ignore_for_output:
Expand All @@ -16,4 +18,10 @@ def assemble_html(self, child_blocks, parent_structure):
class_attr = f" block-type='{self.block_type}'"
if self.has_continuation:
class_attr += " class='has-continuation'"
return f"<p{class_attr}>{template}</p>"

if self.blockquote:
blockquote_prefix = "<blockquote>" * self.blockquote_level
blockquote_suffix = "</blockquote>" * self.blockquote_level
return f"{blockquote_prefix}<p{class_attr}>{template}</p>{blockquote_suffix}"
else:
return f"<p{class_attr}>{template}</p>"

0 comments on commit bcb9330

Please sign in to comment.