Skip to content

Commit

Permalink
Merge pull request #412 from VikParuchuri/dev
Browse files (browse the repository at this point in the history)
Dev
  • Loading branch information
VikParuchuri authored Dec 3, 2024
2 parents f446e56 + 5fd116b commit 6ded3b9
Show file tree
Hide file tree
Showing 12 changed files with 470 additions and 445 deletions.
2 changes: 1 addition & 1 deletion marker/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,4 @@ def save_output(rendered: BaseModel, output_dir: str, fname_base: str):
f.write(json.dumps(rendered.metadata, indent=2))

for img_name, img in images.items():
img.save(os.path.join(output_dir, img_name), "PNG", optimize=False, compress_level=3)
img.save(os.path.join(output_dir, img_name), settings.OUTPUT_IMAGE_FORMAT)
2 changes: 1 addition & 1 deletion marker/processors/page_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class PageHeaderProcessor(BaseProcessor):
"""
A processor for moving PageHeaders to the top
"""
block_types = (BlockTypes.PageHeader)
block_types = (BlockTypes.PageHeader,)

def __call__(self, document: Document):
for page in document.pages:
Expand Down
13 changes: 5 additions & 8 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import atexit
import re
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures.process import ProcessPoolExecutor
from itertools import repeat
from typing import List, Set
import multiprocessing as mp

import pypdfium2 as pdfium
from ftfy import fix_text
Expand Down Expand Up @@ -39,15 +35,15 @@ def __init__(self, filepath: str, config=None):
if self.page_range is None:
self.page_range = range(len(self.doc))

assert max(self.page_range) < len(self.doc) and min(self.page_range) >= 0, f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
assert max(self.page_range) < len(self.doc) and min(self.page_range) >= 0, \
f"Invalid page range, values must be between 0 and {len(self.doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."

if self.force_ocr:
# Manually assign page bboxes, since we can't get them from pdftext
self.page_bboxes = {i: self.doc[i].get_bbox() for i in self.page_range}
else:
self.page_lines = self.pdftext_extraction()


atexit.register(self.cleanup_pdf_doc)

def __len__(self) -> int:
Expand Down Expand Up @@ -115,7 +111,8 @@ def pdftext_extraction(self) -> ProviderPageLines:
page_range=self.page_range,
keep_chars=False,
workers=self.pdftext_workers,
flatten_pdf=self.flatten_pdf
flatten_pdf=self.flatten_pdf,
quote_loosebox=False
)
self.page_bboxes = {i: [0, 0, page["width"], page["height"]] for i, page in zip(self.page_range, page_char_blocks)}

Expand Down Expand Up @@ -216,4 +213,4 @@ def get_page_bbox(self, idx: int) -> PolygonBox | None:
return PolygonBox.from_bbox(bbox)

def get_page_lines(self, idx: int) -> List[ProviderOutput]:
return self.page_lines[idx]
return self.page_lines[idx]
2 changes: 1 addition & 1 deletion marker/renderers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def extract_image(document: Document, image_id, to_base64=False):
cropped = page_img.crop(image_box.bbox)
if to_base64:
image_buffer = io.BytesIO()
cropped.save(image_buffer, format='PNG')
cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT)
cropped = base64.b64encode(image_buffer.getvalue()).decode(settings.OUTPUT_ENCODING)
return cropped

Expand Down
8 changes: 6 additions & 2 deletions marker/renderers/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
from typing import Literal

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
Expand All @@ -7,11 +6,16 @@
from marker.renderers import BaseRenderer
from marker.schema import BlockTypes
from marker.schema.blocks import BlockId
from marker.settings import settings

# Ignore beautifulsoup warnings
import warnings
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Suppress DecompressionBombError
from PIL import Image
Image.MAX_IMAGE_PIXELS = None


class HTMLOutput(BaseModel):
html: str
Expand Down Expand Up @@ -53,7 +57,7 @@ def extract_html(self, document, document_output, level=0):
elif ref_block_id.block_type in self.image_blocks:
if self.extract_images:
image = self.extract_image(document, ref_block_id)
image_name = f"{ref_block_id.to_path()}.png"
image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}"
images[image_name] = image
ref.replace_with(BeautifulSoup(f"<p><img src='{image_name}'></p>", 'html.parser'))
else:
Expand Down
2 changes: 1 addition & 1 deletion marker/renderers/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class JSONBlockOutput(BaseModel):

class JSONOutput(BaseModel):
children: List[JSONBlockOutput]
block_type: BlockTypes = BlockTypes.Document
block_type: str = str(BlockTypes.Document)
metadata: dict


Expand Down
4 changes: 2 additions & 2 deletions marker/schema/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from enum import auto, StrEnum
from enum import auto, Enum


class BlockTypes(StrEnum):
class BlockTypes(str, Enum):
Line = auto()
Span = auto()
FigureGroup = auto()
Expand Down
3 changes: 2 additions & 1 deletion marker/settings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, List, Dict, Literal
from typing import Optional

from dotenv import find_dotenv
from pydantic import computed_field
Expand All @@ -16,6 +16,7 @@ class Settings(BaseSettings):

# General
OUTPUT_ENCODING: str = "utf-8"
OUTPUT_IMAGE_FORMAT: str = "JPEG"

# General models
TORCH_DEVICE: Optional[str] = None # Note: MPS device does not work for text detection, and will default to CPU
Expand Down
4 changes: 2 additions & 2 deletions marker_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ def open_pdf(pdf_file):

def img_to_html(img, img_alt):
img_bytes = io.BytesIO()
img.save(img_bytes, format="PNG")
img.save(img_bytes, format=settings.OUTPUT_IMAGE_FORMAT)
img_bytes = img_bytes.getvalue()
encoded = base64.b64encode(img_bytes).decode()
img_html = f'<img src="data:image/png;base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">'
img_html = f'<img src="data:image/{settings.OUTPUT_IMAGE_FORMAT.lower()};base64,{encoded}" alt="{img_alt}" style="max-width: 100%;">'
return img_html


Expand Down
2 changes: 1 addition & 1 deletion marker_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ async def _convert_pdf(params: CommonParams):
encoded = {}
for k, v in images.items():
byte_stream = io.BytesIO()
v.save(byte_stream, format="PNG")
v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(settings.OUTPUT_ENCODING)

return {
Expand Down
Loading

0 comments on commit 6ded3b9

Please sign in to comment.