Skip to content

Commit

Permalink
Merge pull request #363 from VikParuchuri/dev-mose/marker-v2
Browse files: browse the repository at this point in the history
Cleanup and speed up tests
  • Loading branch information
VikParuchuri authored Nov 18, 2024
2 parents 44d3a4c + 8f65acf commit 6d8e180
Show file tree
Hide file tree
Showing 9 changed files with 47 additions and 88 deletions.
11 changes: 6 additions & 5 deletions marker/v2/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from pydantic import BaseModel

from marker.v2.schema import BlockTypes
from marker.v2.schema.blocks import BlockId, BlockOutput
from marker.v2.schema.blocks import Block, BlockId, BlockOutput
from marker.v2.schema.groups.page import PageGroup


Expand All @@ -28,11 +28,12 @@ def get_block(self, block_id: BlockId):
return None

def get_page(self, page_id):
page = self.pages[page_id]
assert page.page_id == page_id, "Mismatch between page_id and page index"
return page
for page in self.pages:
if page.page_id == page_id:
return page
return None

def assemble_html(self, child_blocks):
def assemble_html(self, child_blocks: List[Block]):
template = ""
for c in child_blocks:
template += f"<content-ref src='{c.id}'></content-ref>"
Expand Down
20 changes: 13 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,22 +49,28 @@ def table_rec_model():

@pytest.fixture(scope="function")
def pdf_provider(request):
mark = request.node.get_closest_marker("filename")
filename = mark.args[0] if mark else "adversarial.pdf"
filename_mark = request.node.get_closest_marker("filename")
filename = filename_mark.args[0] if filename_mark else "adversarial.pdf"

config_mark = request.node.get_closest_marker("config")
config = config_mark.args[0] if config_mark else None

dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
idx = dataset['filename'].index(filename)

temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
temp_pdf.write(dataset['pdf'][idx])
temp_pdf.flush()
yield PdfProvider(temp_pdf.name)
yield PdfProvider(temp_pdf.name, config)


@pytest.fixture(scope="function")
def pdf_document(pdf_provider, layout_model, recognition_model, detection_model) -> Document:
layout_builder = LayoutBuilder(layout_model)
ocr_builder = OcrBuilder(detection_model, recognition_model)
builder = DocumentBuilder()
def pdf_document(request, pdf_provider, layout_model, recognition_model, detection_model) -> Document:
config_mark = request.node.get_closest_marker("config")
config = config_mark.args[0] if config_mark else None

layout_builder = LayoutBuilder(layout_model, config)
ocr_builder = OcrBuilder(detection_model, recognition_model, config)
builder = DocumentBuilder(config)
document = builder(pdf_provider, layout_builder, ocr_builder)
return document
22 changes: 3 additions & 19 deletions tests/test_document_builder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import pytest

from marker.v2.schema import BlockTypes
from marker.v2.schema.text.line import Line


@pytest.mark.config({"page_range": [0]})
def test_document_builder(pdf_document):
first_page = pdf_document.pages[0]
assert first_page.structure[0] == '/page/0/SectionHeader/0'
Expand All @@ -18,22 +21,3 @@ def test_document_builder(pdf_document):
assert first_span.text == 'Subspace Adversarial Training'
assert first_span.font == 'NimbusRomNo9L-Medi'
assert first_span.formats == ['plain']

last_block = first_page.get_block(first_page.structure[-1])
assert last_block.block_type == BlockTypes.Text

last_text_block: Line = first_page.get_block(last_block.structure[-1])
assert last_text_block.block_type == BlockTypes.Line

last_span = first_page.get_block(last_text_block.structure[-1])
assert last_span.block_type == BlockTypes.Span
assert last_span.text == 'prove the quality of single-step AT solutions. However,'
assert last_span.font == 'NimbusRomNo9L-Regu'
assert last_span.formats == ['plain']


if __name__ == "__main__":
from tests.utils import setup_pdf_document

pdf_document = setup_pdf_document("adversarial.pdf")
test_document_builder(pdf_document)
10 changes: 4 additions & 6 deletions tests/test_equation_processor.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
from copy import deepcopy
import pytest

from marker.v2.schema import BlockTypes
from marker.v2.processors.equation import EquationProcessor


@pytest.mark.config({"page_range": [0]})
def test_equation_processor(pdf_document, texify_model):
processor = EquationProcessor(texify_model)
processor(pdf_document)

new_document = deepcopy(pdf_document)
new_document.pages = [new_document.pages[0]]
processor(new_document)

for block in new_document.pages[0].children:
for block in pdf_document.pages[0].children:
if block.block_type == BlockTypes.Equation:
assert block.latex is not None
9 changes: 3 additions & 6 deletions tests/test_garbled_pdf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
import pytest
from marker.v2.schema import BlockTypes
from marker.v2.schema.text.line import Line
from tests.utils import setup_pdf_document


def test_ocr_pipeline():
pdf_document = setup_pdf_document(
"water_damage.pdf"
)
@pytest.mark.filename("water_damage.pdf")
def test_ocr_pipeline(pdf_document):
assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'

table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
Expand Down
17 changes: 4 additions & 13 deletions tests/test_ocr_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
import pytest

from marker.v2.schema import BlockTypes
from marker.v2.schema.text.line import Line
from tests.utils import setup_pdf_document


def test_ocr_pipeline():
pdf_document = setup_pdf_document(
"adversarial.pdf",
config={
"force_ocr": True
}
)

@pytest.mark.config({"force_ocr": True, "page_range": [0]})
def test_ocr_pipeline(pdf_document):
first_page = pdf_document.pages[0]
assert first_page.structure[0] == '/page/0/SectionHeader/0'

Expand All @@ -24,7 +19,3 @@ def test_ocr_pipeline():
first_span = first_page.get_block(first_text_block.structure[0])
assert first_span.block_type == BlockTypes.Span
assert first_span.text.strip() == 'Subspace Adversarial Training'


if __name__ == "__main__":
test_ocr_pipeline()
33 changes: 7 additions & 26 deletions tests/test_pdf_provider.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,17 @@
import tempfile
import pytest

import datasets

from marker.v2.providers.pdf import PdfProvider
@pytest.mark.config({"page_range": [0]})
def test_pdf_provider(pdf_provider):
assert len(pdf_provider) == 12
assert pdf_provider.get_image(0, 72).size == (612, 792)
assert pdf_provider.get_image(0, 96).size == (816, 1056)


def test_pdf_provider():
dataset = datasets.load_dataset("datalab-to/pdfs", split="train")
idx = dataset['filename'].index('adversarial.pdf')

temp_pdf = tempfile.NamedTemporaryFile(suffix=".pdf")
temp_pdf.write(dataset['pdf'][idx])
temp_pdf.flush()

provider = PdfProvider(temp_pdf.name)
assert len(provider) == 12
assert provider.get_image(0, 72).size == (612, 792)
assert provider.get_image(0, 96).size == (816, 1056)

spans_list = provider.get_page_spans(0)
spans_list = pdf_provider.get_page_spans(0)
assert len(spans_list) == 93

spans = spans_list[0]
assert len(spans) == 2
assert spans[0].text == "Subspace Adversarial Training"
assert spans[0].font == "NimbusRomNo9L-Medi"
assert spans[0].formats == ["plain"]

# for line in provider.get_page_lines(0):
# for span in line.spans:
# print(f"{span=}")


if __name__ == "__main__":
test_pdf_provider()
3 changes: 3 additions & 0 deletions tests/test_structure.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import pytest

from marker.v2.builders.structure import StructureBuilder


@pytest.mark.config({"page_range": [0]})
def test_structure_builder(pdf_document):
structure = StructureBuilder()
structure(pdf_document)
Expand Down
10 changes: 4 additions & 6 deletions tests/test_table_processor.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
from copy import deepcopy
import pytest

from tabled.schema import SpanTableCell

from marker.v2.schema import BlockTypes
from marker.v2.processors.table import TableProcessor


@pytest.mark.config({"page_range": [5]})
def test_table_processor(pdf_document, detection_model, recognition_model, table_rec_model):
processor = TableProcessor(detection_model, recognition_model, table_rec_model)
processor(pdf_document)

new_document = deepcopy(pdf_document)
new_document.pages = new_document.pages[:5]
processor(new_document)

for block in new_document.pages[0].children:
for block in pdf_document.pages[0].children:
if block.block_type == BlockTypes.Table:
assert block.cells is not None
assert len(block.cells) > 0
Expand Down

0 comments on commit 6d8e180

Please sign in to comment.