Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial package skeleton, fix editor token bug #49

Merged
merged 2 commits into from
Dec 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Workflow that builds the package with Poetry and publishes it with the
# PYPI_TOKEN repository secret.
name: Python package
# Runs on pushes of tags matching "v*.*.*".
on:
push:
tags:
- "v*.*.*"
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
# NOTE(review): the unquoted 3.11 below is read by YAML as a number; quoting it
# ("3.11") avoids surprises with versions such as 3.10, which would become 3.1.
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
# Installs Poetry and the project dependencies, then swaps torch for the build
# served from the PyTorch CPU wheel index.
# NOTE(review): `poetry remove torch` rewrites pyproject.toml, and the later
# `poetry build` step runs against that modified file — confirm the published
# package metadata is still meant to declare torch.
- name: Install python dependencies
run: |
pip install poetry
poetry install
poetry remove torch
poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu
- name: Build package
run: |
poetry build
# The token is passed through the environment and handed to Poetry's config
# right before publishing.
- name: Publish package
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: |
poetry config pypi-token.pypi "$PYPI_TOKEN"
poetry publish
10 changes: 5 additions & 5 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,13 @@
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.ordering import load_ordering_model
from marker.segmentation import load_layout_model
from marker.cleaners.equations import load_nougat_model
from marker.benchmark.scoring import score_text
from marker.extract_text import naive_get_text
import json
import os
import subprocess
import shutil
import fitz as pymupdf
from marker.settings import settings
from tabulate import tabulate

configure_logging()
Expand All @@ -34,7 +30,7 @@ def nougat_prediction(pdf_filename, batch_size=1):
return data


if __name__ == "__main__":
def main():
parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.")
parser.add_argument("in_folder", help="Input PDF files")
parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
Expand Down Expand Up @@ -126,3 +122,7 @@ def nougat_prediction(pdf_filename, batch_size=1):
print("Scores by file")
print(tabulate(score_table, headers=["Method", *score_headers]))


if __name__ == "__main__":
main()

19 changes: 19 additions & 0 deletions chunk_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import argparse
import subprocess


def main():
    """Convert a folder of PDFs to markdown by delegating to ``chunk_convert.sh``.

    Reads two positional command-line arguments, ``in_folder`` and
    ``out_folder``, and forwards them unchanged to the ``chunk_convert.sh``
    script located next to this module.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status (``check=True``).
    """
    # Function-scope import so this block does not depend on the file header.
    from pathlib import Path

    parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
    parser.add_argument("in_folder", help="Input folder with pdfs.")
    parser.add_argument("out_folder", help="Output folder")
    args = parser.parse_args()

    # Resolve the script relative to this file rather than the current working
    # directory, so the command works no matter where it is launched from.
    script_path = Path(__file__).resolve().parent / "chunk_convert.sh"

    # Pass an argument list and skip the shell: folder names containing spaces
    # or shell metacharacters reach the script verbatim instead of being
    # word-split or interpreted as shell syntax.
    cmd = [str(script_path), args.in_folder, args.out_folder]

    # Execute the shell script; a non-zero exit status raises CalledProcessError.
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    main()
Empty file modified chunk_convert.sh
100644 → 100755
Empty file.
8 changes: 6 additions & 2 deletions convert.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
print(traceback.format_exc())


if __name__ == "__main__":
def main():
parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
parser.add_argument("in_folder", help="Input folder with pdfs.")
parser.add_argument("out_folder", help="Output folder")
Expand Down Expand Up @@ -121,4 +121,8 @@ def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Option
progress_bar.update(1)

# Shutdown ray to free resources
ray.shutdown()
ray.shutdown()


if __name__ == "__main__":
main()
9 changes: 6 additions & 3 deletions convert_single.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,12 @@
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.settings import settings
import json

configure_logging()


if __name__ == "__main__":
def main():
parser = argparse.ArgumentParser()
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output file name")
Expand All @@ -26,4 +25,8 @@

out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
with open(out_meta_filename, "w+") as f:
f.write(json.dumps(out_meta, indent=4))
f.write(json.dumps(out_meta, indent=4))


if __name__ == "__main__":
main()
1 change: 0 additions & 1 deletion marker/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from marker.markdown import merge_spans, merge_lines, get_full_text
from marker.schema import Page, BlockType
from typing import List, Dict, Tuple, Optional
from copy import deepcopy
import re
import magic
from marker.settings import settings
Expand Down
2 changes: 0 additions & 2 deletions marker/postprocessors/editor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import torch.nn.functional as F
from marker.postprocessors.t5 import T5ForTokenClassification, byt5_tokenize

tokenizer = AutoTokenizer.from_pretrained(settings.EDITOR_MODEL_NAME)


def load_editing_model():
if not settings.ENABLE_EDITOR_MODEL:
Expand Down
22 changes: 19 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
[tool.poetry]
name = "marker"
version = "0.1.0"
name = "marker-pdf"
version = "0.1.1"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
license = "GPL-3.0-or-later"
repository = "https://github.com/VikParuchuri/marker"
keywords = ["pdf", "markdown", "ocr", "nlp"]
packages = [
{include = "marker"}
]
include = [
"convert.py",
"convert_single.py",
"chunk_convert.sh",
"benchmark.py",
"chunk_convert.py",
]

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
Expand Down Expand Up @@ -37,6 +47,12 @@ grpcio = "^1.60.0"
[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"

# Command-line entry points. Each entry maps a command name to a
# "module:function" target; all four point at a function named `main`.
[tool.poetry.scripts]
marker = "convert:main"
marker_single = "convert_single:main"
marker_benchmark = "benchmark:main"
marker_chunk_convert = "chunk_convert:main"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
build-backend = "poetry.core.masonry.api"