From db1cca16320ca41cfb2bea2e03d525c7bd465b6a Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Thu, 21 Dec 2023 14:33:53 -0800 Subject: [PATCH] Add new benchmarks --- .github/workflows/publish.yml | 29 ++++++++ .github/workflows/tests.yml | 35 ++++++++++ README.md | 105 +++++++++++++++++------------ benchmark.py | 1 + ocr_app.py | 13 ++-- ocr_image.py | 7 +- poetry.lock | 18 ++--- pyproject.toml | 4 +- run_ocr_app.py | 5 +- scripts/verify_benchmark_scores.py | 20 ++++++ texify/inference.py | 11 ++- texify/settings.py | 7 +- 12 files changed, 185 insertions(+), 70 deletions(-) create mode 100644 .github/workflows/publish.yml create mode 100644 .github/workflows/tests.yml create mode 100644 scripts/verify_benchmark_scores.py diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..7e1723c --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,29 @@ +name: Python package +on: + push: + tags: + - "v*.*.*" +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install python dependencies + run: | + pip install poetry + poetry install + poetry remove torch + poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu + - name: Build package + run: | + poetry build + - name: Publish package + env: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} + run: | + poetry config pypi-token.pypi "$PYPI_TOKEN" + poetry publish diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..d048cc2 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,35 @@ +name: Integration test + +on: [push] + +env: + TORCH_DEVICE: "cpu" + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install python dependencies + run: | + 
pip install poetry + poetry install + poetry remove torch + poetry run pip install torch --index-url https://download.pytorch.org/whl/cpu + - name: Download benchmark data + run: | + wget -O benchmark_data.zip "https://drive.google.com/uc?export=download&id=1dbY0kBq2SUa885gmbLPUWSRzy5K7O5XJ" + unzip benchmark_data.zip + mkdir data + mv bench_data.json data/bench_data.json + - name: Run benchmark test + run: | + poetry run texify_benchmark --max 16 + poetry run python scripts/verify_benchmark_scores.py data/bench_results.json + + + diff --git a/README.md b/README.md index 842c9f8..354305a 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,15 @@ # Texify -Texify converts equations and surrounding text into markdown with LaTeX math that can be rendered by MathJax ($$ and $ are delimiters). It will work with images or pdfs, and can run on CPU, GPU, or MPS. +Texify converts equations and surrounding text into markdown and LaTeX that can be rendered by MathJax ($$ and $ are delimiters). It will work with images or pdfs, and can run on CPU, GPU, or MPS. 
-https://github.com/VikParuchuri/texify/assets/913340/39b1f139-872f-4ae8-9f31-39e396953bd9 +https://github.com/VikParuchuri/texify/assets/913340/882022a6-020d-4796-af02-67cb77bc084c -> **Example** -> -> ![image](data/examples/0.png) -> -> **Detected Text** The potential $V_{i}$ of cell $\mathcal{C}_{j}$ centred at position $\mathbf{r}_{i}$ is related to the surface charge densities $\sigma_{j}$ of cells $\mathcal{E}_{j}$ $j\in[1,N]$ through the superposition principle as: -> -> $$V_{i}\,=\,\sum_{j=0}^{N}\,\frac{\sigma_{j}}{4\pi\varepsilon_{0}}\,\int_{\mathcal{E}_{j}}\frac{1}{\left|\mathbf{r}_{i}-\mathbf{r}^{\prime}\right|}\,\mathrm{d}^{2}\mathbf{r}^{\prime}\,=\,\sum_{j=0}^{N}\,Q_{ij}\,\sigma_{j},$$ -> -> where the integral over the surface of cell $\mathcal{C}_{j}$ only depends on $ \mathcal{C}_{j} $ shape and on the relative position of the target point $\mathbf{r}_{i}$ with respect to $\mathcal{C}_{j}$ location, as $\sigma_{j}$ is assumed constant over the whole surface of cell $\mathcal{C}_{j}$. +The closest open source comparisons to texify are [pix2tex](https://github.com/lukas-blecher/LaTeX-OCR) and [nougat](https://github.com/facebookresearch/nougat), although they're designed for different purposes: -The closest open source comparisons to texify are pix2tex and nougat, although they're designed for different purposes: +- Pix2tex is designed for block LaTeX equations, and hallucinates more on text. Texify can work with inline equations and text. +- Nougat is designed to OCR entire pages, and hallucinates more on small images. Texify is optimized for equations and small page regions. -- Compared to [pix2tex](https://github.com/lukas-blecher/LaTeX-OCR), texify can detect text and inline equations. Pix2tex is designed for block LaTeX equations, and hallucinates more on text. -- Compared to [nougat](https://github.com/facebookresearch/nougat), texify is optimized for equations and small page regions. 
Nougat is designed to OCR entire pages, and hallucinates more on small images. - -I created texify to render equations in [marker](https://github.com/VikParuchuri/marker), but realized it could also be valuable on its own. +Pix2tex is trained on im2latex, and nougat is trained on arxiv. Texify is trained on a broader set of web data, and works on a range of images. See more details in the [benchmarks](#benchmarks) section. @@ -27,44 +17,64 @@ See more details in the [benchmarks](#benchmarks) section. [Discord](https://discord.gg//KuZwXNGnfH) is where we discuss future development. +## Examples + +![Example 0](data/examples/0.png) + +> **Detected Text** The potential $V_{i}$ of cell $\mathcal{C}_{j}$ centred at position $\mathbf{r}_{i}$ is related to the surface charge densities $\sigma_{j}$ of cells $\mathcal{E}_{j}$ $j\in[1,N]$ through the superposition principle as: +> +> $$V_{i}\,=\,\sum_{j=0}^{N}\,\frac{\sigma_{j}}{4\pi\varepsilon_{0}}\,\int_{\mathcal{E}_{j}}\frac{1}{\left|\mathbf{r}_{i}-\mathbf{r}^{\prime}\right|}\,\mathrm{d}^{2}\mathbf{r}^{\prime}\,=\,\sum_{j=0}^{N}\,Q_{ij}\,\sigma_{j},$$ +> +> where the integral over the surface of cell $\mathcal{C}_{j}$ only depends on $ \mathcal{C}_{j} $ shape and on the relative position of the target point $\mathbf{r}_{i}$ with respect to $\mathcal{C}_{j}$ location, as $\sigma_{j}$ is assumed constant over the whole surface of cell $\mathcal{C}_{j}$. + +| Image | OCR Markdown | +|----------------------------|---------------------------| +| [1](data/examples/100.png) | [1](data/examples/100.md) | +| [2](data/examples/300.png) | [2](data/examples/300.md) | +| [3](data/examples/400.png) | [3](data/examples/400.md) | + # Installation -This has been tested on Mac and Linux (Ubuntu and Debian). You'll need python 3.10+ and [poetry](https://python-poetry.org/docs/#installing-with-the-official-installer). +This has been tested on Mac and Linux (Ubuntu and Debian). You'll need python 3.10+ and PyTorch. 
You may need to install the CPU version of torch first if you're not using a Mac or a GPU machine. See [here](https://pytorch.org/get-started/locally/) for more details. -- `git clone https://github.com/VikParuchuri/texify.git` -- `cd texify` -- `poetry install --without dev` # This skips the dev dependencies +`pip install texify` Model weights will automatically download the first time you run it. # Usage -First, some configuration: - -- Inspect the settings in `texify/settings.py`. You can override any settings in a `local.env` file, or by setting environment variables. +- Inspect the settings in `texify/settings.py`. You can override any settings with environment variables. - Your torch device will be automatically detected, but you can override this. For example, `TORCH_DEVICE=cuda` or `TORCH_DEVICE=mps`. ## App for interactive conversion -I've included a streamlit app that lets you interactively select and convert equations from images or PDF files. To run it, do this: +I've included a streamlit app that lets you interactively select and convert equations from images or PDF files. Run it with: ``` -streamlit run ocr_app.py +texify_gui ``` The app will allow you to select the specific equations you want to convert on each page, then render the results with KaTeX and enable easy copying. -## Convert an image or directory of images +## Convert images -Run `ocr_image.py`, like this: +You can OCR a single image or a folder of images with: ``` -python ocr_image.py /path/to/folder_or_file --max 8 --json_path results.json +texify /path/to/folder_or_file --max 8 --json_path results.json ``` - `--max` is how many images in the folder to convert at most. Omit this to convert all images in the folder. - `--json_path` is an optional path to a json file where the results will be saved. If you omit this, the results will be saved to `data/results.json`. 
+# Manual install + +If you want to develop texify, you can install it manually: + +- `git clone https://github.com/VikParuchuri/texify.git` +- `cd texify` +- `poetry install` # This installs all dependencies, including the dev dependencies + # Limitations OCR is complicated, and texify is not perfect. Here are some known limitations: @@ -76,30 +86,39 @@ OCR is complicated, and texify is not perfect. Here are some known limitations: # Benchmarks -Benchmarking OCR quality is hard - you ideally need a parallel corpus that models haven't been trained on. I've sampled some images from across a range of sources (web, arxiv, im2latex) to create a representative benchmark set. +Benchmarking OCR quality is hard - you ideally need a parallel corpus that models haven't been trained on. I sampled from arxiv and im2latex to create the benchmark set. + +![Benchmark results](data/images/texify_bench.png) + +Each model is trained on one of the benchmark tasks: + +- Nougat was trained on arxiv, possibly the images in the benchmark. +- Pix2tex was trained on im2latex. +- Texify was trained on im2latex. It was trained on arxiv, but not the images in the benchmark. -Of these, here is what is known about the training data: +Although this makes the benchmark results biased, it does seem like a good compromise, since nougat and pix2tex don't work as well out of domain. Note that neither pix2tex nor nougat is really designed for this task (OCR inline equations and text), so this is not a perfect comparison. -- Nougat was trained on arxiv. -- Pix2tex was trained on im2latex and web images. -- Texify was trained on im2latex and web images. +| Model | BLEU ⬆ | METEOR ⬆ | Edit Distance ⬇ | +|---------|--------------|--------------|-----------------| +| pix2tex | 0.382659 | 0.543363 | 0.352533 | +| nougat | 0.697667 | 0.668331 | 0.288159 | +| texify | **0.837895** | **0.865492** | **0.0842209** | ## Running your own benchmarks You can benchmark the performance of texify on your machine. 
-- Clone the repo if you haven't already (see above for manual installation instructions) -- Install dev dependencies with `poetry install` - - If you want to use pix2tex, run `pip install pix2tex` - - If you want to use nougat, run `pip install nougat-ocr` -- Download the benchmark data [here]() and put it in the `data` folder. +- Follow the manual install instructions above. +- If you want to use pix2tex, run `pip install pix2tex` +- If you want to use nougat, run `pip install nougat-ocr` +- Download the benchmark data [here](https://drive.google.com/file/d/1dbY0kBq2SUa885gmbLPUWSRzy5K7O5XJ/view?usp=sharing) and put it in the `data` folder. - Run `benchmark.py` like this: ``` python benchmark.py --max 100 --pix2tex --nougat --data_path data/bench_data.json --result_path data/bench_results.json ``` -This will benchmark marker against Latex-OCR. It will do batch inference with texify, but not with Latex-OCR, since I couldn't find an option for batching. +This will benchmark texify against pix2tex and nougat. It will do batch inference with texify and nougat, but not with pix2tex, since I couldn't find an option for batching. - `--max` is how many benchmark images to convert at most. - `--data_path` is the path to the benchmark data. If you omit this, it will use the default path. @@ -109,17 +128,17 @@ This will benchmark marker against Latex-OCR. It will do batch inference with t # Training -Texify was trained on latex images and paired equations from across the web. It includes the [im2latex](https://github.com/guillaumegenthial/im2latex) dataset. Training happened on 4x A6000 GPUs for 3 days. +Texify was trained on latex images and paired equations from across the web. It includes the [im2latex](https://github.com/guillaumegenthial/im2latex) dataset. Training happened on 4x A6000s for 2 days (~6 epochs). 
# Commercial usage -This model is trained on top of the openly licensed [Donut](https://huggingface.co/naver-clova-ix/donut-base) model, and thus can be used for commercial purposes. +This model is trained on top of the openly licensed [Donut](https://huggingface.co/naver-clova-ix/donut-base) model, and thus can be used for commercial purposes. Model weights are licensed under the [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) license. # Thanks -This work would not have been possible without lots of amazing open source work. I particularly want to acknowledge Lukas Blecher, whose work on Nougat and Latex-OCR was key for this project. I learned a lot from his code, and used parts of it for texify. +This work would not have been possible without lots of amazing open source work. I particularly want to acknowledge [Lukas Blecher](https://github.com/lukas-blecher), whose work on Nougat and pix2tex was key for this project. I learned a lot from his code, and used parts of it for texify. 
- [im2latex](https://github.com/guillaumegenthial/im2latex) - one of the datasets used for training - [Donut](https://huggingface.co/naver-clova-ix/donut-base) from Naver, the base model for texify -- [Nougat](https://github.com/facebookresearch/nougat) - I used the tokenized from Nougat +- [Nougat](https://github.com/facebookresearch/nougat) - I used the tokenizer from Nougat - [Latex-OCR](https://github.com/lukas-blecher/LaTeX-OCR) - The original open source Latex OCR project \ No newline at end of file diff --git a/benchmark.py b/benchmark.py index dc526de..519119d 100644 --- a/benchmark.py +++ b/benchmark.py @@ -146,6 +146,7 @@ def main(): source_path = os.path.abspath(args.data_path) result_path = os.path.abspath(args.result_path) + os.makedirs(os.path.dirname(result_path), exist_ok=True) model = load_model() processor = load_processor() diff --git a/ocr_app.py b/ocr_app.py index 7a13200..2afbd0e 100644 --- a/ocr_app.py +++ b/ocr_app.py @@ -9,6 +9,7 @@ from texify.inference import batch_inference from texify.model.model import load_model from texify.model.processor import load_processor +from texify.settings import settings import subprocess import re from PIL import Image @@ -30,9 +31,9 @@ def load_processor_cached(): @st.cache_data() -def infer_image(pil_image, bbox): +def infer_image(pil_image, bbox, temperature): input_img = pil_image.crop(bbox) - model_output = batch_inference([input_img], model, processor) + model_output = batch_inference([input_img], model, processor, temperature=temperature) return model_output[0] @@ -85,7 +86,9 @@ def get_image_size(pil_image): top_message = """### Texify -After the model loads, upload an image or a pdf, then draw a box around the equation or text you want to OCR by clicking and dragging. Texify will convert it to Markdown with LaTeX math on the right. If you have already cropped your image, select "OCR image" in the sidebar instead. 
+After the model loads, upload an image or a pdf, then draw a box around the equation or text you want to OCR by clicking and dragging. Texify will convert it to Markdown with LaTeX math on the right. If you don't get good results, try selecting a slightly different box, or changing the temperature value. + +If you have already cropped your image, select "OCR image" in the sidebar instead. """ st.markdown(top_message) @@ -109,6 +112,8 @@ def get_image_size(pil_image): pil_image = get_uploaded_image(in_file) whole_image = st.sidebar.button("OCR image") +temperature = st.sidebar.slider("Generation temperature:", min_value=0.0, max_value=1.0, value=0.0, step=0.05) + canvas_hash = get_canvas_hash(pil_image) if pil_image else "canvas" with col1: @@ -140,7 +145,7 @@ def get_image_size(pil_image): if bbox_list: with col2: - inferences = [infer_image(pil_image, bbox) for bbox in bbox_list] + inferences = [infer_image(pil_image, bbox, temperature) for bbox in bbox_list] for idx, inference in enumerate(reversed(inferences)): st.markdown(f"### {len(inferences) - idx}") katex_markdown = replace_katex_invalid(inference) diff --git a/ocr_image.py b/ocr_image.py index e876a29..3ad2361 100644 --- a/ocr_image.py +++ b/ocr_image.py @@ -14,9 +14,8 @@ def inference_single_image(image_path, json_path, model, processor): image = Image.open(image_path) text = batch_inference([image], model, processor) write_data = [{"image_path": image_path, "text": text[0]}] - with open(json_path, "w") as f: + with open(json_path, "w+") as f: json_repr = json.dumps(write_data, indent=4) - print(json_repr) f.write(json_repr) @@ -34,9 +33,8 @@ def inference_image_dir(image_dir, json_path, model, processor, max=None): for image_path, t in zip(batch, text): write_data.append({"image_path": image_path, "text": t}) - with open(json_path, "w") as f: + with open(json_path, "w+") as f: json_repr = json.dumps(write_data, indent=4) - print(json_repr) f.write(json_repr) @@ -52,6 +50,7 @@ def main(): processor = 
load_processor() json_path = os.path.abspath(args.json_path) + os.makedirs(os.path.dirname(json_path), exist_ok=True) if os.path.isfile(image_path): inference_single_image(image_path, json_path, model, processor) diff --git a/poetry.lock b/poetry.lock index 581edc7..8b00dad 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3609,25 +3609,21 @@ watchdog = {version = ">=2.1.5", markers = "platform_system != \"Darwin\""} snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python (>=0.9.0)"] [[package]] -name = "streamlit-drawable-canvas" +name = "streamlit-drawable-canvas-jsretry" version = "0.9.3" -description = "" +description = "A Streamlit custom component for a free drawing canvas using Fabric.js. A fork to enable retrying for bg images." optional = false python-versions = ">=3.6" -files = [] -develop = false +files = [ + {file = "streamlit-drawable-canvas-jsretry-0.9.3.tar.gz", hash = "sha256:d9da8a863faeeae01c8521e8e282ed83cc15b845962519149a61fc8eead7afe6"}, + {file = "streamlit_drawable_canvas_jsretry-0.9.3-py3-none-any.whl", hash = "sha256:e8035daa0297b504cc184e58ddf15cfd59680241ce1c2d0d554de507a263ca20"}, +] [package.dependencies] numpy = "*" Pillow = "*" streamlit = ">=0.63" -[package.source] -type = "git" -url = "https://github.com/VikParuchuri/streamlit-drawable-canvas.git" -reference = "develop" -resolved_reference = "c6b9290ddfae8f347c652986cfd844065c6195c5" - [[package]] name = "sympy" version = "1.12" @@ -4501,4 +4497,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "478d2aced0cd44b507e46f980fd5b28dd0ccc446367a6e39aabd1bd80cd5ca49" +content-hash = "20021764b80e2bafe0a5f923108cf684ddeea08cc50a04aae0835891d479646e" diff --git a/pyproject.toml b/pyproject.toml index 7014d74..82e99a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "texify" -version = "0.1.0" +version = "0.1.2" 
description = "OCR for latex images" authors = ["Vik Paruchuri "] readme = "README.md" @@ -26,9 +26,9 @@ numpy = "^1.26.2" pypdfium2 = "^4.25.0" python-dotenv = "^1.0.0" watchdog = "^3.0.0" -streamlit-drawable-canvas = {git = "https://github.com/VikParuchuri/streamlit-drawable-canvas.git", rev = "develop"} ftfy = "^6.1.3" tabulate = "^0.9.0" +streamlit-drawable-canvas-jsretry = "^0.9.3" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" diff --git a/run_ocr_app.py b/run_ocr_app.py index 718f32e..27235fe 100644 --- a/run_ocr_app.py +++ b/run_ocr_app.py @@ -1,5 +1,8 @@ import subprocess +import os def run_app(): - subprocess.run(["streamlit", "run", "ocr_app.py"]) \ No newline at end of file + cur_dir = os.path.dirname(os.path.abspath(__file__)) + ocr_app_path = os.path.join(cur_dir, "ocr_app.py") + subprocess.run(["streamlit", "run", ocr_app_path]) \ No newline at end of file diff --git a/scripts/verify_benchmark_scores.py b/scripts/verify_benchmark_scores.py new file mode 100644 index 0000000..494ba8c --- /dev/null +++ b/scripts/verify_benchmark_scores.py @@ -0,0 +1,20 @@ +import json +import argparse + + +def verify_scores(file_path): + with open(file_path, 'r') as file: + data = json.load(file) + + scores = data["texify"]["scores"] + + if scores["bleu"] <= 0.7 or scores["meteor"] <= 0.7 or scores["edit"] > 0.2: + print(scores) + raise ValueError("Scores do not meet the required threshold") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Verify benchmark scores") + parser.add_argument("file_path", type=str, help="Path to the json file") + args = parser.parse_args() + verify_scores(args.file_path) diff --git a/texify/inference.py b/texify/inference.py index a53ab56..34fa8f2 100644 --- a/texify/inference.py +++ b/texify/inference.py @@ -2,16 +2,23 @@ from texify.output import postprocess -def batch_inference(images, model, processor): +def batch_inference(images, model, processor, temperature=settings.TEMPERATURE): images = 
[image.convert("RGB") for image in images] encodings = processor(images=images, return_tensors="pt", add_special_tokens=False) pixel_values = encodings["pixel_values"].to(settings.MODEL_DTYPE) pixel_values = pixel_values.to(settings.TORCH_DEVICE_MODEL) + additional_kwargs = {} + if temperature > 0: + additional_kwargs["temperature"] = temperature + additional_kwargs["do_sample"] = True + additional_kwargs["top_p"] = 0.95 + generated_ids = model.generate( pixel_values=pixel_values, max_new_tokens=settings.MAX_TOKENS, - decoder_start_token_id=processor.tokenizer.bos_token_id + decoder_start_token_id=processor.tokenizer.bos_token_id, + **additional_kwargs, ) generated_text = processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) diff --git a/texify/settings.py b/texify/settings.py index 78e1f1d..f157bab 100644 --- a/texify/settings.py +++ b/texify/settings.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import Dict, Optional from dotenv import find_dotenv from pydantic import computed_field @@ -8,12 +8,13 @@ class Settings(BaseSettings): # General - TORCH_DEVICE: str = None - MAX_TOKENS: int = 384 + TORCH_DEVICE: Optional[str] = None + MAX_TOKENS: int = 384 # Will not work well above 768, since it was not trained with more MAX_IMAGE_SIZE: Dict = {"height": 420, "width": 420} MODEL_CHECKPOINT: str = "vikp/texify" BATCH_SIZE: int = 16 # Should use ~5GB of RAM DATA_DIR: str = "data" + TEMPERATURE: float = 0.0 # Temperature for generation, 0.0 means greedy @computed_field @property