Skip to content

Commit

Permalink
Performance exploration with onnx (#2)
Browse files Browse the repository at this point in the history
* adds onnx model

* adds onnx model and utilities

* rollback
  • Loading branch information
sfc-gh-dkajtoch authored Nov 12, 2024
1 parent 67ab18a commit 65d4e39
Show file tree
Hide file tree
Showing 8 changed files with 145 additions and 18 deletions.
Empty file added profile/__init__.py
Empty file.
15 changes: 15 additions & 0 deletions profile/commons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os
from pdf2image import convert_from_bytes
from PIL.Image import Image


def load_pdf_bytes() -> bytes:
    """Return the raw bytes of the sample PDF stored beside this module.

    The file is read from ``resources/sample_pdf.pdf`` relative to the
    directory that contains this source file.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), "resources")
    sample_path = os.path.join(resources_dir, "sample_pdf.pdf")
    with open(sample_path, "rb") as pdf_file:
        payload = pdf_file.read()
    return payload


def pdf_to_image(pdf_bytes: bytes, dpi: int) -> Image:
    """Render the first page of a PDF as a colour PIL image.

    Args:
        pdf_bytes: Raw content of the PDF document.
        dpi: Resolution handed to the rasteriser.

    Returns:
        The image of the first page.

    Raises:
        IndexError: If the converter returns no pages.
    """
    # Only page 1 is returned, so limit rendering to that page instead of
    # rasterising the whole document and throwing the rest away.
    images = convert_from_bytes(
        pdf_bytes,
        dpi=dpi,
        grayscale=False,
        first_page=1,
        last_page=1,
    )
    return images[0]
54 changes: 54 additions & 0 deletions profile/paddle_onnx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from paddleocr import PaddleOCR

import os
import numpy as np
import timeit
import click

from profile.commons import load_pdf_bytes, pdf_to_image
import onnxruntime as ort


def _init_model(base_dir_path: str, intra_op_num_threads: int = 8) -> PaddleOCR:
    """Build a PaddleOCR pipeline that runs its ONNX models on the CPU provider.

    Args:
        base_dir_path: Directory holding ``det.onnx``, ``rec.onnx`` and
            ``cls.onnx``.
        intra_op_num_threads: Value assigned to the ONNX Runtime session
            option of the same name. Defaults to 8, the value that used to
            be hard-coded, so existing callers are unaffected.

    Returns:
        A ``PaddleOCR`` instance configured with ``use_onnx=True``.
    """
    sess_options = ort.SessionOptions()
    sess_options.intra_op_num_threads = intra_op_num_threads
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

    return PaddleOCR(
        use_onnx=True,
        det_model_dir=os.path.join(base_dir_path, "det.onnx"),
        rec_model_dir=os.path.join(base_dir_path, "rec.onnx"),
        cls_model_dir=os.path.join(base_dir_path, "cls.onnx"),
        ocr_version="PP-OCRv4",
        rec_batch_num=6,
        onnx_providers=["CPUExecutionProvider"],
        onnx_sess_options=sess_options,
    )


@click.command()
@click.option("--paddle-dir-path", default="/home/play/models/paddle2onnx/")
def main(
    paddle_dir_path: str,
):
    """Time ONNX-backed PaddleOCR on the first page of the sample PDF.

    The page is rendered at 200 dpi and OCR (detection + recognition, no
    classification) is run four times, one call per measurement. The first
    measurement is left out of the printed mean and standard deviation.
    """
    pdf_bytes = load_pdf_bytes()
    image = pdf_to_image(pdf_bytes, dpi=200)
    # Convert once, outside the timed callable, so each sample measures only
    # the OCR call and not the PIL -> ndarray conversion.
    image_array = np.asarray(image)

    model = _init_model(paddle_dir_path)

    times = timeit.Timer(
        lambda: model.ocr(
            image_array,
            det=True,
            rec=True,
            cls=False,
        )
    ).repeat(repeat=4, number=1)

    # Skip the first sample (presumably a warm-up run) when summarising.
    steady_times = times[1:]
    print(f"Mean time: {np.mean(steady_times)}")
    print(f"Std time: {np.std(steady_times)}")


if __name__ == "__main__":
    main()
23 changes: 5 additions & 18 deletions profile/profile_paddleocr.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
from paddleocr import PaddleOCR

from PIL.Image import Image
import os
import numpy as np
from pdf2image import convert_from_bytes
import timeit
import cProfile
import click
from pydantic_settings import BaseSettings, SettingsConfigDict
from profile.commons import load_pdf_bytes, pdf_to_image


class PaddleSettings(BaseSettings):
Expand All @@ -27,18 +26,6 @@ class PaddleSettings(BaseSettings):
model_config = SettingsConfigDict(env_prefix="PADDLE_")


def _load_pdf_bytes() -> bytes:
    """Return the raw bytes of ``resources/sample_pdf.pdf`` beside this module."""
    module_dir = os.path.dirname(__file__)
    sample_path = os.path.join(module_dir, "resources", "sample_pdf.pdf")
    with open(sample_path, "rb") as pdf_file:
        payload = pdf_file.read()
    return payload


def pdf_to_image(pdf_bytes: bytes, dpi: int) -> Image:
    """Convert a PDF to colour images at ``dpi`` and return the first one."""
    pages = convert_from_bytes(pdf_bytes, grayscale=False, dpi=dpi)
    first_page = pages[0]
    return first_page


class TimeitDpi:
def __init__(self, settings: PaddleSettings) -> None:
self.settings = settings
Expand All @@ -47,7 +34,7 @@ def __init__(self, settings: PaddleSettings) -> None:
self.std_exec_time = [0] * len(self.dpi_values)

def setup(self) -> None:
self.pdf_bytes = _load_pdf_bytes()
self.pdf_bytes = load_pdf_bytes()
self.model = PaddleOCR(**self.settings.model_dump())

def timeit(self) -> None:
Expand Down Expand Up @@ -75,7 +62,7 @@ def __init__(self, settings: PaddleSettings) -> None:
self.dpi_values = [50, 100, 150, 200, 250, 300]

def setup(self) -> None:
self.pdf_bytes = _load_pdf_bytes()
self.pdf_bytes = load_pdf_bytes()
self.model = PaddleOCR(**self.settings.model_dump())

def profile(self) -> None:
Expand All @@ -95,7 +82,7 @@ def profile(self) -> None:


def print_recognition_pred():
pdf_bytes = _load_pdf_bytes()
pdf_bytes = load_pdf_bytes()
model = PaddleOCR(det=False, rec=True, cls=False)
image = pdf_to_image(pdf_bytes, dpi=300)
ocr_result = model.ocr(np.asarray(image), det=False, rec=True, cls=False)
Expand All @@ -107,7 +94,7 @@ def print_recognition_pred():
"--mode",
default="timeit",
help="Mode to run the script in.",
choices=["timeit", "cprofile", "print_recognition_pred"],
type=click.Choice(["timeit", "cprofile", "print_recognition_pred"]),
)
def main(mode: str) -> None:
if mode == "timeit":
Expand Down
30 changes: 30 additions & 0 deletions profile/scripts/convert_to_onnx.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash
#
# Convert the PaddleOCR detection, recognition and classification inference
# models to ONNX with paddle2onnx. The results are written to
# $SAVE_DIR_PATH as det.onnx, rec.onnx and cls.onnx.

# Stop at the first failed conversion instead of carrying on silently.
set -euo pipefail

DET_PATH=~/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer/
REC_PATH=~/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/
CLS_PATH=~/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/
SAVE_DIR_PATH=~/models/paddle2onnx

# Make sure the output directory exists before the first conversion.
mkdir -p "$SAVE_DIR_PATH"

# NOTE(review): the --custom_ops value below looks like a placeholder
# mapping rather than a real operator pair -- confirm it is intended.

paddle2onnx --model_dir "$DET_PATH" \
  --model_filename inference.pdmodel \
  --params_filename inference.pdiparams \
  --save_file "$SAVE_DIR_PATH/det.onnx" \
  --opset_version 14 \
  --enable_onnx_checker True \
  --custom_ops '{"paddle_op":"onnx_op"}'

paddle2onnx --model_dir "$REC_PATH" \
  --model_filename inference.pdmodel \
  --params_filename inference.pdiparams \
  --save_file "$SAVE_DIR_PATH/rec.onnx" \
  --opset_version 14 \
  --enable_onnx_checker True \
  --custom_ops '{"paddle_op":"onnx_op"}'

# NOTE(review): unlike det/rec, the cls file names repeat the model
# directory name as a sub-directory -- verify this matches the layout on disk.
# The save path previously used "@SAVE_DIR_PATH", which is not a variable
# expansion and sent cls.onnx to a literal "@SAVE_DIR_PATH" directory.
paddle2onnx --model_dir "$CLS_PATH" \
  --model_filename ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel \
  --params_filename ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams \
  --save_file "$SAVE_DIR_PATH/cls.onnx" \
  --opset_version 14 \
  --enable_onnx_checker True \
  --custom_ops '{"paddle_op":"onnx_op"}'
Empty file added profile/utilities/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions profile/utilities/onnx_quantization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from onnxruntime.quantization import (
quantize_dynamic,
QuantType,
quant_pre_process,
)
import os
import click


# TODO: running this currently fails with the error "Incomplete symbolic
# shape inference", raised from onnxruntime.tools.symbolic_shape_infer (line 2932).
def run(dir_path: str, filename: str) -> None:
    """Pre-process and dynamically quantize one ONNX model file.

    Two files are written into ``dir_path``: ``<stem>_infer.onnx``, the
    output of ``quant_pre_process``, and ``<stem>_quant.onnx``, the output of
    ``quantize_dynamic`` with ``QInt8`` weights. ``<stem>`` is ``filename``
    without its final extension.

    Args:
        dir_path: Directory containing the input model; outputs go here too.
        filename: Name of the ONNX model file inside ``dir_path``.
    """
    # splitext drops only the last extension, so a dotted name such as
    # "det.v2.onnx" keeps the stem "det.v2" instead of collapsing to "det"
    # and colliding with other models' output names.
    base_name = os.path.splitext(filename)[0]
    model_path = os.path.join(dir_path, filename)
    infer_model_path = os.path.join(dir_path, f"{base_name}_infer.onnx")
    quant_model_path = os.path.join(dir_path, f"{base_name}_quant.onnx")
    quant_pre_process(
        input_model=model_path,
        output_model_path=infer_model_path,
    )
    quantize_dynamic(
        model_input=infer_model_path,
        model_output=quant_model_path,
        weight_type=QuantType.QInt8,
    )


# CLI entry point: quantize the detection model, then the recognition model,
# both located in the directory given by --dir_path.
@click.command()
@click.option("--dir_path", type=str, required=True)
@click.option("--det_filename", type=str, required=True, default="det.onnx")
@click.option("--rec_filename", type=str, required=True, default="rec.onnx")
def main(dir_path: str, det_filename: str, rec_filename: str) -> None:
    # Same order as before: detection first, recognition second.
    run(dir_path=dir_path, filename=det_filename)
    run(dir_path=dir_path, filename=rec_filename)


if __name__ == "__main__":
    main()
3 changes: 3 additions & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@ line-profiler==4.1.3
pdf2image==1.17.0
pydantic-settings==2.6.0
click==8.1.7
paddlepaddle==2.6.0
paddle2onnx==1.2.11
onnxruntime==1.20.0

0 comments on commit 65d4e39

Please sign in to comment.