Skip to content

Commit

Permalink
refactor(docling): pipeline options (#21951)
Browse files: browse the repository at this point in the history
  • Loading branch information
hongbo-miao authored Dec 25, 2024
1 parent c605af1 commit 876ad3c
Show file tree
Hide file tree
Showing 16 changed files with 34 additions and 10 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data
machine-learning/graph-neural-network/dataset
machine-learning/graph-neural-network/wandb
machine-learning/hm-autogluon/AutogluonModels
machine-learning/hm-docling/data
machine-learning/hm-langchain/applications/*/data
machine-learning/hm-llama-index/applications/*/data
machine-learning/hm-mlflow/experiments/*/data
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data
machine-learning/graph-neural-network/dataset
machine-learning/graph-neural-network/wandb
machine-learning/hm-autogluon/AutogluonModels
machine-learning/hm-docling/data
machine-learning/hm-langchain/applications/*/data
machine-learning/hm-llama-index/applications/*/data
machine-learning/hm-mlflow/experiments/*/data
Expand Down
1 change: 1 addition & 0 deletions .markdownlint-cli2.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@
"machine-learning/graph-neural-network/dataset",
"machine-learning/graph-neural-network/wandb",
"machine-learning/hm-autogluon/AutogluonModels",
"machine-learning/hm-docling/data",
"machine-learning/hm-langchain/applications/*/data",
"machine-learning/hm-llama-index/applications/*/data",
"machine-learning/hm-mlflow/experiments/*/data",
Expand Down
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data
machine-learning/graph-neural-network/dataset
machine-learning/graph-neural-network/wandb
machine-learning/hm-autogluon/AutogluonModels
machine-learning/hm-docling/data
machine-learning/hm-langchain/applications/*/data
machine-learning/hm-llama-index/applications/*/data
machine-learning/hm-mlflow/experiments/*/data
Expand Down
1 change: 1 addition & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ AllCops:
- 'machine-learning/graph-neural-network/dataset/**/*'
- 'machine-learning/graph-neural-network/wandb/**/*'
- 'machine-learning/hm-autogluon/AutogluonModels/**/*'
- 'machine-learning/hm-docling/data/**/*'
- 'machine-learning/hm-langchain/applications/*/data/**/*'
- 'machine-learning/hm-llama-index/applications/*/data/**/*'
- 'machine-learning/hm-mlflow/experiments/*/data/**/*'
Expand Down
1 change: 1 addition & 0 deletions .ruff.toml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ exclude = [
"machine-learning/graph-neural-network/dataset",
"machine-learning/graph-neural-network/wandb",
"machine-learning/hm-autogluon/AutogluonModels",
"machine-learning/hm-docling/data",
"machine-learning/hm-langchain/applications/*/data",
"machine-learning/hm-llama-index/applications/*/data",
"machine-learning/hm-mlflow/experiments/*/data",
Expand Down
1 change: 1 addition & 0 deletions .solhintignore
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ machine-learning/feature-store/driver_features/data
machine-learning/graph-neural-network/dataset
machine-learning/graph-neural-network/wandb
machine-learning/hm-autogluon/AutogluonModels
machine-learning/hm-docling/data
machine-learning/hm-langchain/applications/*/data
machine-learning/hm-llama-index/applications/*/data
machine-learning/hm-mlflow/experiments/*/data
Expand Down
1 change: 1 addition & 0 deletions .sqlfluffignore
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ machine-learning/feature-store/driver_features/data
machine-learning/graph-neural-network/dataset
machine-learning/graph-neural-network/wandb
machine-learning/hm-autogluon/AutogluonModels
machine-learning/hm-docling/data
machine-learning/hm-langchain/applications/*/data
machine-learning/hm-llama-index/applications/*/data
machine-learning/hm-mlflow/experiments/*/data
Expand Down
1 change: 1 addition & 0 deletions .stylelintignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data
machine-learning/graph-neural-network/dataset
machine-learning/graph-neural-network/wandb
machine-learning/hm-autogluon/AutogluonModels
machine-learning/hm-docling/data
machine-learning/hm-langchain/applications/*/data
machine-learning/hm-llama-index/applications/*/data
machine-learning/hm-mlflow/experiments/*/data
Expand Down
1 change: 1 addition & 0 deletions .textlintignore
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data/**/*
machine-learning/graph-neural-network/dataset/**/*
machine-learning/graph-neural-network/wandb/**/*
machine-learning/hm-autogluon/AutogluonModels/**/*
machine-learning/hm-docling/data/**/*
machine-learning/hm-langchain/applications/*/data/**/*
machine-learning/hm-llama-index/applications/*/data/**/*
machine-learning/hm-mlflow/experiments/*/data/**/*
Expand Down
1 change: 1 addition & 0 deletions .yamllint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ ignore: |
machine-learning/graph-neural-network/dataset
machine-learning/graph-neural-network/wandb
machine-learning/hm-autogluon/AutogluonModels
machine-learning/hm-docling/data
machine-learning/hm-langchain/applications/*/data
machine-learning/hm-llama-index/applications/*/data
machine-learning/hm-mlflow/experiments/*/data
Expand Down
1 change: 1 addition & 0 deletions eslint.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ export default [
'machine-learning/graph-neural-network/dataset',
'machine-learning/graph-neural-network/wandb',
'machine-learning/hm-autogluon/AutogluonModels',
'machine-learning/hm-docling/data',
'machine-learning/hm-langchain/applications/*/data',
'machine-learning/hm-llama-index/applications/*/data',
'machine-learning/hm-mlflow/experiments/*/data',
Expand Down
Empty file.
26 changes: 19 additions & 7 deletions machine-learning/hm-docling/src/main.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,31 @@
import logging
from pathlib import Path

from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PdfPipelineOptions,
TableStructureOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


def main() -> None:
data_dir = Path("data")
pdf_paths = data_dir.glob("**/*.pdf")

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True)
pipeline_options = PdfPipelineOptions(
do_ocr=True,
do_table_structure=True,
table_structure_options=TableStructureOptions(do_cell_matching=True),
ocr_options=EasyOcrOptions(),
)
converter = DocumentConverter(
allowed_formats=[InputFormat.PDF],
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
},
)

converter = DocumentConverter()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,6 @@ def main(model_path: str, pdf_path: str, question: str) -> None:
# https://gpt4all.io/index.html
external_model_path = "data/ggml-model-gpt4all-falcon-q4_0.bin"
external_pdf_path = "data/my.pdf"
external_question = "Could you please summarize this PDF? Thank you!"
external_question = "Could you please summarize this PDF?"

main(external_model_path, external_pdf_path, external_question)
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def main() -> None:
temperature=0.7,
)

pdf_path = Path("data/paper.pdf")
question = "Could you please summarize this PDF? Thank you!"
pdf_path = Path("data/file.pdf")
question = "Could you please summarize this PDF?"
answer = chat_with_pdf(pdf_path, question)
logging.info(answer)

Expand Down

0 comments on commit 876ad3c

Please sign in to comment.