diff --git a/.dockerignore b/.dockerignore index 0d7e5a2ece..5eb5b7b85b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data machine-learning/graph-neural-network/dataset machine-learning/graph-neural-network/wandb machine-learning/hm-autogluon/AutogluonModels +machine-learning/hm-docling/data machine-learning/hm-langchain/applications/*/data machine-learning/hm-llama-index/applications/*/data machine-learning/hm-mlflow/experiments/*/data diff --git a/.gitignore b/.gitignore index 312027b3a4..0e145eb436 100644 --- a/.gitignore +++ b/.gitignore @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data machine-learning/graph-neural-network/dataset machine-learning/graph-neural-network/wandb machine-learning/hm-autogluon/AutogluonModels +machine-learning/hm-docling/data machine-learning/hm-langchain/applications/*/data machine-learning/hm-llama-index/applications/*/data machine-learning/hm-mlflow/experiments/*/data diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc index a839c102fd..db9a4e4ecd 100644 --- a/.markdownlint-cli2.jsonc +++ b/.markdownlint-cli2.jsonc @@ -126,6 +126,7 @@ "machine-learning/graph-neural-network/dataset", "machine-learning/graph-neural-network/wandb", "machine-learning/hm-autogluon/AutogluonModels", + "machine-learning/hm-docling/data", "machine-learning/hm-langchain/applications/*/data", "machine-learning/hm-llama-index/applications/*/data", "machine-learning/hm-mlflow/experiments/*/data", diff --git a/.prettierignore b/.prettierignore index f7767270db..7fabd52689 100644 --- a/.prettierignore +++ b/.prettierignore @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data machine-learning/graph-neural-network/dataset machine-learning/graph-neural-network/wandb machine-learning/hm-autogluon/AutogluonModels +machine-learning/hm-docling/data machine-learning/hm-langchain/applications/*/data machine-learning/hm-llama-index/applications/*/data 
machine-learning/hm-mlflow/experiments/*/data diff --git a/.rubocop.yml b/.rubocop.yml index 83f7d6cc95..6a80e3032f 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -123,6 +123,7 @@ AllCops: - 'machine-learning/graph-neural-network/dataset/**/*' - 'machine-learning/graph-neural-network/wandb/**/*' - 'machine-learning/hm-autogluon/AutogluonModels/**/*' + - 'machine-learning/hm-docling/data/**/*' - 'machine-learning/hm-langchain/applications/*/data/**/*' - 'machine-learning/hm-llama-index/applications/*/data/**/*' - 'machine-learning/hm-mlflow/experiments/*/data/**/*' diff --git a/.ruff.toml b/.ruff.toml index a25980d9f0..54d9153bf4 100644 --- a/.ruff.toml +++ b/.ruff.toml @@ -117,6 +117,7 @@ exclude = [ "machine-learning/graph-neural-network/dataset", "machine-learning/graph-neural-network/wandb", "machine-learning/hm-autogluon/AutogluonModels", + "machine-learning/hm-docling/data", "machine-learning/hm-langchain/applications/*/data", "machine-learning/hm-llama-index/applications/*/data", "machine-learning/hm-mlflow/experiments/*/data", diff --git a/.solhintignore b/.solhintignore index 6f2c584f54..aa430ad5dc 100644 --- a/.solhintignore +++ b/.solhintignore @@ -116,6 +116,7 @@ machine-learning/feature-store/driver_features/data machine-learning/graph-neural-network/dataset machine-learning/graph-neural-network/wandb machine-learning/hm-autogluon/AutogluonModels +machine-learning/hm-docling/data machine-learning/hm-langchain/applications/*/data machine-learning/hm-llama-index/applications/*/data machine-learning/hm-mlflow/experiments/*/data diff --git a/.sqlfluffignore b/.sqlfluffignore index 37e19776d8..b5e67653a5 100644 --- a/.sqlfluffignore +++ b/.sqlfluffignore @@ -117,6 +117,7 @@ machine-learning/feature-store/driver_features/data machine-learning/graph-neural-network/dataset machine-learning/graph-neural-network/wandb machine-learning/hm-autogluon/AutogluonModels +machine-learning/hm-docling/data machine-learning/hm-langchain/applications/*/data 
machine-learning/hm-llama-index/applications/*/data machine-learning/hm-mlflow/experiments/*/data diff --git a/.stylelintignore b/.stylelintignore index 2ea7db8f4b..42b2d00858 100644 --- a/.stylelintignore +++ b/.stylelintignore @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data machine-learning/graph-neural-network/dataset machine-learning/graph-neural-network/wandb machine-learning/hm-autogluon/AutogluonModels +machine-learning/hm-docling/data machine-learning/hm-langchain/applications/*/data machine-learning/hm-llama-index/applications/*/data machine-learning/hm-mlflow/experiments/*/data diff --git a/.textlintignore b/.textlintignore index a6199b491e..19cf7ec12f 100644 --- a/.textlintignore +++ b/.textlintignore @@ -118,6 +118,7 @@ machine-learning/feature-store/driver_features/data/**/* machine-learning/graph-neural-network/dataset/**/* machine-learning/graph-neural-network/wandb/**/* machine-learning/hm-autogluon/AutogluonModels/**/* +machine-learning/hm-docling/data/**/* machine-learning/hm-langchain/applications/*/data/**/* machine-learning/hm-llama-index/applications/*/data/**/* machine-learning/hm-mlflow/experiments/*/data/**/* diff --git a/.yamllint.yaml b/.yamllint.yaml index fa4732e714..1eec31c8a2 100644 --- a/.yamllint.yaml +++ b/.yamllint.yaml @@ -123,6 +123,7 @@ ignore: | machine-learning/graph-neural-network/dataset machine-learning/graph-neural-network/wandb machine-learning/hm-autogluon/AutogluonModels + machine-learning/hm-docling/data machine-learning/hm-langchain/applications/*/data machine-learning/hm-llama-index/applications/*/data machine-learning/hm-mlflow/experiments/*/data diff --git a/eslint.config.mjs b/eslint.config.mjs index 671304696e..22c2015742 100644 --- a/eslint.config.mjs +++ b/eslint.config.mjs @@ -130,6 +130,7 @@ export default [ 'machine-learning/graph-neural-network/dataset', 'machine-learning/graph-neural-network/wandb', 'machine-learning/hm-autogluon/AutogluonModels', + 
'machine-learning/hm-docling/data', 'machine-learning/hm-langchain/applications/*/data', 'machine-learning/hm-llama-index/applications/*/data', 'machine-learning/hm-mlflow/experiments/*/data', diff --git a/machine-learning/hm-docling/data/.gitkeep b/machine-learning/hm-docling/data/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/machine-learning/hm-docling/src/main.py b/machine-learning/hm-docling/src/main.py index c2627e78d4..3c0c9822bf 100644 --- a/machine-learning/hm-docling/src/main.py +++ b/machine-learning/hm-docling/src/main.py @@ -1,19 +1,31 @@ import logging from pathlib import Path -from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions -from docling.document_converter import DocumentConverter +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + PdfPipelineOptions, + TableStructureOptions, +) +from docling.document_converter import DocumentConverter, PdfFormatOption def main() -> None: data_dir = Path("data") pdf_paths = data_dir.glob("**/*.pdf") - pipeline_options = PdfPipelineOptions() - pipeline_options.do_ocr = True - pipeline_options.do_table_structure = True - pipeline_options.table_structure_options.do_cell_matching = True - pipeline_options.ocr_options = EasyOcrOptions(force_full_page_ocr=True) + pipeline_options = PdfPipelineOptions( + do_ocr=True, + do_table_structure=True, + table_structure_options=TableStructureOptions(do_cell_matching=True), + ocr_options=EasyOcrOptions(), + ) + # Build the only converter here so pipeline_options is passed in; a bare DocumentConverter() never receives it. + converter = DocumentConverter( + allowed_formats=[InputFormat.PDF], + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + }, + ) - converter = DocumentConverter() diff --git a/machine-learning/hm-langchain/applications/chat-pdf/src/main.py b/machine-learning/hm-langchain/applications/chat-pdf/src/main.py index 8461f566a5..ce08101af7 100644 --- a/machine-learning/hm-langchain/applications/chat-pdf/src/main.py +++ 
b/machine-learning/hm-langchain/applications/chat-pdf/src/main.py @@ -29,6 +29,6 @@ def main(model_path: str, pdf_path: str, question: str) -> None: # https://gpt4all.io/index.html external_model_path = "data/ggml-model-gpt4all-falcon-q4_0.bin" external_pdf_path = "data/my.pdf" - external_question = "Could you please summarize this PDF? Thank you!" + external_question = "Could you please summarize this PDF?" main(external_model_path, external_pdf_path, external_question) diff --git a/machine-learning/hm-llama-index/applications/chat-pdf/src/main.py b/machine-learning/hm-llama-index/applications/chat-pdf/src/main.py index 42f3c7e61c..dacd99e064 100644 --- a/machine-learning/hm-llama-index/applications/chat-pdf/src/main.py +++ b/machine-learning/hm-llama-index/applications/chat-pdf/src/main.py @@ -28,8 +28,8 @@ def main() -> None: temperature=0.7, ) - pdf_path = Path("data/paper.pdf") - question = "Could you please summarize this PDF? Thank you!" + pdf_path = Path("data/file.pdf") + question = "Could you please summarize this PDF?" answer = chat_with_pdf(pdf_path, question) logging.info(answer)