diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57235cc11..916cdd56d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: sudo apt-get update sudo apt-get install podman bats bash codespell python3-argcomplete pipx make install-requirements - pip install tqdm --break-system-packages + pip install docling tqdm - name: run bats run: | @@ -37,7 +37,7 @@ jobs: - name: bats-nocontainer run: | - pip install tqdm --break-system-packages + pip install docling tqdm make bats-nocontainer docker: @@ -73,7 +73,7 @@ jobs: - name: bats-docker run: | docker info - pip install tqdm --break-system-packages + pip install docling tqdm make bats-docker macos: @@ -90,7 +90,7 @@ jobs: run: | make install-requirements make validate - pipx install . + pipx install . docling make bats-nocontainer # FIXME: ci script should be able to run on MAC. diff --git a/docs/ramalama-rag.1.md b/docs/ramalama-rag.1.md index a6bfa7cf8..6e2dda963 100644 --- a/docs/ramalama-rag.1.md +++ b/docs/ramalama-rag.1.md @@ -5,14 +5,12 @@ ramalama\-rag - generate rag (Retrieval Augmented Generation) data from provided ## SYNOPSIS **ramalama rag** [options] [path ...] image - ## DESCRIPTION Generate rag data from provided documents and convert into an OCI Image positional arguments: path Files/Directory containing PDF, DOCX, PPTX, XLSX, HTML, AsciiDoc & Markdown formatted files to be processed. Can be specified multiple times. - image OCI Image name to contain processed rag data @@ -21,6 +19,15 @@ positional arguments: #### **--help**, **-h** Print usage message +## EXAMPLES + +``` +$ ramalama rag https://arxiv.org/pdf/2408.09869 /tmp/pdf quay.io/rhatdan/myrag +Fetching 9 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 68509.50it/s] +Neither CUDA nor MPS are available - defaulting to CPU. 
Note: This module is much faster with a GPU.
+2024-12-04 13:49:07.372 ( 70.927s) [ 75AB6740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected
+```
+
 ## SEE ALSO
 **[ramalama(1)](ramalama.1.md)**
diff --git a/pyproject.toml b/pyproject.toml
index b521f1b97..7e42bc64c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,6 +4,7 @@ version = "0.2.0"
 dependencies = [
     "argcomplete",
     "tqdm",
+    "docling",
 ]
 requires-python = ">= 3.8"
 maintainers = [
diff --git a/ramalama/rag.py b/ramalama/rag.py
index a45302170..84947d5ee 100644
--- a/ramalama/rag.py
+++ b/ramalama/rag.py
@@ -1,3 +1,160 @@
+import json
+import logging
+import os
+import tempfile
+from pathlib import Path
+from typing import Iterable
+
+import yaml  # NOTE(review): not referenced anywhere in this module - confirm it is needed
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import ConversionResult
+from docling.document_converter import DocumentConverter
+
+from ramalama.common import run_cmd
+
+_log = logging.getLogger(__name__)
+
+ociimage_rag = "org.containers.type=ai.image.rag"
+
+
+def walk(path):
+    """Return the paths of all regular files found beneath *path*."""
+    targets = []
+    for root, _dirs, files in os.walk(path, topdown=True):
+        for name in files:
+            candidate = os.path.join(root, name)
+            if os.path.isfile(candidate):
+                targets.append(candidate)
+    return targets
+
+
+def export_documents(
+    conv_results: Iterable[ConversionResult],
+    output_dir: Path,
+):
+    """Write every successfully converted document to *output_dir* as JSON.
+
+    Partially converted and failed documents are logged and counted but not
+    exported.
+
+    Args:
+        conv_results: Conversion results to export.
+        output_dir: Destination directory; created when missing.
+
+    Returns:
+        Tuple ``(success_count, partial_success_count, failure_count)``.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    success_count = 0
+    failure_count = 0
+    partial_success_count = 0
+    used_names = set()
+
+    for conv_res in conv_results:
+        if conv_res.status == ConversionStatus.SUCCESS:
+            success_count += 1
+            # Inputs from different directories can share a stem; add a numeric
+            # suffix so a later document never overwrites an earlier export.
+            stem = conv_res.input.file.stem
+            doc_filename = stem
+            collisions = 0
+            while doc_filename in used_names:
+                collisions += 1
+                doc_filename = f"{stem}_{collisions}"
+            used_names.add(doc_filename)
+
+            # Export Docling document format to JSON:
+            with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
+                json.dump(conv_res.document.export_to_dict(), fp)
+
+        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
+            _log.info("Document %s was partially converted with the following errors:", conv_res.input.file)
+            for item in conv_res.errors:
+                _log.info("\t%s", item.error_message)
+            partial_success_count += 1
+        else:
+            _log.info("Document %s failed to convert.", conv_res.input.file)
+            failure_count += 1
+
+    _log.info(
+        "Processed %d docs, of which %d failed and %d were partially converted.",
+        success_count + partial_success_count + failure_count,
+        failure_count,
+        partial_success_count,
+    )
+    return success_count, partial_success_count, failure_count
+
+
+def build(source, target, args):
+    """Build the OCI image *target* containing the content of *source*.
+
+    Args:
+        source: File or directory to copy into the image root.
+        target: Name given to the built image.
+        args: Parsed command line; ``engine`` and ``debug`` are read.
+
+    Returns:
+        The engine's quiet-mode output (the image ID) with whitespace stripped.
+    """
+    print(f"Building {target}...")
+    src = os.path.realpath(source)
+    if os.path.isdir(src):
+        # Use the directory itself as build context rather than its parent
+        # (typically all of /tmp); COPY of a directory copies its contents.
+        contextdir, content = src, "."
+    else:
+        contextdir, content = os.path.dirname(src), os.path.basename(src)
+
+    # delete=False: the engine reopens the Containerfile by name, so it is
+    # closed here and removed explicitly once the build has finished.
+    with tempfile.NamedTemporaryFile(
+        mode="w", prefix='RamaLama_Containerfile_', delete=False
+    ) as containerfile:
+        containerfile.write(
+            f"""\
+FROM scratch
+COPY {content} /
+LABEL {ociimage_rag}
+"""
+        )
+    try:
+        imageid = (
+            run_cmd(
+                [args.engine, "build", "-t", target, "--no-cache", "-q", "-f", containerfile.name, contextdir],
+                debug=args.debug,
+            )
+            .stdout.decode("utf-8")
+            .strip()
+        )
+    finally:
+        os.unlink(containerfile.name)
+    return imageid
+
+
 def generate(args):
-    print(args.PATH)
-    print(args.IMAGE)
+    """Convert the documents in ``args.PATH`` and package them as ``args.IMAGE``.
+
+    Each path may be a file, a directory (walked recursively) or anything
+    else, which is handed to the converter unchanged (for example a URL).
+
+    Raises:
+        RuntimeError: If at least one target fails to convert.
+    """
+    targets = []
+    for p in args.PATH:
+        if os.path.isdir(p):
+            targets.extend(walk(p))  # Walk directory and process all files
+        else:
+            targets.append(p)  # Selected file, or a non-local source (WEB?)
+
+    converter = DocumentConverter()
+    # TemporaryDirectory only accepts ``delete`` on Python >= 3.12, but the
+    # project supports >= 3.8; the context manager always removes the directory.
+    with tempfile.TemporaryDirectory(prefix="ramalama_") as tmpdir:
+        conv_results = converter.convert_all(targets, raises_on_error=False)
+        _success, _partial, failure_count = export_documents(conv_results, output_dir=Path(tmpdir))
+        if failure_count > 0:
+            raise RuntimeError(f"failed to convert {failure_count} target(s) out of {len(targets)} documents.")
+
+        build(tmpdir, args.IMAGE, args)