diff --git a/.github/workflows/ci-images.yml b/.github/workflows/ci-images.yml index 273d3dea..b96bac0f 100644 --- a/.github/workflows/ci-images.yml +++ b/.github/workflows/ci-images.yml @@ -15,7 +15,7 @@ on: jobs: build: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b05a4a0c..916cdd56 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ on: jobs: bats: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: install bats @@ -17,7 +17,7 @@ jobs: sudo apt-get update sudo apt-get install podman bats bash codespell python3-argcomplete pipx make install-requirements - pip install tqdm --break-system-packages + pip install docling tqdm --break-system-packages - name: run bats run: | @@ -25,7 +25,7 @@ jobs: make bats bats-nocontainer: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: install bats @@ -37,11 +37,11 @@ jobs: - name: bats-nocontainer run: | - pip install tqdm --break-system-packages + pip install docling tqdm --break-system-packages make bats-nocontainer docker: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: install bats @@ -73,11 +73,11 @@ jobs: - name: bats-docker run: | docker info - pip install tqdm --break-system-packages + pip install docling tqdm --break-system-packages make bats-docker macos: - runs-on: macos-14 + runs-on: macos-latest steps: - uses: actions/checkout@v4 - name: install golang @@ -90,7 +90,7 @@ jobs: run: | make install-requirements make validate - pipx install . + pipx install . docling make bats-nocontainer # FIXME: ci script should be able to run on MAC. 
diff --git a/.github/workflows/latest.yml b/.github/workflows/latest.yml index 232d7c0a..8e77c730 100644 --- a/.github/workflows/latest.yml +++ b/.github/workflows/latest.yml @@ -6,7 +6,7 @@ on: jobs: linux: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Free Disk Space Linux @@ -41,7 +41,7 @@ jobs: run: make test macos: - runs-on: macos-14 + runs-on: macos-latest steps: - uses: actions/checkout@v4 - name: install golang @@ -52,7 +52,7 @@ jobs: run: make test build: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: [linux, macos] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 49726b28..96186e28 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -6,7 +6,7 @@ on: jobs: linux: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Free Disk Space Linux @@ -36,7 +36,7 @@ jobs: run: make test macos: - runs-on: macos-14 + runs-on: macos-latest steps: - uses: actions/checkout@v4 - name: install golang @@ -47,7 +47,7 @@ jobs: run: make test build: - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest needs: [linux, macos] steps: - uses: actions/checkout@v4 diff --git a/docs/ramalama-info.1.md b/docs/ramalama-info.1.md index 5664848e..d6ad6143 100644 --- a/docs/ramalama-info.1.md +++ b/docs/ramalama-info.1.md @@ -1,7 +1,7 @@ % ramalama-info 1 ## NAME -ramalama\-info - Display RamaLama configuration information +ramalama\-info - display RamaLama configuration information ## SYNOPSIS diff --git a/docs/ramalama-rag.1.md b/docs/ramalama-rag.1.md new file mode 100644 index 00000000..6e2dda96 --- /dev/null +++ b/docs/ramalama-rag.1.md @@ -0,0 +1,35 @@ +% ramalama-rag 1 + +## NAME +ramalama\-rag - generate rag (Retrieval Augmented Generation) data from provided documents and convert into an OCI Image + +## SYNOPSIS +**ramalama rag** [options] [path ...] 
image + +## DESCRIPTION +Generate rag data from provided documents and convert into an OCI Image + +positional arguments: + path Files/Directory containing PDF, DOCX, PPTX, XLSX, HTML, AsciiDoc & Markdown formatted files to be processed. Can be specified multiple times. + image OCI Image name to contain processed rag data + + +## OPTIONS + +#### **--help**, **-h** +Print usage message + +## EXAMPLES + +``` +$ ramalama rag https://arxiv.org/pdf/2408.09869 /tmp/pdf quay.io/rhatdan/myrag +Fetching 9 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 68509.50it/s] +Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU. +2024-12-04 13:49:07.372 ( 70.927s) [ 75AB6740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected +``` + +## SEE ALSO +**[ramalama(1)](ramalama.1.md)** + +## HISTORY +Dec 2024, Originally compiled by Dan Walsh diff --git a/docs/ramalama.1.md b/docs/ramalama.1.md index 6cccfc1e..75c72f14 100644 --- a/docs/ramalama.1.md +++ b/docs/ramalama.1.md @@ -130,12 +130,13 @@ The default can be overridden in the ramalama.conf file. 
| ------------------------------------------------- | ---------------------------------------------------------- | | [ramalama-containers(1)](ramalama-containers.1.md)| list all RamaLama containers | | [ramalama-convert(1)](ramalama-convert.1.md) | convert AI Models from local storage to OCI Image | -| [ramalama-info(1)](ramalama-info.1.md) | Display RamaLama configuration information | +| [ramalama-info(1)](ramalama-info.1.md) | display RamaLama configuration information | | [ramalama-list(1)](ramalama-list.1.md) | list all downloaded AI Models | | [ramalama-login(1)](ramalama-login.1.md) | login to remote registry | | [ramalama-logout(1)](ramalama-logout.1.md) | logout from remote registry | | [ramalama-pull(1)](ramalama-pull.1.md) | pull AI Models from Model registries to local storage | | [ramalama-push(1)](ramalama-push.1.md) | push AI Models from local storage to remote registries | +| [ramalama-rag(1)](ramalama-rag.1.md) | generate rag (Retrieval Augmented Generation) data from provided documents and convert into an OCI Image | | [ramalama-rm(1)](ramalama-rm.1.md) | remove AI Models from local storage | | [ramalama-run(1)](ramalama-run.1.md) | run specified AI Model as a chatbot | | [ramalama-serve(1)](ramalama-serve.1.md) | serve REST API on specified AI Model | diff --git a/pyproject.toml b/pyproject.toml index b521f1b9..7e42bc64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ version = "0.2.0" dependencies = [ "argcomplete", "tqdm", + "docling", ] requires-python = ">= 3.8" maintainers = [ diff --git a/ramalama/cli.py b/ramalama/cli.py index 153347d3..831a789e 100644 --- a/ramalama/cli.py +++ b/ramalama/cli.py @@ -6,6 +6,7 @@ import subprocess import time import ramalama.oci +import ramalama.rag from ramalama.huggingface import Huggingface from ramalama.common import ( @@ -14,6 +15,7 @@ perror, run_cmd, ) + from ramalama.model import model_types from ramalama.oci import OCI from ramalama.ollama import Ollama @@ -229,6 +231,7 @@ def 
configure_subcommands(parser): logout_parser(subparsers) pull_parser(subparsers) push_parser(subparsers) + rag_parser(subparsers) rm_parser(subparsers) run_parser(subparsers) serve_parser(subparsers) @@ -408,7 +411,7 @@ def list_containers(args): def info_parser(subparsers): - parser = subparsers.add_parser("info", help="Display information pertaining to setup of RamaLama.") + parser = subparsers.add_parser("info", help="display information pertaining to setup of RamaLama.") parser.add_argument("--container", default=False, action="store_false", help=argparse.SUPPRESS) parser.set_defaults(func=info_cli) @@ -771,6 +774,26 @@ def version_parser(subparsers): parser.set_defaults(func=print_version) +def rag_parser(subparsers): + parser = subparsers.add_parser( + "rag", + help="generate rag (Retrieval Augmented Generation) data from provided documents and convert into an OCI Image", + ) + parser.add_argument( + "PATH", + nargs="*", + help="""\ +Files/Directory containing PDF, DOCX, PPTX, XLSX, HTML, AsciiDoc & Markdown +formatted files to be processed""", + ) + parser.add_argument("IMAGE", help="OCI Image name to contain processed rag data") + parser.set_defaults(func=rag_cli) + + +def rag_cli(args): + ramalama.rag.generate(args) + + def rm_parser(subparsers): parser = subparsers.add_parser("rm", help="remove AI Model from local storage") parser.add_argument("--container", default=False, action="store_false", help=argparse.SUPPRESS) diff --git a/ramalama/rag.py b/ramalama/rag.py new file mode 100644 index 00000000..833497be --- /dev/null +++ b/ramalama/rag.py @@ -0,0 +1,110 @@ +import tempfile +import os +import json +import logging +from pathlib import Path +from typing import Iterable + +from ramalama.common import run_cmd +from docling.datamodel.base_models import ConversionStatus +from docling.datamodel.document import ConversionResult +from docling.document_converter import DocumentConverter + +_log = logging.getLogger(__name__) + +ociimage_rag = 
"org.containers.type=ai.image.rag" + + +def walk(path): + targets = [] + for root, dirs, files in os.walk(path, topdown=True): + if len(files) == 0: + continue + for f in files: + file = os.path.join(root, f) + if os.path.isfile(file): + targets.append(file) + return targets + + +def export_documents( + conv_results: Iterable[ConversionResult], + output_dir: Path, +): + output_dir.mkdir(parents=True, exist_ok=True) + + success_count = 0 + failure_count = 0 + partial_success_count = 0 + + for conv_res in conv_results: + if conv_res.status == ConversionStatus.SUCCESS: + success_count += 1 + doc_filename = conv_res.input.file.stem + + # Export Docling document format to JSON: + with (output_dir / f"{doc_filename}.json").open("w") as fp: + fp.write(json.dumps(conv_res.document.export_to_dict())) + + elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS: + _log.info(f"Document {conv_res.input.file} was partially converted with the following errors:") + for item in conv_res.errors: + _log.info(f"\t{item.error_message}") + partial_success_count += 1 + else: + _log.info(f"Document {conv_res.input.file} failed to convert.") + failure_count += 1 + + _log.info( + f"Processed {success_count + partial_success_count + failure_count} docs, " + f"of which {failure_count} failed " + f"and {partial_success_count} were partially converted." + ) + return success_count, partial_success_count, failure_count + + +def build(source, target, args): + print(f"Building {target}...") + src = os.path.realpath(source) + contextdir = os.path.dirname(src) + model = os.path.basename(src) + containerfile = tempfile.NamedTemporaryFile(prefix='RamaLama_Containerfile_', delete=True) + # Open the file for writing. 
+    with open(containerfile.name, 'w') as c: +        c.write( +            f"""\ +FROM scratch +COPY {model} / +LABEL {ociimage_rag} +""" +        ) +    imageid = ( +        run_cmd( +            [args.engine, "build", "-t", target, "--no-cache", "-q", "-f", containerfile.name, contextdir], +            debug=args.debug, +        ) +        .stdout.decode("utf-8") +        .strip() +    ) +    return imageid + + +def generate(args): +    tmpdir = tempfile.TemporaryDirectory(prefix="ramalama_") +    targets = [] +    for p in args.PATH: +        if os.path.isfile(p): +            targets.append(p)  # Process selected file +            continue +        if os.path.isdir(p): +            targets.extend(walk(p))  # Walk directory and process all files +            continue +        targets.append(p)  # WEB? + +    converter = DocumentConverter() +    conv_results = converter.convert_all(targets, raises_on_error=False) +    success_count, partial_success_count, failure_count = export_documents(conv_results, output_dir=Path(tmpdir.name)) +    if failure_count > 0: +        raise RuntimeError(f"failed to convert {failure_count} target(s) out of {len(targets)} documents.") + +    build(tmpdir.name, args.IMAGE, args)