Add ramalama rag command

Signed-off-by: Daniel J Walsh <[email protected]>
containers · Dec 6, 2024 · ff9f7fa · ff9f7fa
1 parent 62a51ba
commit ff9f7fa
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 8 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
            sudo apt-get update
            sudo apt-get install podman bats bash codespell python3-argcomplete pipx
            make install-requirements
-           pip install tqdm --break-system-packages
+           pip install docling tqdm
 
       - name: run bats
         run: |
@@ -37,7 +37,7 @@ jobs:
 
       - name: bats-nocontainer
         run: |
-           pip install tqdm --break-system-packages
+           pip install docling tqdm
            make bats-nocontainer
 
   docker:
@@ -73,7 +73,7 @@ jobs:
       - name: bats-docker
         run: |
            docker info
-           pip install tqdm --break-system-packages
+           pip install docling tqdm
            make bats-docker
 
   macos:
@@ -90,7 +90,7 @@ jobs:
         run: |
            make install-requirements
            make validate
-           pipx install .
+           pipx install . docling
            make bats-nocontainer
 
 # FIXME: ci script should be able to run on MAC.

diff --git a/docs/ramalama-rag.1.md b/docs/ramalama-rag.1.md
@@ -5,14 +5,12 @@ ramalama\-rag - generate rag (Retrieval Augmented Generation) data from provided
 
 ## SYNOPSIS
 **ramalama rag** [options] [path ...] image
-
 
 ## DESCRIPTION
 Generate rag data from the provided documents and convert it into an OCI Image
 
 positional arguments:
   path        Files/Directory containing PDF, DOCX, PPTX, XLSX, HTML, AsciiDoc & Markdown formatted files to be processed. Can be specified multiple times.
-
   image       OCI Image name to contain processed rag data
 
 
@@ -21,6 +19,15 @@ positional arguments:
 #### **--help**, **-h**
 Print usage message
 
+## EXAMPLES
+
+```
+$ ramalama rag https://arxiv.org/pdf/2408.09869 /tmp/pdf quay.io/rhatdan/myrag
+Fetching 9 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 68509.50it/s]
+Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
+2024-12-04 13:49:07.372 (  70.927s) [        75AB6740]    doc_normalisation.h:448   WARN| found new `other` type: checkbox-unselected
+```
+
 ## SEE ALSO
 **[ramalama(1)](ramalama.1.md)**
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,6 +4,7 @@ version = "0.2.0"
 dependencies = [
   "argcomplete",
   "tqdm",
+  "docling",
 ]
 requires-python = ">= 3.8"
 maintainers = [

diff --git a/ramalama/rag.py b/ramalama/rag.py
@@ -1,3 +1,106 @@
+import tempfile
+import os
+import json
+import logging
+from pathlib import Path
+from typing import Iterable
+
+import yaml
+
+from ramalama.common import run_cmd
+from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.document import ConversionResult
+from docling.document_converter import DocumentConverter
+
+# Module-level logger, named after this module.
+_log = logging.getLogger(__name__)
+
+# Label (key=value) emitted through the LABEL instruction of the Containerfile
+# that build() generates.
+ociimage_rag = "org.containers.type=ai.image.rag"
+
+def walk(path):
+    """Return the paths of all regular files found beneath *path*.
+
+    The tree is traversed top-down with ``os.walk``.  Each file name is joined
+    to the directory it was found in and kept only when ``os.path.isfile``
+    confirms it.  Directories themselves are never part of the result.
+    """
+    found = []
+    for parent, _subdirs, names in os.walk(path, topdown=True):
+        candidates = (os.path.join(parent, name) for name in names)
+        found.extend(entry for entry in candidates if os.path.isfile(entry))
+    return found
+
+
+def export_documents(
+    conv_results: Iterable[ConversionResult],
+    output_dir: Path,
+):
+    """Write every successfully converted document to *output_dir* as JSON.
+
+    *output_dir* is created if it does not exist.  A result whose status is
+    ``ConversionStatus.SUCCESS`` is serialized to ``<input stem>.json``.  A
+    partially converted result only has its errors logged, and any other
+    status is logged as a failure; neither of those is written to disk.
+
+    Returns:
+        A ``(success, partial_success, failure)`` tuple of counts.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    succeeded = partial = failed = 0
+
+    for result in conv_results:
+        status = result.status
+
+        if status == ConversionStatus.SUCCESS:
+            succeeded += 1
+            # One JSON file per document, named after the input file's stem.
+            destination = output_dir / f"{result.input.file.stem}.json"
+            with destination.open("w") as fp:
+                fp.write(json.dumps(result.document.export_to_dict()))
+            continue
+
+        if status == ConversionStatus.PARTIAL_SUCCESS:
+            _log.info(f"Document {result.input.file} was partially converted with the following errors:")
+            for item in result.errors:
+                _log.info(f"\t{item.error_message}")
+            partial += 1
+            continue
+
+        # Every remaining status is counted as a failure.
+        _log.info(f"Document {result.input.file} failed to convert.")
+        failed += 1
+
+    _log.info(
+        f"Processed {succeeded + partial + failed} docs, "
+        f"of which {failed} failed "
+        f"and {partial} were partially converted."
+    )
+    return succeeded, partial, failed
+
+
+def build(source, target, args):
+    """Build an OCI image named *target* that contains *source*.
+
+    A throw-away Containerfile is generated that copies the final path
+    component of *source* (after resolving symlinks) into a ``scratch`` image
+    and applies the ``ociimage_rag`` label.  The parent directory of the
+    resolved *source* is used as the build context.  The Containerfile is
+    removed again once the build has finished, whether or not it succeeded.
+
+    Args:
+        source: Path of the file or directory to place in the image.
+        target: Name (tag) given to the resulting image.
+        args: Parsed CLI arguments; ``args.engine`` is the container engine
+            executable and ``args.debug`` is forwarded to ``run_cmd``.
+
+    Returns:
+        The stripped, UTF-8 decoded standard output of the engine's quiet
+        (``-q``) build, which is expected to be the image id.
+    """
+    print(f"Building {target}...")
+    src = os.path.realpath(source)
+    contextdir = os.path.dirname(src)
+    model = os.path.basename(src)
+
+    # delete=False lets the engine open the file by name after it is closed;
+    # the file is unlinked explicitly below instead of being left behind.
+    containerfile = tempfile.NamedTemporaryFile(mode="w", prefix='RamaLama_Containerfile_', delete=False)
+    try:
+        with containerfile as c:
+            c.write(f"FROM scratch\nCOPY {model} /\nLABEL {ociimage_rag}\n")
+        result = run_cmd(
+            [args.engine, "build", "-t", target, "--no-cache", "-q", "-f", containerfile.name, contextdir],
+            debug=args.debug,
+        )
+    finally:
+        os.unlink(containerfile.name)
+
+    return result.stdout.decode("utf-8").strip()
+
 def generate(args):
-    print(args.PATH)
-    print(args.IMAGE)
+    """Convert the inputs in ``args.PATH`` and package the result as ``args.IMAGE``.
+
+    Directories in ``args.PATH`` are expanded to the files beneath them; every
+    other entry is passed to the docling converter as given.  The converted
+    documents are exported into a temporary directory, which is then built
+    into the image and removed afterwards.
+
+    Args:
+        args: Parsed CLI arguments providing ``PATH`` (iterable of inputs),
+            ``IMAGE`` (target image name) and the attributes ``build`` reads.
+
+    Raises:
+        RuntimeError: If at least one target failed to convert.  Partial
+            conversions do not abort the run.
+    """
+    targets = []
+    for p in args.PATH:
+        if os.path.isdir(p):
+            targets.extend(walk(p))  # Walk directory and process all files
+        else:
+            # An existing file is processed directly; anything else is handed
+            # to the converter unchanged (presumably a URL -- the original
+            # comment here reads "WEB?").
+            targets.append(p)
+
+    # TemporaryDirectory cleans up by default.  Its ``delete`` keyword only
+    # exists on Python >= 3.12, so it is not passed; the ``with`` block makes
+    # the clean-up deterministic on every supported interpreter.
+    with tempfile.TemporaryDirectory(prefix="ramalama_") as tmpdir:
+        converter = DocumentConverter()
+        conv_results = converter.convert_all(targets, raises_on_error=False)
+        _, _, failure_count = export_documents(conv_results, output_dir=Path(tmpdir))
+        if failure_count > 0:
+            raise RuntimeError(f"failed to convert {failure_count} target(s) out of {len(targets)} documents.")
+
+        build(tmpdir, args.IMAGE, args)