Skip to content

Commit

Permalink
Add ramalama rag command
Browse files Browse the repository at this point in the history
Signed-off-by: Daniel J Walsh <[email protected]>
  • Loading branch information
rhatdan committed Dec 6, 2024
1 parent 62a51ba commit ff9f7fa
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 8 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
sudo apt-get update
sudo apt-get install podman bats bash codespell python3-argcomplete pipx
make install-requirements
pip install tqdm --break-system-packages
pip install docling tqdm
- name: run bats
run: |
Expand All @@ -37,7 +37,7 @@ jobs:
- name: bats-nocontainer
run: |
pip install tqdm --break-system-packages
pip install docling tqdm
make bats-nocontainer
docker:
Expand Down Expand Up @@ -73,7 +73,7 @@ jobs:
- name: bats-docker
run: |
docker info
pip install tqdm --break-system-packages
pip install docling tqdm
make bats-docker
macos:
Expand All @@ -90,7 +90,7 @@ jobs:
run: |
make install-requirements
make validate
pipx install .
pipx install . docling
make bats-nocontainer
# FIXME: ci script should be able to run on MAC.
Expand Down
11 changes: 9 additions & 2 deletions docs/ramalama-rag.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@ ramalama\-rag - generate rag (Retrieval Augmented Generation) data from provided

## SYNOPSIS
**ramalama rag** [options] [path ...] image


## DESCRIPTION
Generate RAG data from the provided documents and package it into an OCI image

positional arguments:
path Files/Directory containing PDF, DOCX, PPTX, XLSX, HTML, AsciiDoc & Markdown formatted files to be processed. Can be specified multiple times.

image OCI Image name to contain processed rag data


Expand All @@ -21,6 +19,15 @@ positional arguments:
#### **--help**, **-h**
Print usage message

## EXAMPLES

```
$ ramalama rag https://arxiv.org/pdf/2408.09869 /tmp/pdf quay.io/rhatdan/myrag
Fetching 9 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 68509.50it/s]
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
2024-12-04 13:49:07.372 ( 70.927s) [ 75AB6740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected
```

## SEE ALSO
**[ramalama(1)](ramalama.1.md)**

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.2.0"
dependencies = [
"argcomplete",
"tqdm",
"docling",
]
requires-python = ">= 3.8"
maintainers = [
Expand Down
107 changes: 105 additions & 2 deletions ramalama/rag.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,106 @@
import tempfile
import os
import json
import logging
from pathlib import Path
from typing import Iterable

import yaml

from ramalama.common import run_cmd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

# Module-level logger, named after this module.
_log = logging.getLogger(__name__)

# Label text that build() writes into the generated Containerfile's LABEL instruction.
ociimage_rag = "org.containers.type=ai.image.rag"

def walk(path):
    """Return the paths of all regular files found beneath ``path``.

    The tree is traversed top-down with ``os.walk``. Each returned entry is
    the directory reported by ``os.walk`` joined with the file name, so the
    results are absolute only if ``path`` is absolute.

    Args:
        path: Directory to traverse.

    Returns:
        A list of file paths, in the order ``os.walk`` yields them. The list
        is empty when nothing is found.
    """
    targets = []
    # The sub-directory names yielded by os.walk are not needed here.
    for root, _dirs, files in os.walk(path, topdown=True):
        for name in files:
            candidate = os.path.join(root, name)
            # os.walk lists every non-directory entry under ``files``; keep
            # only those that resolve to a regular file (this skips e.g.
            # broken symlinks).
            if os.path.isfile(candidate):
                targets.append(candidate)
    return targets


def _unique_stem(stem, used):
    """Return ``stem``, or ``stem_<n>`` if already taken, and record it in ``used``."""
    candidate = stem
    suffix = 0
    while candidate in used:
        suffix += 1
        candidate = f"{stem}_{suffix}"
    used.add(candidate)
    return candidate


def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    """Write every successfully converted document to ``output_dir`` as JSON.

    Each result with status ``ConversionStatus.SUCCESS`` is exported with
    ``document.export_to_dict()`` and written to ``<stem>.json``, where
    ``<stem>`` is the stem of the input file. When several inputs share the
    same stem (for example ``a/README.md`` and ``b/README.md``), later ones
    get a numeric suffix (``README_1.json``, ...) so they do not overwrite
    the earlier output.

    Partially converted and failed documents are only logged and counted;
    nothing is written for them.

    Args:
        conv_results: Conversion results to export.
        output_dir: Directory that receives the JSON files. It is created,
            including missing parents, if it does not exist.

    Returns:
        A tuple ``(success_count, partial_success_count, failure_count)``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0
    # Output names already handed out during this call, used to detect
    # inputs whose stems collide.
    used_names = set()

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = _unique_stem(conv_res.input.file.stem, used_names)

            # Export Docling document format to JSON:
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                json.dump(conv_res.document.export_to_dict(), fp)

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                "Document %s was partially converted with the following errors:",
                conv_res.input.file,
            )
            for item in conv_res.errors:
                _log.info("\t%s", item.error_message)
            partial_success_count += 1
        else:
            _log.info("Document %s failed to convert.", conv_res.input.file)
            failure_count += 1

    _log.info(
        "Processed %d docs, of which %d failed and %d were partially converted.",
        success_count + partial_success_count + failure_count,
        failure_count,
        partial_success_count,
    )
    return success_count, partial_success_count, failure_count


def build(source, target, args):
    """Build an image named ``target`` that contains ``source``.

    A temporary Containerfile is generated that starts ``FROM scratch``,
    copies ``source`` into ``/`` and applies the ``ociimage_rag`` label. The
    container engine is then invoked with the parent directory of the
    resolved ``source`` as build context.

    Args:
        source: Path of the file or directory to copy into the image.
        target: Name (tag) given to the built image.
        args: Parsed command-line arguments; ``args.engine`` is the container
            engine executable and ``args.debug`` is forwarded to ``run_cmd``.

    Returns:
        The engine's standard output, decoded as UTF-8 and stripped. With
        ``-q`` this is presumably the image ID; confirm against the engine
        in use.
    """
    print(f"Building {target}...")
    src = os.path.realpath(source)
    # NOTE(review): the build context is the *parent* of ``source``. For a
    # source under the system temp directory that parent may hold unrelated
    # files; consider whether using ``source`` itself as context is viable.
    contextdir = os.path.dirname(src)
    model = os.path.basename(src)

    # delete=False keeps the file on disk after the handle is closed, so the
    # engine can open it by path. It is removed explicitly below.
    with tempfile.NamedTemporaryFile(mode="w", prefix='RamaLama_Containerfile_', delete=False) as c:
        c.write(f"FROM scratch\nCOPY {model} /\nLABEL {ociimage_rag}\n")
        containerfile = c.name

    try:
        result = run_cmd(
            [args.engine, "build", "-t", target, "--no-cache", "-q", "-f", containerfile, contextdir],
            debug=args.debug,
        )
    finally:
        # Remove the Containerfile whether or not the build succeeded.
        try:
            os.unlink(containerfile)
        except FileNotFoundError:
            pass

    imageid = result.stdout.decode("utf-8").strip()
    return imageid

def generate(args):
    """Convert the requested documents and package the result as an image.

    Every entry of ``args.PATH`` is expanded into conversion targets: an
    existing file is used as is, an existing directory is replaced by all
    files found beneath it (see ``walk``), and anything else is passed to
    the converter unchanged. The converted documents are exported to a
    temporary directory, which is then built into the image ``args.IMAGE``.
    The temporary directory is removed when this function returns or raises.

    Args:
        args: Parsed command-line arguments providing ``PATH`` (iterable of
            inputs) and ``IMAGE`` (image name), plus whatever ``build`` reads.

    Raises:
        RuntimeError: If at least one target failed to convert.
    """
    print(args.PATH)
    print(args.IMAGE)

    targets = []
    for p in args.PATH:
        if os.path.isfile(p):
            targets.append(p)  # Process selected file
        elif os.path.isdir(p):
            targets.extend(walk(p))  # Walk directory and process all files
        else:
            # Not an existing local file or directory; presumably a URL that
            # the converter fetches itself. Pass it through untouched.
            targets.append(p)

    # The ``delete`` keyword of TemporaryDirectory only exists on Python
    # 3.12+. The context manager removes the directory on exit on every
    # supported version, including when an exception propagates.
    with tempfile.TemporaryDirectory(prefix="ramalama_") as tmpdir:
        converter = DocumentConverter()
        conv_results = converter.convert_all(targets, raises_on_error=False)
        _, _, failure_count = export_documents(conv_results, output_dir=Path(tmpdir))
        if failure_count > 0:
            raise RuntimeError(f"failed to convert {failure_count} target(s) out of {len(targets)} documents.")

        build(tmpdir, args.IMAGE, args)

0 comments on commit ff9f7fa

Please sign in to comment.