-
Notifications
You must be signed in to change notification settings - Fork 51
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Allow users to specify DOCX, PDF, Markdown ... files on the command line, then process them with docling and rag, finally putting the output into the specified container image. Signed-off-by: Daniel J Walsh <[email protected]>
- Loading branch information
Showing
10 changed files
with
188 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
% ramalama-rag 1 | ||
|
||
## NAME | ||
ramalama\-rag - generate RAG (Retrieval Augmented Generation) data from provided documents and convert it into an OCI Image | ||
|
||
## SYNOPSIS | ||
**ramalama rag** [options] [path ...] image | ||
|
||
## DESCRIPTION | ||
Generate RAG (Retrieval Augmented Generation) data from the provided documents and convert it into an OCI Image. | ||
|
||
positional arguments: | ||
path Files or directories containing PDF, DOCX, PPTX, XLSX, HTML, AsciiDoc & Markdown formatted files to be processed. Can be specified multiple times. | ||
image OCI Image name to contain processed rag data | ||
|
||
|
||
## OPTIONS | ||
|
||
#### **--help**, **-h** | ||
Print usage message | ||
|
||
## EXAMPLES | ||
|
||
``` | ||
$ ramalama rag https://arxiv.org/pdf/2408.09869 /tmp/pdf quay.io/rhatdan/myrag | ||
Fetching 9 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 68509.50it/s] | ||
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU. | ||
2024-12-04 13:49:07.372 ( 70.927s) [ 75AB6740] doc_normalisation.h:448 WARN| found new `other` type: checkbox-unselected | ||
``` | ||
|
||
## SEE ALSO | ||
**[ramalama(1)](ramalama.1.md)** | ||
|
||
## HISTORY | ||
Dec 2024, Originally compiled by Dan Walsh <[email protected]> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import tempfile | ||
import os | ||
import json | ||
import logging | ||
from pathlib import Path | ||
from typing import Iterable | ||
|
||
from ramalama.common import run_cmd | ||
from docling.datamodel.base_models import ConversionStatus | ||
from docling.datamodel.document import ConversionResult | ||
from docling.document_converter import DocumentConverter | ||
|
||
# Module-level logger, named after this module via __name__.
_log = logging.getLogger(__name__) | ||
|
||
# "key=value" label string; build() writes it as the LABEL line of the
# generated Containerfile.
ociimage_rag = "org.containers.type=ai.image.rag" | ||
|
||
|
||
def walk(path):
    """Return the regular files found beneath ``path``, recursively.

    The tree is traversed top-down and both directories and file names are
    visited in sorted order, so the result is deterministic for a given tree
    regardless of the order in which the filesystem lists entries.

    Args:
        path: Directory to scan.

    Returns:
        A list of file paths (each joined onto the directory it was found
        in). Entries that are not regular files, such as dangling symlinks,
        are skipped. A missing or empty directory yields an empty list.
    """
    targets = []
    for root, dirs, files in os.walk(path, topdown=True):
        # With topdown=True, os.walk honours in-place edits of ``dirs``;
        # sorting it fixes the order in which subdirectories are descended.
        dirs.sort()
        for name in sorted(files):
            candidate = os.path.join(root, name)
            if os.path.isfile(candidate):
                targets.append(candidate)
    return targets
|
||
|
||
def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
):
    """Write each successfully converted document to ``output_dir`` as JSON.

    Only results whose status is ``ConversionStatus.SUCCESS`` are exported;
    partially converted and failed documents are logged and counted but
    produce no output file.

    Output files are named after the stem of the input file. When several
    inputs share a stem (for example ``a/readme.md`` and ``b/readme.md``),
    later ones get a numeric suffix (``readme_1.json``, ...) so that they do
    not silently overwrite earlier ones.

    Args:
        conv_results: Conversion results to process.
        output_dir: Directory receiving the JSON files; created if missing.

    Returns:
        A ``(success_count, partial_success_count, failure_count)`` tuple.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0
    partial_success_count = 0
    used_names = set()  # output stems already written during this call

    for conv_res in conv_results:
        if conv_res.status == ConversionStatus.SUCCESS:
            success_count += 1
            stem = conv_res.input.file.stem

            # Pick a name not yet used so equal stems do not clobber each other.
            doc_filename = stem
            suffix = 1
            while doc_filename in used_names:
                doc_filename = f"{stem}_{suffix}"
                suffix += 1
            used_names.add(doc_filename)

            # Export Docling document format to JSON:
            with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
                json.dump(conv_res.document.export_to_dict(), fp)

        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                "Document %s was partially converted with the following errors:",
                conv_res.input.file,
            )
            for item in conv_res.errors:
                _log.info("\t%s", item.error_message)
            partial_success_count += 1
        else:
            _log.info("Document %s failed to convert.", conv_res.input.file)
            failure_count += 1

    _log.info(
        "Processed %d docs, of which %d failed and %d were partially converted.",
        success_count + partial_success_count + failure_count,
        failure_count,
        partial_success_count,
    )
    return success_count, partial_success_count, failure_count
|
||
|
||
def build(source, target, args):
    """Build an image from scratch containing ``source`` and return its id.

    A temporary Containerfile is generated that copies ``source`` into the
    image root and attaches the ``ociimage_rag`` label. The build is then run
    with ``args.engine``, using the parent directory of ``source`` as the
    build context.

    Args:
        source: Path of the file or directory to copy into the image.
        target: Name (tag) given to the built image.
        args: Object providing ``engine`` (the container engine command) and
            ``debug`` (forwarded to ``run_cmd``).

    Returns:
        The stripped standard output of the quiet (``-q``) build command.
    """
    print(f"Building {target}...")
    src = os.path.realpath(source)
    contextdir = os.path.dirname(src)
    model = os.path.basename(src)
    # Write through the temporary file's own handle and keep it open while the
    # engine runs; leaving the ``with`` block closes and removes it, so the
    # handle is never leaked.
    with tempfile.NamedTemporaryFile(mode="w", prefix='RamaLama_Containerfile_', delete=True) as containerfile:
        containerfile.write(
            f"""\
FROM scratch
COPY {model} /
LABEL {ociimage_rag}
"""
        )
        # The engine reads the file by name, so buffered data must hit disk first.
        containerfile.flush()
        result = run_cmd(
            [args.engine, "build", "-t", target, "--no-cache", "-q", "-f", containerfile.name, contextdir],
            debug=args.debug,
        )
    imageid = result.stdout.decode("utf-8").strip()
    return imageid
|
||
|
||
def generate(args):
    """Convert the requested documents and package the result as an image.

    Every entry of ``args.PATH`` that is a directory is expanded to the files
    beneath it; anything else (a file, or a non-local target such as a URL)
    is handed to the converter unchanged. The converted documents are written
    to a temporary directory, which is then built into the image named by
    ``args.IMAGE``. The temporary directory is removed when the function
    exits, whether it succeeds or raises.

    Args:
        args: Object providing ``PATH`` (iterable of targets), ``IMAGE``
            (image name) and whatever attributes ``build`` reads.

    Raises:
        RuntimeError: If at least one target failed to convert.
    """
    # The ``delete`` keyword only exists on Python >= 3.12 and True is the
    # default anyway; the context manager guarantees cleanup on every path.
    with tempfile.TemporaryDirectory(prefix="ramalama_") as tmpdir:
        targets = []
        for p in args.PATH:
            if os.path.isdir(p):
                targets.extend(walk(p))  # Walk directory and process all files
            else:
                targets.append(p)  # Selected file, or a non-local target (e.g. URL)

        converter = DocumentConverter()
        conv_results = converter.convert_all(targets, raises_on_error=False)
        _, _, failure_count = export_documents(conv_results, output_dir=Path(tmpdir))
        if failure_count > 0:
            raise RuntimeError(f"failed to convert {failure_count} target(s) out of {len(targets)} documents.")

        build(tmpdir, args.IMAGE, args)