Add helper for getting content from metadata (#231)

NVIDIA · Nov 26, 2024 · c76631d · c76631d
1 parent 00abf47
commit c76631d
Show file tree

Hide file tree

Showing 3 changed files with 142 additions and 1 deletion.
diff --git a/client/setup.py b/client/setup.py
@@ -59,7 +59,7 @@ def read_requirements(file_name):
     author_email="[email protected]",
     classifiers=[],
     description="Python client for the nv-ingest service",
-    entry_points={"console_scripts": ["nv-ingest-cli=nv_ingest_client.nv_ingest_cli:main"]},
+    entry_points={"console_scripts": ["nv-ingest-cli=nv_ingest_client.nv_ingest_cli:main", "process-json-files=nv_ingest_client.util.process_json_files:main"]},
     install_requires=combined_requirements,
     name="nv_ingest_client",
     package_dir={"": "src"},

diff --git a/client/src/nv_ingest_client/util/process_json_files.py b/client/src/nv_ingest_client/util/process_json_files.py
@@ -0,0 +1,99 @@
+import click
+import json
+
+def ingest_json_results_to_blob(result_content):
+    """
+    Parse JSON results, sort the entries, and join their content into one blob string.
+
+    Entries are ordered by page number; within a page, structured (table) entries that
+    carry a table location come first, ordered by its x0 and then y0. Tables contribute
+    their table content, text entries their content, and images an ``image_caption:[...]``
+    marker; each emitted piece is followed by a newline. Other document types are skipped.
+
+    Args:
+        result_content: A JSON array as ``str``/``bytes``/``bytearray``, or a file-like
+            object exposing ``read()`` (for example ``io.BytesIO``).
+
+    Returns:
+        str: The generated blob string, or an empty string when processing fails.
+    """
+    try:
+        # json.loads accepts str/bytes/bytearray but not file objects, so drain those first.
+        if hasattr(result_content, 'read'):
+            result_content = result_content.read()
+        data = json.loads(result_content)
+
+        def sorting_key(entry):
+            page = entry['metadata']['content_metadata']['page_number']
+            # Entries without a usable table location sort after the located tables.
+            x0 = y0 = float('inf')
+            if entry['document_type'] == 'structured':
+                location = entry['metadata']['table_metadata'].get('table_location') or ()
+                if len(location) >= 2:
+                    x0, y0 = location[0], location[1]
+            return page, x0, y0
+
+        data.sort(key=sorting_key)
+
+        blob = []
+        for entry in data:
+            document_type = entry.get('document_type', '')
+            if document_type == 'structured':
+                blob.append(entry['metadata']['table_metadata']['table_content'])
+            elif document_type == 'text':
+                blob.append(entry['metadata']['content'])
+            elif document_type == 'image':
+                caption = entry['metadata']['image_metadata'].get('caption', '')
+                blob.append(f"image_caption:[{caption}]")
+            else:
+                continue
+            blob.append("\n")
+
+        return ''.join(blob)
+
+    except Exception as e:
+        # Deliberately best-effort: failures are reported and signalled by an empty blob.
+        print(f"[ERROR] An error occurred while processing JSON content: {e}")
+        return ""
+
+@click.command()
+@click.argument('json_files', type=click.Path(exists=True, dir_okay=False), nargs=-1, required=True)
+@click.option('--output-file', type=click.Path(dir_okay=False, writable=True, resolve_path=True), required=True,
+              help="Path to save the combined blob output file.")
+def main(json_files, output_file):
+    """
+    Process multiple JSON files, combine and sort entries, and generate a single blob file.
+
+    JSON_FILES: One or more JSON files to process.
+    """
+    click.echo(f"Processing {len(json_files)} JSON files...")
+    all_entries = []
+
+    try:
+        for json_file in json_files:
+            click.echo(f"Reading file: {json_file}")
+            # JSON is UTF-8 by specification; do not depend on the platform default encoding.
+            with open(json_file, 'r', encoding='utf-8') as file:
+                entries = json.load(file)
+            # extend() on a dict would silently add its keys, so insist on a top-level array.
+            if not isinstance(entries, list):
+                raise ValueError(f"{json_file} does not contain a JSON array of entries")
+            all_entries.extend(entries)
+
+        # The blob helper takes serialized JSON, so re-encode the merged entries.
+        blob_string = ingest_json_results_to_blob(json.dumps(all_entries))
+
+        if blob_string:
+            with open(output_file, 'w', encoding='utf-8') as file:
+                file.write(blob_string)
+            click.echo(f"Blob string has been generated and saved to: {output_file}")
+        else:
+            click.echo("No valid data processed. Blob file not created.")
+
+    except Exception as e:
+        click.echo(f"[ERROR] An error occurred: {e}")
+        raise SystemExit(1) from e  # report the failure to the calling shell
+
+
+if __name__ == "__main__":
+    main()  # run the click command when this module is executed as a script
diff --git a/client/src/nv_ingest_client/util/util.py b/client/src/nv_ingest_client/util/util.py
@@ -411,3 +411,45 @@ def filter_function_kwargs(func, **kwargs):
     args_dict = {k: kwargs.pop(k) for k in dict(kwargs) if k in func_args}
 
     return args_dict
+
+
+def get_content(results: List[any]) -> dict:
+    """
+    Extracts the text and table text content from the results of an NV-Ingest python client job
+
+    Parameters
+    ----------
+    results: List[Any]
+        The results of NV-Ingest python client job that contains the desired text and table content.
+        Each result is iterated as a sequence of element dicts; only elements whose
+        "document_type" is "text" or "structured" are used.
+
+    Returns
+    -------
+    Dict
+        A dictionary with two lists, "text_content" and "structured_content"; every item holds
+        the element's "page_number" and its "content" (text, or table content for tables).
+    """
+    # NOTE(review): the ``results`` annotation uses the builtin ``any``; ``typing.Any`` is presumably
+    # intended - switch once ``Any`` is confirmed to be imported by this module.
+    text_content = []
+    structured_content = []
+
+    # Single pass over every element; relative order within each output list is preserved.
+    for result in results:
+        for elem in result:
+            document_type = elem["document_type"]
+            if document_type not in ("text", "structured"):
+                continue
+
+            metadata = elem["metadata"]
+            if document_type == "text":
+                target, content = text_content, metadata["content"]
+            else:
+                # Table text is nested under "table_metadata", not directly under "metadata".
+                target, content = structured_content, metadata["table_metadata"]["table_content"]
+            target.append(
+                {"page_number": metadata["content_metadata"]["page_number"], "content": content}
+            )
+
+    return {"text_content": text_content, "structured_content": structured_content}