Skip to content

Commit

Permalink
Add helper for getting content from metadata (#231)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisJar authored Nov 26, 2024
1 parent 00abf47 commit c76631d
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 1 deletion.
2 changes: 1 addition & 1 deletion client/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def read_requirements(file_name):
author_email="[email protected]",
classifiers=[],
description="Python client for the nv-ingest service",
entry_points={"console_scripts": ["nv-ingest-cli=nv_ingest_client.nv_ingest_cli:main"]},
entry_points={"console_scripts": ["nv-ingest-cli=nv_ingest_client.nv_ingest_cli:main", "process-json-files=nv_ingest_client.util.process_json_files:main"]},
install_requires=combined_requirements,
name="nv_ingest_client",
package_dir={"": "src"},
Expand Down
99 changes: 99 additions & 0 deletions client/src/nv_ingest_client/util/process_json_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import click
import json

def ingest_json_results_to_blob(result_content):
    """
    Parse JSON results, sort the entries, and concatenate their content into a blob string.

    Entries are ordered by page number. Within a page, structured (table)
    entries come first, ordered by the first two values of their table
    location; every other entry follows in its original relative order.

    Parameters
    ----------
    result_content : str, bytes, bytearray, or file-like object
        JSON document holding a list of result entries. File-like objects
        (for example ``io.BytesIO``) are read in full before parsing.

    Returns
    -------
    str
        The generated blob string, or an empty string if the content could
        not be processed (the error is printed).
    """
    try:
        # json.loads accepts str/bytes/bytearray but not streams, so drain
        # file-like inputs such as BytesIO before parsing.
        if hasattr(result_content, "read"):
            result_content = result_content.read()
        data = json.loads(result_content)

        # Smarter sorting: by page, then structured objects by x0, y0
        def sorting_key(entry):
            page = entry['metadata']['content_metadata']['page_number']
            if entry['document_type'] == 'structured':
                # Use table location's x0 and y0 as secondary keys
                location = entry['metadata']['table_metadata']['table_location']
                x0, y0 = location[0], location[1]
            else:
                # Non-structured objects are sorted after structured ones
                x0 = y0 = float('inf')
            return page, x0, y0

        data.sort(key=sorting_key)

        # Pieces of the blob, joined once at the end
        blob = []

        for entry in data:
            document_type = entry.get('document_type', '')

            if document_type == 'structured':
                # Add table content to the blob
                blob.append(entry['metadata']['table_metadata']['table_content'])
                blob.append("\n")

            elif document_type == 'text':
                # Add content to the blob
                blob.append(entry['metadata']['content'])
                blob.append("\n")

            elif document_type == 'image':
                # Add image caption to the blob
                caption = entry['metadata']['image_metadata'].get('caption', '')
                blob.append(f"image_caption:[{caption}]")
                blob.append("\n")

        # Join all parts of the blob into a single string
        return ''.join(blob)

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with "".
        print(f"[ERROR] An error occurred while processing JSON content: {e}")
        return ""

@click.command()
@click.argument('json_files', type=click.Path(exists=True), nargs=-1, required=True)
@click.option('--output-file', type=click.Path(dir_okay=False, writable=True, resolve_path=True), required=True,
              help="Path to save the combined blob output file.")
def main(json_files, output_file):
    """
    Process multiple JSON files, combine and sort entries, and generate a single blob file.

    JSON_FILES: One or more JSON files to process.
    """
    click.echo(f"Processing {len(json_files)} JSON files...")
    all_entries = []

    try:
        # Read and collect entries from all files
        for json_file in json_files:
            click.echo(f"Reading file: {json_file}")
            # JSON interchange text is UTF-8; do not depend on the locale default.
            with open(json_file, 'r', encoding='utf-8') as file:
                all_entries.extend(json.load(file))

        # The blob helper takes serialized JSON, so re-encode the merged list
        combined_content = json.dumps(all_entries)

        # Generate the blob string
        blob_string = ingest_json_results_to_blob(combined_content)

        if blob_string:
            # Write the blob to the output file; explicit UTF-8 so non-ASCII
            # content cannot fail on platforms with a narrower default codec.
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(blob_string)
            click.echo(f"Blob string has been generated and saved to: {output_file}")
        else:
            click.echo("No valid data processed. Blob file not created.")

    except Exception as e:
        # Top-level CLI boundary: report any failure instead of a traceback.
        click.echo(f"[ERROR] An error occurred: {e}")


if __name__ == "__main__":
    # Run the click command when this module is executed as a script.
    main()
42 changes: 42 additions & 0 deletions client/src/nv_ingest_client/util/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,45 @@ def filter_function_kwargs(func, **kwargs):
args_dict = {k: kwargs.pop(k) for k in dict(kwargs) if k in func_args}

return args_dict


def get_content(results: List[List[dict]]):
    """
    Extracts the text and table text content from the results of an NV-Ingest python client job

    Parameters
    ----------
    results: List[List[dict]]
        The results of NV-Ingest python client job that contains the desired text and table content.
        Each result is an iterable of element dictionaries carrying a "document_type" and "metadata".

    Returns
    -------
    Dict
        A dictionary with two keys, "text_content" and "structured_content". Each maps to a list of
        {"page_number": ..., "content": ...} dictionaries, in the order the elements appear in `results`.
        Elements of any other document type are ignored.
    """

    text_content = []
    structured_content = []

    for result in results:
        for elem in result:
            document_type = elem["document_type"]
            if document_type == "text":
                text_content.append(
                    {
                        "page_number": elem["metadata"]["content_metadata"]["page_number"],
                        "content": elem["metadata"]["content"],
                    }
                )
            elif document_type == "structured":
                structured_content.append(
                    {
                        "page_number": elem["metadata"]["content_metadata"]["page_number"],
                        # Table text is nested under "table_metadata", not directly under "metadata".
                        "content": elem["metadata"]["table_metadata"]["table_content"],
                    }
                )

    return {"text_content": text_content, "structured_content": structured_content}

0 comments on commit c76631d

Please sign in to comment.