-
Notifications
You must be signed in to change notification settings - Fork 45
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add helper for getting content from metadata (#231)
- Loading branch information
Showing
3 changed files
with
142 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,7 +59,7 @@ def read_requirements(file_name): | |
author_email="[email protected]", | ||
classifiers=[], | ||
description="Python client for the nv-ingest service", | ||
entry_points={"console_scripts": ["nv-ingest-cli=nv_ingest_client.nv_ingest_cli:main"]}, | ||
entry_points={"console_scripts": ["nv-ingest-cli=nv_ingest_client.nv_ingest_cli:main", "process-json-files=nv_ingest_client.util.process_json_files:main"]}, | ||
install_requires=combined_requirements, | ||
name="nv_ingest_client", | ||
package_dir={"": "src"}, | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import click | ||
import json | ||
|
||
def ingest_json_results_to_blob(result_content):
    """
    Parse JSON results, sort the entries, and concatenate their content into a blob string.

    Entries are ordered by page number. Within a page, 'structured' (table) entries
    come first, ordered by the first two values of their table location; every other
    entry follows, keeping its original relative order (the sort is stable).

    Args:
        result_content: A JSON document containing a list of result entries, given
            as a str, bytes/bytearray, or a file-like object exposing ``read()``
            (for example ``io.BytesIO``).

    Returns:
        str: The generated blob string. An empty string is returned when the
        content cannot be parsed or processed; the error is printed.
    """
    try:
        # File-like inputs (e.g. BytesIO) must be read first: json.loads only
        # accepts str, bytes, or bytearray.
        if hasattr(result_content, 'read'):
            result_content = result_content.read()

        # Load the JSON data
        data = json.loads(result_content)

        # Smarter sorting: by page, then structured objects by x0, y0
        def sorting_key(entry):
            page = entry['metadata']['content_metadata']['page_number']
            # .get() keeps this consistent with the loop below, which tolerates
            # entries that carry no document_type.
            if entry.get('document_type') == 'structured':
                # Use table location's x0 and y0 as secondary keys
                table_location = entry['metadata']['table_metadata']['table_location']
                x0 = table_location[0]
                y0 = table_location[1]
            else:
                # Non-structured objects are sorted after structured ones
                x0 = float('inf')
                y0 = float('inf')
            return page, x0, y0

        data.sort(key=sorting_key)

        # Collect the pieces of the blob; joined once at the end
        blob = []

        for entry in data:
            document_type = entry.get('document_type', '')

            if document_type == 'structured':
                # Add table content to the blob
                blob.append(entry['metadata']['table_metadata']['table_content'])
                blob.append("\n")

            elif document_type == 'text':
                # Add content to the blob
                blob.append(entry['metadata']['content'])
                blob.append("\n")

            elif document_type == 'image':
                # Add image caption to the blob
                caption = entry['metadata']['image_metadata'].get('caption', '')
                blob.append(f"image_caption:[{caption}]")
                blob.append("\n")

            # Entries with any other document type contribute nothing.

        # Join all parts of the blob into a single string
        return ''.join(blob)

    except Exception as e:
        # Deliberate best-effort boundary: callers treat "" as "nothing produced".
        print(f"[ERROR] An error occurred while processing JSON content: {e}")
        return ""
|
||
@click.command()
@click.argument('json_files', type=click.Path(exists=True), nargs=-1, required=True)
@click.option('--output-file', type=click.Path(dir_okay=False, writable=True, resolve_path=True), required=True,
              help="Path to save the combined blob output file.")
def main(json_files, output_file):
    """
    Process multiple JSON files, combine and sort entries, and generate a single blob file.
    JSON_FILES: One or more JSON files to process.
    """
    # NOTE: the docstring above doubles as the click help text, so it is kept verbatim.
    click.echo(f"Processing {len(json_files)} JSON files...")
    all_entries = []

    try:
        # Read and collect entries from all files
        for json_file in json_files:
            click.echo(f"Reading file: {json_file}")
            # JSON is UTF-8; do not depend on the platform's default encoding.
            with open(json_file, 'r', encoding='utf-8') as file:
                entries = json.load(file)

            # extend() on a dict would silently add its keys as "entries" and
            # fail later with a confusing message, so reject it up front.
            if not isinstance(entries, list):
                raise ValueError(
                    f"Expected a JSON list of entries in {json_file}, "
                    f"got {type(entries).__name__}"
                )
            all_entries.extend(entries)

        # The blob helper takes serialized JSON, so re-serialize the merged entries
        combined_content = json.dumps(all_entries)

        # Generate the blob string
        blob_string = ingest_json_results_to_blob(combined_content)

        if blob_string:
            # Write the blob to the output file
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(blob_string)
            click.echo(f"Blob string has been generated and saved to: {output_file}")
        else:
            click.echo("No valid data processed. Blob file not created.")

    except Exception as e:
        # Top-level CLI boundary: report the failure instead of a traceback.
        click.echo(f"[ERROR] An error occurred: {e}")
|
||
|
||
# Script entry point: invoke the click command defined in this module.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters