# Origin: commit bdc99973e1420bac00fea834c4656b3b447d03a8 (ChrisJar, Fri, 15 Nov 2024)
# "Add helper for getting content from metadata"
# Target file: client/src/nv_ingest_client/util/util.py (added after filter_function_kwargs).


def get_content(results: List[List[dict]]) -> dict:
    """
    Collect the text and structured content found in a list of results.

    Every element of every result is inspected once. Elements whose
    ``"document_type"`` is ``"text"`` contribute their ``metadata["content"]``;
    elements whose ``"document_type"`` is ``"structured"`` contribute their
    ``metadata["table_content"]``. Elements of any other document type are
    ignored. Output order follows the input order.

    Parameters
    ----------
    results : List[List[dict]]
        An iterable of results, each of which is an iterable of element
        mappings carrying ``"document_type"`` and ``"metadata"`` keys.

    Returns
    -------
    dict
        ``{"text_content": [...], "structured_content": [...]}`` where each
        list entry is ``{"page_number": ..., "content": ...}``.

    Raises
    ------
    KeyError
        If an element lacks ``"document_type"``, or a text/structured element
        lacks one of the metadata keys read below.
    """
    text_content = []
    structured_content = []

    # The outer loop must bind ``result`` before the inner loop reads it; the
    # previous comprehension had the two ``for`` clauses reversed and also
    # filtered on an undefined name, so it raised NameError on every call.
    for result in results:
        for elem in result:
            document_type = elem["document_type"]

            if document_type == "text":
                metadata = elem["metadata"]
                text_content.append(
                    {
                        "page_number": metadata["content_metadata"]["page_number"],
                        "content": metadata["content"],
                    }
                )
            elif document_type == "structured":
                metadata = elem["metadata"]
                # NOTE(review): table text is read from metadata["table_content"];
                # confirm against the metadata schema that it is not nested
                # under another key (e.g. a "table_metadata" sub-dict).
                structured_content.append(
                    {
                        "page_number": metadata["content_metadata"]["page_number"],
                        "content": metadata["table_content"],
                    }
                )

    return {"text_content": text_content, "structured_content": structured_content}