Skip to content

Commit

Permalink
NiFi scripts: file grab potential fix.
Browse files: browse the repository at this point in the history.
  • Loading branch information
vladd-bit committed Nov 27, 2023
1 parent e3c35ec commit 2c8bdc7
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions nifi/user-scripts/get_files_from_storage.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -160,16 +160,18 @@ def get_files_and_metadata():
if generate_pseudo_doc_id != False:
_file_id_dict["document_Pseudo_Id"] = str(uuid.uuid4().hex)

folders_ingested[root].append(file_id)
txt_file_df = pandas.concat([txt_file_df, pandas.DataFrame.from_dict([_file_id_dict], orient="columns")])
folders_ingested[root].append(file_id)

else:
for i in range(0, len(txt_file_df)):
file_id = txt_file_df.iloc[i][file_id_csv_column_name_match]

if file_id not in folders_ingested[root]:
if file_id in list(doc_files.keys()):
txt_file_df.at[i, "binarydoc"] = base64.b64encode(doc_files[file_id]).decode()
txt_file_df.at[i, "text"] = ""
if file_id in list(doc_files.keys()) and file_id not in folders_ingested[root]:
txt_file_df.at[i, "binarydoc"] = base64.b64encode(doc_files[file_id]).decode()
txt_file_df.at[i, "text"] = ""

if file_id not in folders_ingested[root]:
folders_ingested[root].append(file_id)

txt_file_df = txt_file_df.loc[txt_file_df["binarydoc"].notna()]
Expand All @@ -186,7 +188,7 @@ def get_files_and_metadata():

if record_counter >= output_batch_size - 1:
break

get_files_and_metadata()

with open(ingested_folders_file, "w+") as f:
Expand Down

0 comments on commit 2c8bdc7

Please sign in to comment.