-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_lsr_data.py
78 lines (62 loc) · 2.67 KB
/
merge_lsr_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import pandas as pd
def translate_and_combine_columns(df, translation_dict):
"""
Translates and combines columns in a DataFrame based on a translation dictionary.
Parameters:
df (pd.DataFrame): The input DataFrame.
translation_dict (dict): A dictionary where keys are column names to be translated,
and values are the new column name.
Returns:
pd.DataFrame: The modified DataFrame.
"""
for original_col, translation in translation_dict.items():
new_col = translation
if original_col in df.columns:
# If the target column (new_col) already exists, combine the data
if new_col in df.columns:
raise ValueError(
f"Column name conflict: both {original_col} and {new_col} are present."
)
else:
# Rename the column to the new label
df[new_col] = df[original_col]
# Drop the original column after translation
df.drop(columns=[original_col], inplace=True)
return df
def read_file(filepath):
"""
Reads a file (CSV or XLSX) into a pandas DataFrame.
"""
if filepath.endswith(".csv"):
return pd.read_csv(filepath, na_values=[], keep_default_na=False)
elif filepath.endswith((".xls", ".xlsx")):
return pd.read_excel(filepath, na_values=[], keep_default_na=False)
else:
raise ValueError(f"Unsupported file type for: {filepath}")
def merge_files_into_json(
directory_path, lsr_files, output_json_path, extraction_variables
):
"""
Reads and merges all CSV and XLSX files in a directory into a single JSON array.
Columns with matching names are merged, and missing values are set to `null`.
Blank values are retained as empty strings.
"""
dataframes = []
all_columns = set()
for index, filename in lsr_files.items():
file_path = os.path.join(directory_path, filename)
if os.path.isfile(file_path) and filename.endswith((".csv", ".xls", ".xlsx")):
print(f"Processing file: {filename}")
df = read_file(file_path)
df = translate_and_combine_columns(df, extraction_variables)
df.insert(0, "LSR #", index)
all_columns.update(df.columns)
dataframes.append(df)
if not dataframes:
raise ValueError("No CSV or XLSX files found in the directory.")
merged_df = pd.concat(dataframes, ignore_index=True, sort=False)
merged_json = merged_df.to_json(orient="records", indent=4)
with open(output_json_path, "w") as f:
f.write(merged_json)
print(f"Merged data saved to {output_json_path}")