Merge pull request #16 from HBPMedical/fix/generate-dataset-with-all-cde-columns

Fix: generate dataset with all cde columns
sebastientourbier authored Jul 21, 2023
2 parents a442269 + f92b588 commit f967be3
Showing 5 changed files with 122 additions and 43 deletions.
6 changes: 5 additions & 1 deletion mip_dmp/cli/mip_dataset_mapper_cli.py
@@ -8,6 +8,7 @@

from mip_dmp.io import (
load_csv,
load_excel,
# load_excel,
load_json,
)
@@ -31,6 +32,8 @@ def main():
args.source_dataset = Path(args.source_dataset).absolute()
# Set mapping file path
args.mapping_file = Path(args.mapping_file).absolute()
# Set cdes file path
args.cdes_file = Path(args.cdes_file).absolute()
# Set target dataset file path
args.target_dataset = Path(args.target_dataset).absolute()
# Set path of log file
@@ -48,8 +51,9 @@ def main():
print("Loading the files...")
source_dataset = load_csv(args.source_dataset)
mappings = load_json(args.mapping_file)
cde_codes = load_excel(args.cdes_file)["code"].unique().tolist()
# Map the input dataset to the target CDEs
output_dataset = map_dataset(source_dataset, mappings)
output_dataset = map_dataset(source_dataset, mappings, cde_codes)
# Save the output dataset
output_dataset.to_csv(
args.target_dataset,
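Note: the new `cde_codes` list is just the deduplicated `code` column of the CDEs schema. A minimal sketch of the equivalent pandas calls, assuming `load_excel` is a thin wrapper around `pandas.read_excel` (toy data, not the real schema):

```python
import pandas as pd

# Toy stand-in for the CDEs schema loaded by load_excel (assumed to
# wrap pandas.read_excel); the real file has one row per CDE.
cdes = pd.DataFrame({"code": ["age", "sex", "age"]})

# Deduplicate while keeping first-occurrence order, as in the CLI.
cde_codes = cdes["code"].unique().tolist()
print(cde_codes)  # ['age', 'sex']
```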
5 changes: 5 additions & 0 deletions mip_dmp/parser.py
@@ -27,6 +27,11 @@ def create_parser():
help="Source Dataset Columns / Common data elements (CDEs) mapping file in JSON format. "
"The mapping file can be generated by the MIP Dataset Mapper UI application.",
)
p.add_argument(
"--cdes_file",
required=True,
help="Common data elements (CDEs) metadata schema file in EXCEL format. ",
)
p.add_argument(
"--target_dataset",
required=True,
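A standalone sketch of how argparse exposes the new required option (illustration only; not the project's actual parser):

```python
import argparse

# Illustration of the new option only; the real create_parser() also
# defines --target_dataset and the other I/O options.
p = argparse.ArgumentParser()
p.add_argument(
    "--cdes_file",
    required=True,
    help="Common data elements (CDEs) metadata schema file in EXCEL format.",
)
args = p.parse_args(["--cdes_file", "myCDEs.xlsx"])
print(args.cdes_file)  # 'myCDEs.xlsx' -- consumed as args.cdes_file by the CLI
```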
74 changes: 48 additions & 26 deletions mip_dmp/process/mapping.py
@@ -24,7 +24,7 @@
}


def map_dataset(dataset, mappings):
def map_dataset(dataset, mappings, cde_codes):
"""Map the dataset to the schema.
Parameters
@@ -35,6 +35,9 @@ def map_dataset(dataset, mappings):
mappings : dict
Mappings of the dataset columns to the schema columns.
cde_codes : list
List of codes of the CDE metadata schema.
Returns
-------
pandas.DataFrame
@@ -43,29 +46,41 @@ def map_dataset(dataset, mappings):
# create a list to hold the mapped columns
mapped_columns = []

# Convert the list of mappings to a dictionary using cde_code as the key
mapping_dict = {mapping["cde_code"]: mapping for mapping in mappings}
print(f"len(mapping_dict) = {len(mapping_dict)}")

# Map and apply transformation to each dataset column described in the
# mapping JSON file.
for mapping in mappings:
# Extract the mapping information of the column.
dataset_column = mapping["dataset_column"]
cde_code = mapping["cde_code"]
cde_type = mapping["cde_type"]
transform_type = mapping["transform_type"]
transform = mapping["transform"]
# Copy the dataset column to the mapped dataset for which the column name
# is the CDE code.
# map the input data to the CDE code and append to the list of mapped columns

# Apply the transformation to the mapped dataset column.
mapped_columns.append(
transform_dataset_column(
dataset[dataset_column].rename(cde_code),
cde_code,
cde_type,
transform_type,
transform,
for cde_code in cde_codes:
if cde_code in mapping_dict:
mapping = mapping_dict[cde_code]
# Extract the mapping information of the column.
dataset_column = mapping["dataset_column"]
cde_code = mapping["cde_code"]
cde_type = mapping["cde_type"]
transform_type = mapping["transform_type"]
transform = mapping["transform"]
print(
f" > Process column {dataset_column} with CDE code {cde_code}, CDE type {cde_type}, transform type {transform_type}, and transform {transform}"
)
)
# If the column is present in the dataset, copy the dataset column to
# the mapped dataset for which the column name is the CDE code, map
# the input data to the CDE code, apply the transformation, and append
# to the list of mapped columns.
if dataset_column in dataset.columns:
mapped_columns.append(
transform_dataset_column(
dataset[dataset_column].rename(cde_code),
cde_code,
cde_type,
transform_type,
transform,
)
)
else:
print(f"WARNING: No mapping found for CDE code {cde_code}. Fill with NaN.")
mapped_columns.append(pd.DataFrame(columns=[cde_code]))
mapped_dataset = pd.concat(mapped_columns, axis=1)
# Return the mapped dataset.
print(mapped_dataset)
@@ -133,9 +148,13 @@ def apply_transform_map(dataset_column, transform):
The transformed dataset column."""
# Parse the mapping values from the JSON string
mapping_values = eval(transform)
dataset_column = dataset_column.map(
lambda x: x.lower() if isinstance(x, str) else x
)

# Map the values.
for mapping_value_item in mapping_values.items():
old_value = mapping_value_item[0]
old_value = mapping_value_item[0].lower()
new_value = mapping_value_item[1]
dataset_column.iloc[dataset_column == old_value] = new_value
return dataset_column
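
The lowercasing added above makes the value mapping case-insensitive. A sketch with made-up values, using `Series.map` for brevity instead of the in-place loop:

```python
import pandas as pd

# Made-up column and transform mapping; mixed-case inputs now match.
column = pd.Series(["M", "f", "F"])
mapping = {"M": "male", "F": "female"}

column = column.map(lambda x: x.lower() if isinstance(x, str) else x)
result = column.map({k.lower(): v for k, v in mapping.items()})
print(result.tolist())  # ['male', 'female', 'female']
```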
@@ -167,10 +186,13 @@ def apply_transform_scale(dataset_column, cde_code, cde_type, scaling_factor):
# not applied. Otherwise, the scaling is applied.
if not dataset_column.isnull().values.any():
# Cast the column to the correct type and apply the scaling factor.
if cde_type == "integer":
dataset_column = dataset_column.astype(int) * int(scaling_factor)
elif cde_type == "real":
dataset_column = dataset_column.astype(float) * scaling_factor
try:
if cde_type == "integer":
dataset_column = dataset_column.astype(int) * int(scaling_factor)
elif cde_type == "real":
dataset_column = dataset_column.astype(float) * scaling_factor
except ValueError:
print(f"WARNING: The column {cde_code} could not be cast to {cde_type}.")
else:
# Cast and scale only the non-NaN values.
if cde_type == "integer":
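The core behavioral change in `map_dataset`: iteration is now driven by the full CDE code list instead of the mappings, so every CDE becomes a column and unmapped codes come out as NaN. A toy illustration of the `pd.concat` behavior with hypothetical CDE names:

```python
import pandas as pd

# One mapped column plus one CDE with no mapping (hypothetical names).
mapped_columns = [
    pd.Series([34, 58], name="age"),                 # mapped and transformed
    pd.DataFrame(columns=["left_hippocampus_vol"]),  # unmapped CDE, empty
]

# concat aligns on the row index, so the empty column comes out as NaN.
print(pd.concat(mapped_columns, axis=1))
#    age left_hippocampus_vol
# 0   34                  NaN
# 1   58                  NaN
```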
2 changes: 1 addition & 1 deletion mip_dmp/process/matching.py
@@ -189,7 +189,7 @@ def make_initial_transform(dataset, schema, dataset_column, cde_code):
# Make the initial transform.
if cde_type in ["integer", "real"]:
return "1.0"
elif cde_type in ["binominal", "multinominal"]:
elif cde_type in ["binominal", "multinominal", "nominal"]:
# Extract the CDE code values from the corresponding cell of
# the "values" column of the schema.
cde_code_values_str = (
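The one-line change above widens the set of categorical types that receive a value-mapping as their initial transform. A schematic of the dispatch, with the categorical branch paraphrased (the real code builds the mapping from the schema's "values" column):

```python
# Schematic of make_initial_transform's type dispatch (assumed shape;
# the real function derives the value map from the schema itself).
def initial_transform_for(cde_type: str):
    if cde_type in ["integer", "real"]:
        return "1.0"  # identity scaling factor
    elif cde_type in ["binominal", "multinominal", "nominal"]:
        return "<value mapping built from the schema 'values' cell>"
    return None
```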
78 changes: 63 additions & 15 deletions mip_dmp/qt5/components/dataset_mapper_window.py
@@ -728,7 +728,10 @@ def initializeMappingEditForm(self, index):
rowData = self.columnsCDEsMappingData.iloc[index.row(), :]
self.mappingRowIndex.setText(str(index.row()))
self.datasetColumn.setText(str(rowData["dataset_column"]))
columnMatches = self.matchedCdeCodes[rowData["dataset_column"]]["words"]
if self.matchedCdeCodes:
columnMatches = self.matchedCdeCodes[rowData["dataset_column"]]["words"]
else:
columnMatches = self.targetCDEs["code"].unique().tolist()
self.cdeCode.clear()
self.cdeCode.addItems(columnMatches)
ind = columnMatches.index(rowData["cde_code"])
@@ -748,11 +751,14 @@ def updateMappingEditForm(self, index):
# Get the data for the current row and update the widgets in the form
rowIndex = int(self.mappingRowIndex.text())
rowData = self.columnsCDEsMappingData.iloc[rowIndex, :]
columnMatches = self.matchedCdeCodes[rowData["dataset_column"]]["words"]
if self.matchedCdeCodes:
columnMatches = self.matchedCdeCodes[rowData["dataset_column"]]["words"]
else:
columnMatches = self.targetCDEs["code"].unique().tolist()
cdeType = self.targetCDEs[self.targetCDEs["code"] == columnMatches[index]][
"type"
].unique()[0]
self.cdeType.setText(cdeType)
self.cdeType.setText(str(cdeType))
if cdeType == "real" or cdeType == "integer":
if self.cdeCode.currentText() == rowData["cde_code"]:
self.transformType.setText("scale")
@@ -892,8 +898,34 @@ def loadMapping(self):
self.disableMappingMapButtons()
else:
try:
# Load the mapping table file in JSON format
self.columnsCDEsMappingData = load_mapping_json(self.mappingFilePath[0])
print(f"Mapping loaded from {self.mappingFilePath[0]}")
# Create a pandas model for the mapping table
self.columnsCDEsMappingPandasModel = PandasTableModel(
self.columnsCDEsMappingData
)
# Set the model of the table view to the pandas model
self.mappingTableView.setModel(self.columnsCDEsMappingPandasModel)
self.mappingTableView.setSelectionBehavior(
self.mappingTableView.SelectRows
)
self.mappingTableView.setSelectionMode(
self.mappingTableView.SingleSelection
)
self.mappingTableView.setEditTriggers(
self.mappingTableView.NoEditTriggers
) # disable editing
# Handle the mapping table view row selection changed signal
self.mappingTableView.selectionModel().currentRowChanged.connect(
self.initializeMappingEditForm
)
# Select the first row of the mapping table view at the beginning
indexRow = 0
self.mappingTableView.selectRow(indexRow)
# Handle the combox box current index changed signal for the CDE code column
self.cdeCode.currentIndexChanged.connect(self.updateMappingEditForm)
# Display a success message
successMsg = (
f"Loaded mapping file {self.mappingFilePath[0]}. \n"
"Please Check the mapping, Save it and Click on the "
@@ -906,6 +938,7 @@ def loadMapping(self):
)
self.updateStatusbar(successMsg)
except ValueError as e:
# Display an error message
errMsg = (
f"The mapping file {self.mappingFilePath[0]} is not valid: {repr(e)} \n"
"Please select a valid file! "
@@ -981,27 +1014,40 @@ def enableMappingButtons(self):
def checkMapping(self):
"""Check the mapping."""
# Check if the mapping contains unique column / CDE pairs
if (
len(self.columnsCDEsMappingData["cde_code"].unique())
!= len(self.columnsCDEsMappingData["cde_code"])
) or (
len(self.columnsCDEsMappingData["dataset_column"].unique())
!= len(self.columnsCDEsMappingData["dataset_column"])
if len(self.columnsCDEsMappingData["cde_code"].unique()) != len(
self.columnsCDEsMappingData["cde_code"]
):
errMsg = (
"The mapping is not valid. "
"Please check it and remove any mapping row "
"that might re-map a CDE code or a column of "
"the source dataset!"
"that might map multiple columns of the input dataset "
"to the same CDE code!"
)
QMessageBox.warning(
None,
"Error: Duplicate Column / CDEs Pairs",
"Error: Duplicated mapped CDE code",
errMsg,
)
self.updateStatusbar(errMsg)
self.disableMappingMapButtons()
return
# if len(self.columnsCDEsMappingData["dataset_column"].unique()) != len(
# self.columnsCDEsMappingData["dataset_column"]
# ):
# errMsg = (
# "The mapping is not valid. "
# "Please check it and remove any mapping row(s) "
# "that might map the same column(s) of "
# "the source dataset to multiple CDE codes!"
# )
# QMessageBox.warning(
# None,
# "Error: Duplicate Column / CDEs Pairs",
# errMsg,
# )
# self.updateStatusbar(errMsg)
# self.disableMappingMapButtons()
# return
# Check if the mapping contains only valid CDE codes
if self.columnsCDEsMappingData[
self.columnsCDEsMappingData["cde_code"].isin(self.targetCDEs["code"])
Expand Down Expand Up @@ -1033,7 +1079,7 @@ def is_invalid_map_transform(transform):
The transform to check.
"""
try:
ast.literal_eval(f'"{transform}"')
ast.literal_eval(f"{transform}")
return False
except ValueError:
return True
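
Why dropping the added quotes matters: wrapping `transform` in quotes turned any input into a valid Python string literal, so the old check could never flag anything. Illustrative values:

```python
import ast

good = "{'m': 'male', 'f': 'female'}"
bad = "male"  # a bare name, not a Python literal

ast.literal_eval(f'"{good}"')  # old check: any input becomes a str literal, always passes
ast.literal_eval(good)         # new check: must parse as a real literal (here, a dict)
# ast.literal_eval(bad)        # raises ValueError -> transform flagged as invalid
```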
@@ -1135,7 +1181,7 @@ def mappingMatch(self):
) = match_columns_to_cdes(
dataset=self.inputDataset,
schema=self.targetCDEs,
nb_kept_matches=10,
nb_kept_matches=819,
matching_method=matchingMethod,
)
# Create a pandas model for the mapping table
@@ -1235,7 +1281,9 @@ def map(self):
with open(self.mappingFilePathLabel.text(), "r") as f:
mapping = json.load(f)
# Map the input dataset to the target CDEs
output_dataset = map_dataset(input_dataset, mapping)
output_dataset = map_dataset(
input_dataset, mapping, self.targetCDEs["code"].tolist()
)
# Save the output dataset
output_dataset.to_csv(
self.outputFilename[0],
