Merge pull request #16 from HBPMedical/fix/generate-dataset-with-all-cde-columns

Fix: generate dataset with all cde columns
sebastientourbier authored Jul 21, 2023
2 parents a442269 + f92b588 commit f967be3
Showing 5 changed files with 122 additions and 43 deletions.
6 changes: 5 additions & 1 deletion mip_dmp/cli/mip_dataset_mapper_cli.py
@@ -8,6 +8,7 @@

from mip_dmp.io import (
load_csv,
load_excel,
# load_excel,
load_json,
)
@@ -31,6 +32,8 @@ def main():
args.source_dataset = Path(args.source_dataset).absolute()
# Set mapping file path
args.mapping_file = Path(args.mapping_file).absolute()
# Set cdes file path
args.cdes_file = Path(args.cdes_file).absolute()
# Set target dataset file path
args.target_dataset = Path(args.target_dataset).absolute()
# Set path of log file
@@ -48,8 +51,9 @@ def main():
print("Loading the files...")
source_dataset = load_csv(args.source_dataset)
mappings = load_json(args.mapping_file)
cde_codes = load_excel(args.cdes_file)["code"].unique().tolist()
# Map the input dataset to the target CDEs
output_dataset = map_dataset(source_dataset, mappings)
output_dataset = map_dataset(source_dataset, mappings, cde_codes)
# Save the output dataset
output_dataset.to_csv(
args.target_dataset,
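Note: the new `cde_codes` list is just the deduplicated `code` column of the CDEs schema. A minimal sketch of the equivalent pandas calls, assuming `load_excel` is a thin wrapper around `pandas.read_excel` (toy data, not the real schema):

```python
import pandas as pd

# Toy stand-in for the CDEs schema loaded by load_excel (assumed to
# wrap pandas.read_excel); the real file has one row per CDE.
cdes = pd.DataFrame({"code": ["age", "sex", "age"]})

# Deduplicate while keeping first-occurrence order, as in the CLI.
cde_codes = cdes["code"].unique().tolist()
print(cde_codes)  # ['age', 'sex']
```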
5 changes: 5 additions & 0 deletions mip_dmp/parser.py
@@ -27,6 +27,11 @@ def create_parser():
help="Source Dataset Columns / Common data elements (CDEs) mapping file in JSON format. "
"The mapping file can be generated by the MIP Dataset Mapper UI application.",
)
p.add_argument(
"--cdes_file",
required=True,
help="Common data elements (CDEs) metadata schema file in EXCEL format. ",
)
p.add_argument(
"--target_dataset",
required=True,
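A standalone sketch of how argparse exposes the new required option (illustration only; not the project's actual parser):

```python
import argparse

# Illustration of the new option only; the real create_parser() also
# defines --target_dataset and the other I/O options.
p = argparse.ArgumentParser()
p.add_argument(
    "--cdes_file",
    required=True,
    help="Common data elements (CDEs) metadata schema file in EXCEL format.",
)
args = p.parse_args(["--cdes_file", "myCDEs.xlsx"])
print(args.cdes_file)  # 'myCDEs.xlsx' -- consumed as args.cdes_file by the CLI
```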
74 changes: 48 additions & 26 deletions mip_dmp/process/mapping.py
@@ -24,7 +24,7 @@
}


def map_dataset(dataset, mappings):
def map_dataset(dataset, mappings, cde_codes):
"""Map the dataset to the schema.
Parameters
@@ -35,6 +35,9 @@ def map_dataset(dataset, mappings):
mappings : dict
Mappings of the dataset columns to the schema columns.
cde_codes : list
List of codes of the CDE metadata schema.
Returns
-------
pandas.DataFrame
@@ -43,29 +46,41 @@ def map_dataset(dataset, mappings):
# create a list to hold the mapped columns
mapped_columns = []

# Convert the list of mappings to a dictionary using cde_code as the key
mapping_dict = {mapping["cde_code"]: mapping for mapping in mappings}
print(f"len(mapping_dict) = {len(mapping_dict)}")

# Map and apply transformation to each dataset column described in the
# mapping JSON file.
for mapping in mappings:
# Extract the mapping information of the column.
dataset_column = mapping["dataset_column"]
cde_code = mapping["cde_code"]
cde_type = mapping["cde_type"]
transform_type = mapping["transform_type"]
transform = mapping["transform"]
# Copy the dataset column to the mapped dataset for which the column name
# is the CDE code.
# map the input data to the CDE code and append to the list of mapped columns

# Apply the transformation to the mapped dataset column.
mapped_columns.append(
transform_dataset_column(
dataset[dataset_column].rename(cde_code),
cde_code,
cde_type,
transform_type,
transform,
for cde_code in cde_codes:
if cde_code in mapping_dict:
mapping = mapping_dict[cde_code]
# Extract the mapping information of the column.
dataset_column = mapping["dataset_column"]
cde_code = mapping["cde_code"]
cde_type = mapping["cde_type"]
transform_type = mapping["transform_type"]
transform = mapping["transform"]
print(
f" > Process column {dataset_column} with CDE code {cde_code}, CDE type {cde_type}, transform type {transform_type}, and transform {transform}"
)
)
# If the column is present in the dataset, copy the dataset column to
# the mapped dataset for which the column name is the CDE code, map
# the input data to the CDE code, apply the transformation, and append
# to the list of mapped columns.
if dataset_column in dataset.columns:
mapped_columns.append(
transform_dataset_column(
dataset[dataset_column].rename(cde_code),
cde_code,
cde_type,
transform_type,
transform,
)
)
else:
print(f"WARNING: No mapping found for CDE code {cde_code}. Fill with NaN.")
mapped_columns.append(pd.DataFrame(columns=[cde_code]))
mapped_dataset = pd.concat(mapped_columns, axis=1)
# Return the mapped dataset.
print(mapped_dataset)
@@ -133,9 +148,13 @@ def apply_transform_map(dataset_column, transform):
The transformed dataset column."""
# Parse the mapping values from the JSON string
mapping_values = eval(transform)
dataset_column = dataset_column.map(
lambda x: x.lower() if isinstance(x, str) else x
)

# Map the values.
for mapping_value_item in mapping_values.items():
old_value = mapping_value_item[0]
old_value = mapping_value_item[0].lower()
new_value = mapping_value_item[1]
dataset_column.iloc[dataset_column == old_value] = new_value
return dataset_column
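
The lowercasing added above makes the value mapping case-insensitive. A sketch with made-up values, using `Series.map` for brevity instead of the in-place loop:

```python
import pandas as pd

# Made-up column and transform mapping; mixed-case inputs now match.
column = pd.Series(["M", "f", "F"])
mapping = {"M": "male", "F": "female"}

column = column.map(lambda x: x.lower() if isinstance(x, str) else x)
result = column.map({k.lower(): v for k, v in mapping.items()})
print(result.tolist())  # ['male', 'female', 'female']
```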
@@ -167,10 +186,13 @@ def apply_transform_scale(dataset_column, cde_code, cde_type, scaling_factor):
# not applied. Otherwise, the scaling is applied.
if not dataset_column.isnull().values.any():
# Cast the column to the correct type and apply the scaling factor.
if cde_type == "integer":
dataset_column = dataset_column.astype(int) * int(scaling_factor)
elif cde_type == "real":
dataset_column = dataset_column.astype(float) * scaling_factor
try:
if cde_type == "integer":
dataset_column = dataset_column.astype(int) * int(scaling_factor)
elif cde_type == "real":
dataset_column = dataset_column.astype(float) * scaling_factor
except ValueError:
print(f"WARNING: The column {cde_code} could not be cast to {cde_type}.")
else:
# Cast and scale only the non-NaN values.
if cde_type == "integer":
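The core behavioral change in `map_dataset`: iteration is now driven by the full CDE code list instead of the mappings, so every CDE becomes a column and unmapped codes come out as NaN. A toy illustration of the `pd.concat` behavior with hypothetical CDE names:

```python
import pandas as pd

# One mapped column plus one CDE with no mapping (hypothetical names).
mapped_columns = [
    pd.Series([34, 58], name="age"),                 # mapped and transformed
    pd.DataFrame(columns=["left_hippocampus_vol"]),  # unmapped CDE, empty
]

# concat aligns on the row index, so the empty column comes out as NaN.
print(pd.concat(mapped_columns, axis=1))
#    age left_hippocampus_vol
# 0   34                  NaN
# 1   58                  NaN
```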
2 changes: 1 addition & 1 deletion mip_dmp/process/matching.py
@@ -189,7 +189,7 @@ def make_initial_transform(dataset, schema, dataset_column, cde_code):
# Make the initial transform.
if cde_type in ["integer", "real"]:
return "1.0"
elif cde_type in ["binominal", "multinominal"]:
elif cde_type in ["binominal", "multinominal", "nominal"]:
# Extract the CDE code values from the corresponding cell of
# the "values" column of the schema.
cde_code_values_str = (
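The one-line change above widens the set of categorical types that receive a value-mapping as their initial transform. A schematic of the dispatch, with the categorical branch paraphrased (the real code builds the mapping from the schema's "values" column):

```python
# Schematic of make_initial_transform's type dispatch (assumed shape;
# the real function derives the value map from the schema itself).
def initial_transform_for(cde_type: str):
    if cde_type in ["integer", "real"]:
        return "1.0"  # identity scaling factor
    elif cde_type in ["binominal", "multinominal", "nominal"]:
        return "<value mapping built from the schema 'values' cell>"
    return None
```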
78 changes: 63 additions & 15 deletions mip_dmp/qt5/components/dataset_mapper_window.py
@@ -728,7 +728,10 @@ def initializeMappingEditForm(self, index):
rowData = self.columnsCDEsMappingData.iloc[index.row(), :]
self.mappingRowIndex.setText(str(index.row()))
self.datasetColumn.setText(str(rowData["dataset_column"]))
columnMatches = self.matchedCdeCodes[rowData["dataset_column"]]["words"]
if self.matchedCdeCodes:
columnMatches = self.matchedCdeCodes[rowData["dataset_column"]]["words"]
else:
columnMatches = self.targetCDEs["code"].unique().tolist()
self.cdeCode.clear()
self.cdeCode.addItems(columnMatches)
ind = columnMatches.index(rowData["cde_code"])
@@ -748,11 +751,14 @@ def updateMappingEditForm(self, index):
# Get the data for the current row and update the widgets in the form
rowIndex = int(self.mappingRowIndex.text())
rowData = self.columnsCDEsMappingData.iloc[rowIndex, :]
columnMatches = self.matchedCdeCodes[rowData["dataset_column"]]["words"]
if self.matchedCdeCodes:
columnMatches = self.matchedCdeCodes[rowData["dataset_column"]]["words"]
else:
columnMatches = self.targetCDEs["code"].unique().tolist()
cdeType = self.targetCDEs[self.targetCDEs["code"] == columnMatches[index]][
"type"
].unique()[0]
self.cdeType.setText(cdeType)
self.cdeType.setText(str(cdeType))
if cdeType == "real" or cdeType == "integer":
if self.cdeCode.currentText() == rowData["cde_code"]:
self.transformType.setText("scale")
@@ -892,8 +898,34 @@ def loadMapping(self):
self.disableMappingMapButtons()
else:
try:
# Load the mapping table file in JSON format
self.columnsCDEsMappingData = load_mapping_json(self.mappingFilePath[0])
print(f"Mapping loaded from {self.mappingFilePath[0]}")
# Create a pandas model for the mapping table
self.columnsCDEsMappingPandasModel = PandasTableModel(
self.columnsCDEsMappingData
)
# Set the model of the table view to the pandas model
self.mappingTableView.setModel(self.columnsCDEsMappingPandasModel)
self.mappingTableView.setSelectionBehavior(
self.mappingTableView.SelectRows
)
self.mappingTableView.setSelectionMode(
self.mappingTableView.SingleSelection
)
self.mappingTableView.setEditTriggers(
self.mappingTableView.NoEditTriggers
) # disable editing
# Handle the mapping table view row selection changed signal
self.mappingTableView.selectionModel().currentRowChanged.connect(
self.initializeMappingEditForm
)
# Select the first row of the mapping table view at the beginning
indexRow = 0
self.mappingTableView.selectRow(indexRow)
# Handle the combox box current index changed signal for the CDE code column
self.cdeCode.currentIndexChanged.connect(self.updateMappingEditForm)
# Display a success message
successMsg = (
f"Loaded mapping file {self.mappingFilePath[0]}. \n"
"Please Check the mapping, Save it and Click on the "
@@ -906,6 +938,7 @@ def loadMapping(self):
)
self.updateStatusbar(successMsg)
except ValueError as e:
# Display an error message
errMsg = (
f"The mapping file {self.mappingFilePath[0]} is not valid: {repr(e)} \n"
"Please select a valid file! "
@@ -981,27 +1014,40 @@ def enableMappingButtons(self):
def checkMapping(self):
"""Check the mapping."""
# Check if the mapping contains unique column / CDE pairs
if (
len(self.columnsCDEsMappingData["cde_code"].unique())
!= len(self.columnsCDEsMappingData["cde_code"])
) or (
len(self.columnsCDEsMappingData["dataset_column"].unique())
!= len(self.columnsCDEsMappingData["dataset_column"])
if len(self.columnsCDEsMappingData["cde_code"].unique()) != len(
self.columnsCDEsMappingData["cde_code"]
):
errMsg = (
"The mapping is not valid. "
"Please check it and remove any mapping row "
"that might re-map a CDE code or a column of "
"the source dataset!"
"that might map multiple columns of the input dataset "
"to the same CDE code!"
)
QMessageBox.warning(
None,
"Error: Duplicate Column / CDEs Pairs",
"Error: Duplicated mapped CDE code",
errMsg,
)
self.updateStatusbar(errMsg)
self.disableMappingMapButtons()
return
# if len(self.columnsCDEsMappingData["dataset_column"].unique()) != len(
# self.columnsCDEsMappingData["dataset_column"]
# ):
# errMsg = (
# "The mapping is not valid. "
# "Please check it and remove any mapping row(s) "
# "that might map the same column(s) of "
# "the source dataset to multiple CDE codes!"
# )
# QMessageBox.warning(
# None,
# "Error: Duplicate Column / CDEs Pairs",
# errMsg,
# )
# self.updateStatusbar(errMsg)
# self.disableMappingMapButtons()
# return
# Check if the mapping contains only valid CDE codes
if self.columnsCDEsMappingData[
self.columnsCDEsMappingData["cde_code"].isin(self.targetCDEs["code"])
Expand Down Expand Up @@ -1033,7 +1079,7 @@ def is_invalid_map_transform(transform):
The transform to check.
"""
try:
ast.literal_eval(f'"{transform}"')
ast.literal_eval(f"{transform}")
return False
except ValueError:
return True
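
Why dropping the added quotes matters: wrapping `transform` in quotes turned any input into a valid Python string literal, so the old check could never flag anything. Illustrative values:

```python
import ast

good = "{'m': 'male', 'f': 'female'}"
bad = "male"  # a bare name, not a Python literal

ast.literal_eval(f'"{good}"')  # old check: any input becomes a str literal, always passes
ast.literal_eval(good)         # new check: must parse as a real literal (here, a dict)
# ast.literal_eval(bad)        # raises ValueError -> transform flagged as invalid
```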
@@ -1135,7 +1181,7 @@ def mappingMatch(self):
) = match_columns_to_cdes(
dataset=self.inputDataset,
schema=self.targetCDEs,
nb_kept_matches=10,
nb_kept_matches=819,
matching_method=matchingMethod,
)
# Create a pandas model for the mapping table
@@ -1235,7 +1281,9 @@ def map(self):
with open(self.mappingFilePathLabel.text(), "r") as f:
mapping = json.load(f)
# Map the input dataset to the target CDEs
output_dataset = map_dataset(input_dataset, mapping)
output_dataset = map_dataset(
input_dataset, mapping, self.targetCDEs["code"].tolist()
)
# Save the output dataset
output_dataset.to_csv(
self.outputFilename[0],
