-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from HBPMedical/feat/initial-cdes-update-code
feat: add initial code that was updating the CDEs
- Loading branch information
Showing
14 changed files
with
588 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
|
||
# Ignore Mac DSStore | ||
.DS_Store | ||
|
||
# Ignore python cache | ||
__pycache__ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,50 @@ | ||
# mip_schema | ||
# MIP Common Data Elements Metadata Schema Tool (`mip_schema`) | ||
|
||
Open-source Python package to manipulate Common Data Elements Metadata Schema for the Medical Informatics Platform (MIP). | ||
|
||
## How to install? | ||
|
||
1. Clone the Git repository in your preferred directory: | ||
|
||
```bash | ||
$ cd "/preferred/directory" | ||
$ git clone git@github.com:HBPMedical/mip_schema.git | ||
``` | ||
|
||
2. Go to the cloned repository and create a new virtual Python 3.9 environment: | ||
|
||
```bash | ||
$ cd mip_schema | ||
$ virtualenv venv -p python3.9 | ||
``` | ||
|
||
3. Activate the environment and install the package with Pip: | ||
|
||
```bash | ||
$ source ./venv/bin/activate | ||
(venv) $ pip install -e . | ||
``` | ||
|
||
## Available command-line tools | ||
|
||
### `mip_update_cdes_json` | ||
|
||
Script to update the CDES JSON/EXCEL file pair to make this process more reproducible. | ||
|
||
**Usage** | ||
|
||
In a terminal, you can run it with the following command: | ||
``` | ||
$ mip_update_cdes_json \ | ||
--cdes_json_file "/path/to/CDEsMetadata.json" \ | ||
--cdes_excel_file "/path/to/myCDEs.xlsx" \ | ||
--command "remove_dashes_and_underscores" \ | ||
--output_suffix "updated" \ | ||
--log_file "/path/to/CDEs_update.log" | ||
``` | ||
**Note:** You can use the option `-h` to show the full usage documentation. | ||
|
||
Available commands: | ||
|
||
- `remove_dashes_and_underscores`: Remove dashes and underscores. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
"""MIP Schema main module that stores the package version.""" | ||
|
||
# Package version string.
VERSION = "0.0.1"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
"""MIP Schema module that provides functions to manipulate Common Data Elements Metadata Schema (CDEs) files.""" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
"""Module for updating the CDEs.""" | ||
|
||
import logging | ||
from openpyxl import Workbook | ||
from openpyxl.utils import rows_from_range | ||
|
||
|
||
# Strings that legitimately contain a dash: they are passed as the
# ``exceptions`` list so the dash/underscore replacement leaves them untouched.
STRING_EXCEPTIONS = ["FDG-PET", "follow-up", "Past-surgery", "Video-EEG", "Z-scores"]
|
||
|
||
def replace_chars(in_str: str, old_chars: list, new_chars: list):
    """Substitute each old character by its paired new character in a string.

    The two lists are walked in lockstep: the i-th entry of ``old_chars`` is
    replaced by the i-th entry of ``new_chars``. Substitutions are applied one
    after the other, so a later pair also sees the result of earlier pairs.
    If the lists differ in length, the extra entries of the longer one are
    ignored.

    Parameters
    ----------
    in_str : str
        Input string.
    old_chars : list
        List of characters to be replaced.
    new_chars : list
        List of new characters used as replacement.

    Returns
    -------
    out_str : str
        Updated string.
    """
    updated = in_str
    substitutions = zip(old_chars, new_chars)
    for target, replacement in substitutions:
        # Cut the string at every occurrence of the target, then glue the
        # fragments back together with the replacement in between.
        fragments = updated.split(target)
        updated = replacement.join(fragments)
    return updated
|
||
|
||
def replace_characters_in_given_keys(
    in_dict: dict,
    in_wb: Workbook,
    old_chars=("-", "_"),
    new_chars=(" ", " "),
    keys=("code", "label"),
    exceptions=None,
):
    """Replace a list of old and new characters in the given keys of a dictionary.

    A value is only rewritten when it is a string that contains no space and
    is not listed in ``exceptions``. Every value that actually changes is
    logged and the same change is propagated to the workbook.

    Parameters
    ----------
    in_dict : dict
        Input dictionary (describing the CDEs) to be updated. It is not
        modified; a shallow copy is returned.
    in_wb : openpyxl.Workbook
        Input workbook loaded from the CDEs in EXCEL format.
    old_chars : sequence of str
        Characters to be replaced.
    new_chars : sequence of str
        New characters used as replacement, paired by position with
        ``old_chars``.
    keys : sequence of str
        Dictionary fields (keys) in which the characters are replaced.
        Keys that are missing or whose value is not a string are skipped.
    exceptions : list, optional
        Strings which must not be modified. ``None`` (default) means that
        there are no exceptions.

    Returns
    -------
    out_dict : dict
        Updated dictionary describing the CDEs.
    out_wb : openpyxl.Workbook
        Updated workbook describing the CDEs.

    Notes
    -----
    If an unexpected error occurs, a warning is logged and ``({}, None)`` is
    returned.
    """
    out_dict = in_dict.copy()
    out_wb = in_wb
    try:
        for k in keys:
            value = in_dict.get(k)
            if not isinstance(value, str):
                # Nothing sensible to rewrite; keep the entry as it is rather
                # than discarding the whole dictionary.
                logging.warning(f"Skipping key '{k}': missing or not a string")
                continue
            if " " in value:  # Replace only if no space in string
                continue
            # ``exceptions=None`` means "no exceptions": the replacement must
            # still happen in that case.
            if exceptions is not None and value in exceptions:
                continue
            out_dict[k] = replace_chars(value, old_chars, new_chars)
            if out_dict[k] != value:
                logging.info(f"-> Change: {value} -> {out_dict[k]}")
                out_wb = replace_chars_in_workbook(out_wb, value, out_dict[k])
        return (out_dict, out_wb)
    except Exception as e:
        logging.warning(f"Exception raised: {e}")
        return {}, None
|
||
|
||
def replace_chars_in_workbook(in_wb: "Workbook", in_str: str, out_str: str):
    """Replace a string in the active sheet of a workbook.

    Every cell of the active sheet whose value is a string containing
    ``in_str`` gets each occurrence of ``in_str`` replaced by ``out_str``;
    the rest of the cell text is preserved. Cells holding non-string values
    (numbers, dates, empty cells) are left untouched.

    Parameters
    ----------
    in_wb : openpyxl.Workbook
        Input workbook loaded from the CDEs in EXCEL format. It is modified
        in place (no copy is made).
    in_str : str
        String to search for. An empty string is a no-op.
    out_str : str
        Replacement string.

    Returns
    -------
    out_wb : openpyxl.Workbook
        The same workbook object as ``in_wb``, after the update.
    """
    # Same object as the input: the workbook is updated in place.
    out_wb = in_wb

    # Select the sheet to modify
    sheet = out_wb.active

    # No sheet to edit, or nothing to search for (an empty pattern would
    # match every cell).
    if sheet is None or not in_str:
        return out_wb

    # Walk the actual cell objects (not their coordinates) so that the new
    # value is written back into the sheet.
    for row in sheet.iter_rows():
        for cell in row:
            value = cell.value
            if isinstance(value, str) and in_str in value:
                cell.value = value.replace(in_str, out_str)
    return out_wb
|
||
|
||
def recursive_replace_dashes_and_underscores(cdes_data, cdes_wb):
    """Replace the dashes and underscores in the label and code of CDEs' groups.

    The entry itself is processed when it has both a ``"label"`` and a
    ``"code"`` key, then the function recurses into every item of its
    ``"groups"`` list (if any). Values listed in ``STRING_EXCEPTIONS`` are
    left unchanged.

    Parameters
    ----------
    cdes_data : dict
        Dictionary loaded from CDEs file in JSON format.
    cdes_wb : openpyxl.Workbook
        Workbook loaded from the CDEs in EXCEL format.

    Returns
    -------
    cdes_data : dict
        Updated output dictionary describing the CDEs for the given
        federation, or ``None`` if the input is not a dictionary.
    cdes_wb : openpyxl.Workbook
        Updated output workbook describing the CDEs for the given federation.
    """
    if not isinstance(cdes_data, dict):
        return (None, cdes_wb)

    if "label" in cdes_data and "code" in cdes_data:
        updated_data, updated_wb = replace_characters_in_given_keys(
            cdes_data,
            cdes_wb,
            old_chars=["-", "_"],
            new_chars=[" ", " "],
            keys=["code", "label"],
            exceptions=STRING_EXCEPTIONS,
        )
        if updated_data:
            cdes_data, cdes_wb = updated_data, updated_wb
        else:
            # The helper reports a failure with an empty dictionary and no
            # workbook. Keep the current entry and workbook so one bad entry
            # neither wipes the data nor breaks the rest of the traversal.
            logging.warning("Could not update CDE entry; keeping it unchanged")

    # Membership is tested on the current dictionary (not on a view taken
    # before the update) so the lookup below cannot fail.
    if "groups" in cdes_data:
        for i, item in enumerate(cdes_data["groups"]):
            (
                cdes_data["groups"][i],
                cdes_wb,
            ) = recursive_replace_dashes_and_underscores(item, cdes_wb)
    return (cdes_data, cdes_wb)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
"""MIP Schema module that stores the different scripts with command line interface (CLI).""" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
"""Standalone script which updates the CDEs of the federations of the Medical Informatics Platform (MIP).""" | ||
|
||
import sys | ||
from pathlib import Path | ||
import logging | ||
|
||
from mip_datatools.io import ( | ||
load_cdes_json, | ||
write_cdes_json, | ||
load_cdes_excel, | ||
generate_output_path, | ||
write_cdes, | ||
write_cdes_excel, | ||
) | ||
from mip_datatools.parser import create_parser | ||
from mip_datatools.logger import setup_logging | ||
from mip_datatools.cdes.update import recursive_replace_dashes_and_underscores | ||
|
||
|
||
def main():
    """Run the CDEs update command-line script.

    Parses the command-line arguments, loads the CDEs JSON/EXCEL file pair,
    applies the requested command, and writes the result next to suffixed
    output paths.

    Returns
    -------
    exit_code : {0, 1}
        Exit code (0: success / 1: error)
    """
    # Parse the script arguments
    args = create_parser().parse_args()

    # Default the output directory to the folder of the input JSON file
    if args.output_dir is None:
        args.output_dir = Path(args.cdes_json_file).parent

    # Default the log file to "cdes_update.log" inside the output directory
    if args.log_file is None:
        args.log_file = (Path(args.output_dir) / "cdes_update.log").absolute()
    setup_logging(args.log_file)

    logging.info(f"Starting script with arguments: {args}")

    # Load both representations of the CDEs
    cdes_data = load_cdes_json(args.cdes_json_file)
    cdes_wb = load_cdes_excel(args.cdes_excel_file)

    # Replace "-" and "_" characters by white space
    if args.command == "remove_dashes_and_underscores":
        cdes_data, cdes_wb = recursive_replace_dashes_and_underscores(
            cdes_data, cdes_wb
        )

    # Both output paths share the same directory and suffix settings
    output_options = {
        "output_dir": args.output_dir,
        "output_suffix": args.output_suffix,
    }
    out_cdes_json_fname = generate_output_path(args.cdes_json_file, **output_options)
    out_cdes_excel_fname = generate_output_path(
        args.cdes_excel_file, **output_options
    )

    # Write edited CDEs to json and excel files
    write_cdes(
        cdes_data,
        cdes_wb,
        out_cdes_json_fname,
        out_cdes_excel_fname,
        out_json_indent=args.output_json_indent,
    )
    return 0
|
||
|
||
# Run the script only when executed directly; the value returned by main()
# is used as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
Oops, something went wrong.