Skip to content

Commit

Permalink
Merge pull request #1 from HBPMedical/feat/initial-cdes-update-code
Browse files Browse the repository at this point in the history
feat: add initial code that was updating the CDEs
  • Loading branch information
sebastientourbier authored May 15, 2023
2 parents 1bd9927 + d2f144a commit 6724b0e
Show file tree
Hide file tree
Showing 14 changed files with 588 additions and 1 deletion.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

# Ignore Mac DSStore
.DS_Store

# Ignore python cache
__pycache__

50 changes: 49 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,50 @@
# mip_schema
# MIP Common Data Elements Metadata Schema Tool (`mip_schema`)

Open-source Python package to manipulate Common Data Elements Metadata Schema for the Medical Informatics Platform (MIP).

## How to install?

1. Clone the Git repository in your preferred directory:

```bash
$ cd "/preferred/directory"
$ git clone git@github.com:HBPMedical/mip_schema.git
```

2. Go to the cloned repository and create a new virtual Python 3.9 environment:

```bash
$ cd mip_schema
$ virtualenv venv -p python3.9
```

3. Activate the environment and install the package with Pip:

```bash
$ source ./venv/bin/activate
(venv) $ pip install -e .
```

## Available command-line tools

### `mip_update_cdes_json`

Script to update the CDES JSON/EXCEL file pair to make this process more reproducible.

**Usage**

In a terminal, you can run it with the following command:
```
$ mip_update_cdes_json \
--cdes_json_file "/path/to/CDEsMetadata.json" \
--cdes_excel_file "/path/to/myCDEs.xlsx" \
--command "remove_dashes_and_underscores" \
--output_suffix "updated" \
--log_file "/path/to/CDEs_update.log"
```
**Note:** You can use the option `-h` to show the full usage documentation.

Available commands:

- `remove_dashes_and_underscores`: Remove dashes and underscores.

4 changes: 4 additions & 0 deletions mip_schema/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""MIP Schema main module that stores the package version."""

VERSION = "0.0.1"

2 changes: 2 additions & 0 deletions mip_schema/cdes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""MIP Schema module that provides functions to manipulate Common Data Elements Metadata Schema (CDEs) files."""

168 changes: 168 additions & 0 deletions mip_schema/cdes/update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
"""Module for updating the CDEs."""

import logging
from openpyxl import Workbook
from openpyxl.utils import rows_from_range


# Codes/labels that must keep their dashes. This list is passed as the
# `exceptions` argument when "-" and "_" are replaced in the CDE group codes
# and labels; a value is protected only when it matches an entry exactly.
STRING_EXCEPTIONS = [
    "FDG-PET",
    "follow-up",
    "Past-surgery",
    "Video-EEG",
    "Z-scores",
]


def replace_chars(in_str: str, old_chars: list, new_chars: list) -> str:
    """Replace each old character by its paired new character in a string.

    The replacements are applied one after the other, in list order, so a
    later pair also sees the characters produced by an earlier pair.

    Parameters
    ----------
    in_str : str
        Input string.
    old_chars : list
        List of characters (or substrings) to be replaced.
    new_chars : list
        List of replacements; ``new_chars[i]`` replaces ``old_chars[i]``.
        If the two lists differ in length, the surplus entries of the longer
        one are ignored.

    Returns
    -------
    out_str : str
        Updated string.
    """
    out_str = in_str
    for old_c, new_c in zip(old_chars, new_chars):
        if old_c == "":
            # An empty search string matches nothing meaningful; skip it
            # instead of failing on it.
            continue
        out_str = out_str.replace(old_c, new_c)
    return out_str


def replace_characters_in_given_keys(
    in_dict: dict,
    in_wb: "Workbook",
    old_chars=("-", "_"),
    new_chars=(" ", " "),
    keys=("code", "label"),
    exceptions=None,
):
    """Replace characters in the given keys of a dictionary.

    A value is rewritten only when it is a string that contains no white
    space and is not listed in ``exceptions``. Every change made to the
    dictionary is also applied to the workbook so that both stay in sync.

    Parameters
    ----------
    in_dict : dict
        Input dictionary (describing the CDEs) to be updated. It is not
        modified; a shallow copy is returned.
    in_wb : openpyxl.Workbook
        Input workbook loaded from the CDEs in EXCEL format.
    old_chars : sequence of str
        Characters to be replaced.
    new_chars : sequence of str
        New characters used as replacement.
    keys : sequence of str
        Dictionary fields (keys) in which the characters are replaced.
    exceptions : list, optional
        Strings which must not be modified. ``None`` (the default) means
        that there are no exceptions.

    Returns
    -------
    out_dict : dict
        Updated dictionary describing the CDEs.
    out_wb : openpyxl.Workbook
        Updated workbook describing the CDEs.
    """
    out_dict = in_dict.copy()
    out_wb = in_wb
    for k in keys:
        value = in_dict.get(k)
        if not isinstance(value, str):
            # Missing key or non-text value: leave the entry as it is rather
            # than discarding the whole dictionary.
            logging.warning(f"Key '{k}' is missing or not a string: left unchanged")
            continue
        if " " in value:  # Replace only if no space in string
            continue
        if exceptions is not None and value in exceptions:
            continue
        try:
            new_value = replace_chars(value, old_chars, new_chars)
            if new_value != value:
                logging.info(f"-> Change: {value} -> {new_value}")
                out_wb = replace_chars_in_workbook(out_wb, value, new_value)
                # Commit the new value only once the workbook was updated.
                out_dict[k] = new_value
        except Exception as e:
            # Best effort: report the problem and keep this key unchanged.
            logging.warning(f"Exception raised: {e}")
    return (out_dict, out_wb)


def replace_chars_in_workbook(
    in_wb: "Workbook", in_str: str, out_str: str
) -> "Workbook":
    """Replace a string in the active sheet of a workbook.

    Every text cell of the active sheet that contains ``in_str`` gets each
    occurrence of it replaced by ``out_str``. Cells holding other kinds of
    values (numbers, empty cells, ...) are left untouched.

    Parameters
    ----------
    in_wb : openpyxl.Workbook
        Input workbook loaded from the CDEs in EXCEL format. It is modified
        in place (no copy is made).
    in_str : str
        String to search for.
    out_str : str
        String used as replacement.

    Returns
    -------
    out_wb : openpyxl.Workbook
        The same workbook object, updated.
    """
    # The workbook is edited in place; this is an alias, not a copy
    out_wb = in_wb

    if not in_str:
        # An empty search string would match every cell; nothing to do
        return out_wb

    # Select the sheet to modify
    sheet = out_wb.active

    # Loop through the actual cell objects so that the new text is written
    # back into the sheet
    for row in sheet.iter_rows():
        for cell in row:
            value = cell.value
            if isinstance(value, str) and in_str in value:
                cell.value = value.replace(in_str, out_str)
    return out_wb


def recursive_replace_dashes_and_underscores(cdes_data, cdes_wb):
    """Replace the dashes and underscores in the label and code of CDEs' groups.

    The entry itself is processed when it has both a ``"label"`` and a
    ``"code"`` key, then the function recurses into every item of its
    ``"groups"`` list. Other keys are not visited.

    Parameters
    ----------
    cdes_data : dict
        Dictionary loaded from CDEs file in JSON format. Anything that is
        not a dictionary is returned unchanged.
    cdes_wb : openpyxl.Workbook
        Workbook loaded from the CDEs in EXCEL format.

    Returns
    -------
    cdes_data : dict
        Updated output dictionary describing the CDEs for the given federation.
    cdes_wb : openpyxl.Workbook
        Updated output workbook describing the CDEs for the given federation.
    """
    if not isinstance(cdes_data, dict):
        # Nothing to rename here: hand the value back untouched so that it is
        # not lost from the enclosing "groups" list.
        return (cdes_data, cdes_wb)

    if "label" in cdes_data and "code" in cdes_data:
        updated_data, updated_wb = replace_characters_in_given_keys(
            cdes_data,
            cdes_wb,
            old_chars=["-", "_"],
            new_chars=[" ", " "],
            keys=["code", "label"],
            exceptions=STRING_EXCEPTIONS,
        )
        # Defensive: only adopt a non-empty result, so that a replacement
        # that went wrong can never wipe the entry or its sub-groups.
        if updated_data:
            cdes_data, cdes_wb = updated_data, updated_wb

    # Look the groups up on the current dictionary (not on the input one)
    groups = cdes_data.get("groups")
    if isinstance(groups, list):
        for i, item in enumerate(groups):
            groups[i], cdes_wb = recursive_replace_dashes_and_underscores(
                item, cdes_wb
            )
    return (cdes_data, cdes_wb)
2 changes: 2 additions & 0 deletions mip_schema/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""MIP Schema module that stores the different scripts with command line interface (CLI)."""

84 changes: 84 additions & 0 deletions mip_schema/cli/mip_update_cdes_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Standalone script which updates the CDEs of the federations of the Medical Informatics Platform (MIP)."""

import sys
from pathlib import Path
import logging

from mip_datatools.io import (
load_cdes_json,
write_cdes_json,
load_cdes_excel,
generate_output_path,
write_cdes,
write_cdes_excel,
)
from mip_datatools.parser import create_parser
from mip_datatools.logger import setup_logging
from mip_datatools.cdes.update import recursive_replace_dashes_and_underscores


def main():
    """Run the CDEs update workflow from the command line.

    Parses the script arguments, loads the CDEs JSON/EXCEL pair, applies the
    requested command and writes the resulting pair next to generated output
    paths.

    Returns
    -------
    exit_code : int
        Always 0 when the function runs to completion; failures surface as
        exceptions raised by the helpers it calls.
    """
    # NOTE(review): the helpers used below are imported from `mip_datatools`
    # at the top of this script, while this package is `mip_schema` —
    # confirm this is the intended dependency.
    args = create_parser().parse_args()

    # Fall back to the folder of the input JSON file as output directory
    if args.output_dir is None:
        args.output_dir = Path(args.cdes_json_file).parent

    # Fall back to a log file located in the output directory
    if args.log_file is None:
        args.log_file = (Path(args.output_dir) / "cdes_update.log").absolute()
    setup_logging(args.log_file)

    logging.info(f"Starting script with arguments: {args}")

    # Read both representations of the CDEs
    cdes_data = load_cdes_json(args.cdes_json_file)
    cdes_wb = load_cdes_excel(args.cdes_excel_file)

    # Replace "-" and "_" characters by white space
    if args.command == "remove_dashes_and_underscores":
        cdes_data, cdes_wb = recursive_replace_dashes_and_underscores(
            cdes_data, cdes_wb
        )

    # Build the output path of the json file first, then of the excel file
    out_cdes_json_fname, out_cdes_excel_fname = (
        generate_output_path(
            in_file,
            output_dir=args.output_dir,
            output_suffix=args.output_suffix,
        )
        for in_file in (args.cdes_json_file, args.cdes_excel_file)
    )

    # Save the edited CDEs to both files
    write_cdes(
        cdes_data,
        cdes_wb,
        out_cdes_json_fname,
        out_cdes_excel_fname,
        out_json_indent=args.output_json_indent,
    )
    return 0


if __name__ == "__main__":
sys.exit(main())
Loading

0 comments on commit 6724b0e

Please sign in to comment.