diff --git a/.gitignore b/.gitignore
index 1d1447de..a7228ab3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,7 @@ docs/cardea.rst
 docs/cardea.*.rst
 docs/modules.rst
 docs/api
+docs/api_reference/api
 
 # PyBuilder
 target/
@@ -113,3 +114,7 @@ ENV/
 
 # IntelliJ Idea
 .idea/
+
+# output
+data/
+*.csv
diff --git a/Makefile b/Makefile
index e7f11bd4..482857ee 100644
--- a/Makefile
+++ b/Makefile
@@ -113,7 +113,7 @@ test: ## run tests quickly with the default Python
 
 .PHONY: test-all
 test-all: ## run tests on every Python version with tox
-	tox
+	tox -r
 
 .PHONY: test-readme
 test-readme: ## run the readme snippets
diff --git a/README.md b/README.md
index 75fef5b4..264a266a 100644
--- a/README.md
+++ b/README.md
@@ -64,30 +64,34 @@ This will pull and install the latest stable release from [PyPi](https://pypi.or
 
 In this short tutorial we will guide you through a series of steps that will help you get Cardea started.
 
-First, load the core class to work with:
+First, we download the dataset we will be working with. Here in this example, we are loading a pre-processed version of the [Kaggle dataset: Medical Appointment No Shows](https://www.kaggle.com/joniarroba/noshowappointments).
+
+We can use a helper function to download the data.
 
 ```python3
-from cardea import Cardea
+from cardea.data import download
 
-cardea = Cardea()
+data_path = download('kaggle')
 ```
 
-We then seamlessly plug in our data. Here in this example, we are loading a pre-processed version of the [Kaggle dataset: Medical Appointment No Shows](https://www.kaggle.com/joniarroba/noshowappointments).
-To use this dataset download the data from here then unzip it in the root directory, or run the command:
+Alternatively, you can download the dataset directly from the S3 bucket.
 
 ```bash
 curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip -d kaggle kaggle.zip
 ```
+
-To load the data, supply the ``data`` to the loader using the following command:
+
+Then, we instantiate a Cardea instance by supplying the ``data_path`` to the initializer and choosing the format of the data.
 
 ```python3
-cardea.load_entityset(data='kaggle')
+from cardea import Cardea
+
+cardea = Cardea(data_path=data_path,
+                fhir=True)
 ```
-> :bulb: To load local data, pass the folder path to ``data``.
 
-To verify that the data has been loaded, you can find the loaded entityset by viewing ``cardea.es`` which should output the following:
+To verify that the data has been loaded, you can find the loaded entityset by viewing ``cardea.entityset`` which should output the following:
 
-```bash
+```
 Entityset: kaggle
   Entities:
     Address [Rows: 81, Columns: 2]
@@ -108,23 +112,25 @@ Entityset: kaggle
     Patient.address -> Address.object_id
 
-The output shown represents the entityset data structure where ``cardea.es`` is composed of entities and relationships. You can read more about entitysets [here](https://mlbazaar.github.io/Cardea/basic_concepts/data_loading.html).
+The output shown represents the entityset data structure where ``cardea.entityset`` is composed of entities and relationships. You can read more about entitysets [here](https://mlbazaar.github.io/Cardea/basic_concepts/data_loading.html).
 
-From there, you can select the prediction problem you aim to solve by specifying the name of the class, which in return gives us the ``label_times`` of the problem.
+From there, you can select the prediction problem you aim to solve by specifying the name of the function, which in turn gives us the ``label_times`` of the problem.
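+
+To see which labeling functions are available, you can list them first. A minimal sketch (the exact set of labelers depends on the installed version):
+
+```python3
+# list the available data labelers, e.g. {'appointment_no_show', 'mortality_prediction', ...}
+cardea.list_labelers()
+```
+
+With a labeler chosen, we generate the ``label_times``:
+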
 ```python3
-label_times = cardea.select_problem('MissedAppointment')
+from cardea.data_labeling import appointment_no_show
+
+label_times = cardea.label(appointment_no_show, subset=1000) # labeling only a subset of the data
 ```
 
-``label_times`` summarizes for each instance in the dataset (1) what is its corresponding label of the instance and (2) what is the time index that indicates the timespan allowed for calculating features that pertain to each instance in the dataset.
+``label_times`` summarizes, for each instance in the dataset, (1) the label of that instance and (2) the time index indicating the timespan allowed for calculating the features that pertain to it.
 
-```bash
-     cutoff_time  instance_id      label
-0 2015-11-10 07:13:56      5030230     noshow
-1 2015-12-03 08:17:28      5122866  fulfilled
-2 2015-12-07 10:40:59      5134197  fulfilled
-3 2015-12-07 10:42:42      5134220     noshow
-4 2015-12-07 10:43:01      5134223     noshow
+```
+   identifier                time  label
+0     5030230 2015-11-10 07:13:56   True
+1     5122866 2015-12-03 08:17:28  False
+2     5134197 2015-12-07 10:40:59  False
+3     5134220 2015-12-07 10:42:42   True
+4     5134223 2015-12-07 10:43:01   True
 ```
 
 You can read more about ``label_times`` [here](https://mlbazaar.github.io/Cardea/basic_concepts/machine_learning_tasks.html).
@@ -134,15 +140,14 @@ Then, you can perform the AutoML steps and take advantage of Cardea.
 
 Cardea extracts features through automated feature engineering by supplying the ``label_times`` pertaining to the problem you aim to solve
 
 ```python3
-feature_matrix = cardea.generate_features(label_times[:1000])
+feature_matrix = cardea.featurize(label_times)
 ```
 
-> :warning: Featurizing the data might take a while depending on the size of the data. For demonstration, we only featurize the first 1000 records.
+> :warning: Featurizing the data might take a while depending on the size of the data.
 
 Once we have the features, we can now split the data into training and testing
 
 ```python3
-y = list(feature_matrix.pop('label'))
-
+y = feature_matrix.pop('label').values
 X = feature_matrix.values
 
 X_train, X_test, y_train, y_test = cardea.train_test_split(
@@ -152,21 +157,21 @@ X_train, X_test, y_train, y_test = cardea.train_test_split(
 
-Now that we have our feature matrix properly divided, we can use to train our machine learning pipeline, Modeling, optimizing hyperparameters and finding the most optimal model
+Now that we have our feature matrix properly divided, we can use it to train our machine learning pipeline: modeling, optimizing hyperparameters, and finding the most optimal model
 
 ```python3
-cardea.select_pipeline('Random Forest')
+cardea.set_pipeline('Random Forest')
 cardea.fit(X_train, y_train)
 y_pred = cardea.predict(X_test)
 ```
 
 Finally, you can evaluate the performance of the model
 
 ```python3
-cardea.evaluate(X, y, test_size=0.2, shuffle=True)
+cardea.evaluate(X_test, y_test, shuffle=True)
 ```
 
 which returns the scoring metric depending on the type of problem
 
-```bash
-{'Accuracy': 0.75,
- 'F1 Macro': 0.5098039215686274,
- 'Precision': 0.5183001719479243,
- 'Recall': 0.5123528436411872}
+```
+Accuracy     0.75
+F1 Macro     0.5098
+Precision    0.5183
+Recall       0.5123
 ```
 
 # Citation
@@ -174,7 +179,7 @@ If you use Cardea for your research, please consider citing the following paper:
 
 Sarah Alnegheimish; Najat Alrashed; Faisal Aleissa; Shahad Althobaiti; Dongyu Liu; Mansour Alsaleh; Kalyan Veeramachaneni. [Cardea: An Open Automated Machine Learning Framework for Electronic Health Records](https://arxiv.org/abs/2010.00509). [IEEE DSAA 2020](https://ieeexplore.ieee.org/document/9260104).
-```bash
+```
 @inproceedings{alnegheimish2020cardea,
   title={Cardea: An Open Automated Machine Learning Framework for Electronic Health Records},
   author={Alnegheimish, Sarah and Alrashed, Najat and Aleissa, Faisal and Althobaiti, Shahad and Liu, Dongyu and Alsaleh, Mansour and Veeramachaneni, Kalyan},
diff --git a/cardea/core.py b/cardea/core.py
index edca198e..a0394e44 100644
--- a/cardea/core.py
+++ b/cardea/core.py
@@ -3,66 +3,54 @@
-This module defines the Cardea Class, which is responsible for the
-tying all components together, as well as the interact with them.
+This module defines the Cardea Class, which is responsible for tying all
+components together, as well as interacting with them.
 """
+import json
 import logging
 import os
 import pickle
-from inspect import isclass
-from io import BytesIO
-from urllib.request import urlopen
-from zipfile import ZipFile
+from functools import partial
+from inspect import getfullargspec
+from types import FunctionType
+from typing import List, Union
 
-import featuretools as ft
+import numpy as np
 import pandas as pd
+from mlblocks import MLPipeline
 
 import cardea
-from cardea.data_loader import EntitySetLoader, load_mimic_data
-from cardea.featurization import Featurization
+from cardea.data_assembling import EntitySetLoader, load_mimic_data
+from cardea.data_labeling import DataLabeler
+from cardea.featurizing import Featurization
 from cardea.modeling import Modeler
-from cardea.problem_definition import (
-    DiagnosisPrediction, LengthOfStay, MissedAppointment, MortalityPrediction,
-    ProlongedLengthOfStay, Readmission)
 
 LOGGER = logging.getLogger(__name__)
 
-DATA_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)),
-    'data'
-)
-BUCKET = 'dai-cardea'
-S3_URL = 'https://{}.s3.amazonaws.com/{}'
+DEFAULT_PIPELINE = 'XGB'
+DEFAULT_METRICS = ["Accuracy", "F1 Macro", "Precision", "Recall"]
 
 
-class Cardea():
-    """An interface class that ties the end-to-end system together.
+class Cardea:
+    """Cardea Class.
+
+    The Cardea Class provides the main functionalities
+    to load data, create prediction tasks, and build
+    pipelines.
 
     Args:
-        es_loader (EntitySetLoader):
-            An entityset loader.
-        featurization (Featurization):
-            A featurization class.
-        modeler (Modeler):
-            A modeling class.
-        problems (list):
-            A list of currently available prediction problems.
-        chosen_problem (str):
-            The selected prediction problem or regression.
-        es (featuretools.EntitySet):
-            The loaded entityset.
-        target_entity (str):
-            The target entity for featurization.
+        data_path (str):
+            Path or name of the dataset to load into an entityset.
+        fhir (bool):
+            Indicator of whether to use FHIR or MIMIC schema.
+        pipeline (str, dict or MLPipeline):
+            Pipeline to use. It can be passed as:
+                * An ``str`` with a path to a JSON file.
+                * An ``str`` with the name of a registered pipeline.
+                * An ``MLPipeline`` instance.
+                * A ``dict`` with an ``MLPipeline`` specification.
+        hyperparameters (dict):
+            Additional hyperparameters to set to the pipeline.
     """
 
-    def __init__(self):
-
-        self.es_loader = EntitySetLoader()
-        self.featurization = Featurization()
-
-        self.es = None
-        self.chosen_problem = None
-        self.target_entity = None
-        self.modeler = None
-
-    def load_entityset(self, data, fhir=True):
+    def _load_entityset(self, data_path, fhir):
         """Returns an entityset loaded with .csv files in data.
 
         Load the given dataset into an entityset. The dataset
@@ -70,169 +58,177 @@ def load_entityset(self, data, fhir=True):
         Args:
-            data (str):
-                A directory of all .csv files that should be loaded. To load demo dataset,
-                pass the name of the dataset "kaggle" or "mimic".
+            data_path (str):
+                Path or name of the dataset to load into an entityset, or
+                a preloaded entityset.
             fhir (bool):
-                An indicator of whether to use FHIR or MIMIC schema.
+                An indicator of whether FHIR or MIMIC schema is used. This parameter is
+                ignored when loading demo data. Default is ``True``.
 
         Returns:
             featuretools.EntitySet:
                 An entityset with loaded data.
         """
-        demo = ['kaggle', 'mimic']
-        if not os.path.exists(data) and data in demo:
-            data = self.download_demo(data)
-
+        function = load_mimic_data
         if fhir:
-            self.es = self.es_loader.load_data_entityset(data)
-        else:
-            self.es = load_mimic_data(data)
+            function = self._es_loader.load_data_entityset
 
-    @staticmethod
-    def download_demo(name, data_path=DATA_PATH):
-        data_path = os.path.join(data_path, name)
-        os.makedirs(data_path, exist_ok=True)
+        LOGGER.info("Loading data %s", data_path)
 
-        url = S3_URL.format(BUCKET, '{}.zip'.format(name))
-        compressed = ZipFile(BytesIO(urlopen(url).read()))
+        return function(data_path)
 
-        LOGGER.info('Downloading dataset %s from %s', name, url)
-        for file in compressed.namelist():
-            filename = os.path.join(data_path, file)
-            csv_file = compressed.open(file)
+    def _set_modeler(self):
+        pipeline = self._pipeline
+        if isinstance(pipeline, str) and os.path.isfile(pipeline):
+            with open(pipeline) as json_file:
+                pipeline = json.load(json_file)
 
-            data = pd.read_csv(csv_file, dtype=str)
-            data.to_csv(filename, index=False)
+        mlpipeline = MLPipeline(pipeline)
+        if self._hyperparameters:
+            mlpipeline.set_hyperparameters(self._hyperparameters)
 
-        return data_path
+        self._modeler = Modeler(mlpipeline, self._type)
 
-    def list_problems(self):
-        """Returns a list of the currently available problems.
+    def __init__(self, data_path: str = None, labeler: FunctionType = None, fhir: bool = True,
+                 pipeline: Union[str, dict, MLPipeline] = None, hyperparameters: dict = None):
+        self._pipeline = pipeline or DEFAULT_PIPELINE
+        self._hyperparameters = hyperparameters
+
+        self._es_loader = EntitySetLoader()
+        self._featurization = Featurization()
+        self._modeler = None
+
+        self._fhir = fhir
+        self._target = None
+
+        # load dataset
+        self.entityset = self._load_entityset(data_path, fhir)
+
+    def list_labelers(self) -> set:
+        """Returns a list of the currently available data labelers.
 
         Returns:
             list:
-                A list of the available problems.
+                A list of the available data labelers.
         """
-        problems = set([])
-        for attribute_string in dir(cardea.problem_definition):
-            attribute = getattr(cardea.problem_definition, attribute_string)
-            if isclass(attribute):
-                if attribute.__name__ and attribute.__name__ != 'ProblemDefinition':
-                    problems.add(attribute.__name__)
+        labelers = set()
+        for labeler_string in dir(cardea.data_labeling):
+            labeler = getattr(cardea.data_labeling, labeler_string)
+            if isinstance(labeler, FunctionType):
+                labelers.add(labeler.__name__)
 
-        return problems
+        return labelers
 
-    def select_problem(self, selection, parameter=None):
-        """Select a prediction problem and extract information.
+    def label(self, labeler: FunctionType,
+              parameters: dict = None,
+              subset: Union[list, float] = None,
+              verbose: bool = False) -> pd.DataFrame:
+        """Create label times using the data labeler.
 
-        Update the select_problem attribute and generate the cutoff times,
-        the target entity and update the entityset.
+        Use the labeling function to generate label times.
 
         Args:
-            selection (str):
-                Name of the chosen prediction problem.
-            parameters (dict):
+            labeler (function):
+                Function that defines the prediction task; it should return a
+                tuple of the labeling function, the dataframe, and the name of the
+                target entity.
+            parameters (dict):
                 Variables to change the default parameters, if any.
+            subset (list, float or int):
+                If float, the fraction of the data to label; if int, the number
+                of instances to label; if list, the instances for which to
+                calculate the label.
+            verbose (bool):
+                Indicate verbosity of the labeler.
 
         Returns:
-            featuretools.EntitySet, str, pandas.DataFrame:
-                * An updated EntitySet if a new column is generated.
-                * A string indicating the selected target entity.
-                * A dataframe of cutoff times and their target labels.
+            pandas.DataFrame:
+                A dataframe of cutoff times and their target labels.
         """
-        LOGGER.info("Selecting %s prediction problem", selection)
-
-        # problem selection
-        if selection == 'LengthOfStay':
-            self.chosen_problem = LengthOfStay()
-
-        elif selection == 'MortalityPrediction':
-            self.chosen_problem = MortalityPrediction()
-
-        elif selection == 'MissedAppointment':
-            self.chosen_problem = MissedAppointment()
-
-        elif selection == 'ProlongedLengthOfStay' and parameter:
-            self.chosen_problem = ProlongedLengthOfStay(parameter)
-
-        elif selection == 'ProlongedLengthOfStay':
-            self.chosen_problem = ProlongedLengthOfStay()
+        if parameters:
+            labeler = partial(labeler, **parameters)
 
-        elif selection == 'Readmission' and parameter:
-            self.chosen_problem = Readmission(parameter)
-
-        elif selection == 'Readmission':
-            self.chosen_problem = Readmission()
-
-        elif selection == 'DiagnosisPrediction' and parameter:
-            self.chosen_problem = DiagnosisPrediction(parameter)
-
-        elif selection == 'DiagnosisPrediction':
-            raise ValueError('unspecified diagnosis code')
-
-        else:
-            raise ValueError('{} is not a defined problem'.format(selection))
+        LOGGER.info("Using labeler %s", str(labeler.__name__))
+        data_labeler = DataLabeler(labeler)
 
         # target label calculation
-        self.es, self.target_entity, cutoff = self.chosen_problem.generate_cutoff_times(self.es)
+        label_times, self._type, self._meta = data_labeler.generate_label_times(
+            self.entityset, subset=subset, verbose=verbose)
 
-        # set default pipeline
-        if self.chosen_problem.prediction_type == "classification":
-            pipeline = "Random Forest"
-        else:
-            pipeline = "Random Forest Regressor"
-
-        self.modeler = Modeler(pipeline, self.chosen_problem.prediction_type)
-
-        return cutoff
+        # set modeler if pipeline defined
+        if self._pipeline:
+            self._set_modeler()
 
-    def list_feature_primitives(self):
-        """Returns built-in primitive in Featuretools.
+        return label_times
 
-        Returns:
-            pandas.DataFrame:
-                A dataframe that lists and describes each built-in primitives.
-        """
-        return ft.list_primitives()
-
-    def generate_features(self, cutoff):
+    def featurize(self, label_times: pd.DataFrame,
+                  seed_features: Union[bool, list] = None, max_depth: int = 1,
+                  max_features: int = -1, n_jobs: int = 1,
+                  verbose: bool = False) -> pd.DataFrame:
-        """Returns a the calculated feature matrix.
+        """Returns the calculated feature matrix.
 
         Args:
-            es (featuretools.EntitySet):
-                An entityset that holds data.
-            cutoff (pandas.DataFrame):
+            label_times (pandas.DataFrame):
                 A dataframe that indicates cutoff time for each instance.
+            max_depth (int):
+                Maximum allowed depth of features.
+            max_features (int):
+                Cap the number of generated features to this number. If -1, no limit.
+            n_jobs (int):
+                Number of parallel processes to use when calculating feature matrix.
+            seed_features (bool or list):
+                List of manually defined features to use. If boolean, then use previously
+                created features as seed.
+            verbose (bool):
+                Indicate verbosity of the featurization.
 
         Returns:
-            pandas.DataFrame, list:
-                * The generated feature matrix.
- * List of feature definitions in the feature matrix. + pandas.DataFrame: + Generated feature matrix. """ - - fm_encoded, _ = self.featurization.generate_feature_matrix( - self.es, self.target_entity, cutoff) - fm_encoded = fm_encoded.reset_index(drop=True) - return fm_encoded - - def select_pipeline(self, pipeline): + if isinstance(seed_features, bool): + seed_features = self._fm_defs + + method = self._featurization.generate_feature_matrix + target = self._meta.get('entity') + arguments = set(getfullargspec(method)[0]) - set(getfullargspec(self.featurize)[0]) + kwargs = {k: self._meta.get(k) for k in arguments if self._meta.get(k) is not None} + fm, self._fm_defs = method( + self.entityset, target, label_times, + seed_features=seed_features, max_depth=max_depth, + max_features=max_features, n_jobs=n_jobs, verbose=verbose, **kwargs) + + return fm + + def set_pipeline(self, pipeline: Union[str, dict, MLPipeline], + hyperparameters: dict = None) -> None: """Select a pipeline. Args: - pipeline (MLPipeline or str): - A pipeline instance or the name/path of a pipeline. + pipeline (str, dict or MLPipeline): + Pipeline to use. It can be passed as: + * An ``str`` with a path to a JSON file. + * An ``str`` with the name of a registered pipeline. + * An ``MLPipeline`` instance. + * A ``dict`` with an ``MLPipeline`` specification. + hyperparameters (dict): + Additional hyperparameters to set to the pipeline. """ - LOGGER.info("Selecting %s pipeline", pipeline) - self.modeler = Modeler(pipeline, self.chosen_problem.prediction_type) + LOGGER.info("Setting %s pipeline", pipeline) - def train_test_split(self, X, y, test_size, shuffle): + self._pipeline = pipeline + self._hyperparameters = hyperparameters + self._set_modeler() + + def train_test_split(self, X: Union[np.ndarray, pd.DataFrame], + y: Union[np.ndarray, pd.Series, list], test_size: float = 0.2, + shuffle: bool = True) -> List[Union[pd.DataFrame, np.ndarray]]: """Split the training dataset and the testing dataset. Args: - X (pandas.DataFrame or ndarray): + X (pandas.DataFrame or numpy.ndarray): Inputs to the pipeline. - y (pandas.Series or ndarray): + y (pandas.Series, numpy.ndarray or list): Target values. test_size (float): The proportion of the dataset to include in the test dataset. @@ -243,15 +239,17 @@ def train_test_split(self, X, y, test_size, shuffle): list: List containing the train-test split of the inputs and targets. """ - return self.modeler.train_test_split(X, y, test_size, shuffle) + return self._modeler.train_test_split(X, y, test_size, shuffle) - def fit(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False): + def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series, list], + tune: bool = False, max_evals: int = 10, scoring: str = None, + verbose: bool = False) -> None: """Train the cardea pipeline. Args: - X (pandas.DataFrame or ndarray): + X (pandas.DataFrame or numpy.ndarray): Inputs to the pipeline. - y (pandas.Series ndarray): + y (pandas.Series, numpy.ndarray or list): Target values. tune (bool): Whether to optimize hyper-parameters of the pipelines. @@ -262,28 +260,31 @@ def fit(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False): verbose (bool): Whether to log information during processing. 
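+
+        Example:
+            A minimal sketch, assuming ``X_train`` and ``y_train`` come from
+            ``train_test_split`` as in the quickstart::
+
+                cardea.fit(X_train, y_train, tune=True, max_evals=10)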
""" - self.modeler.fit(X, y, tune, max_evals, scoring, verbose) + self._modeler.fit(X, y, tune, max_evals, scoring, verbose) - def predict(self, X): + def predict(self, X: Union[str, np.ndarray, pd.DataFrame]) -> Union[np.ndarray, list]: """Get predictions from the cardea pipeline. Args: - X (pandas.DataFrame or ndarray): - Inputs to the pipeline. + X (str, pandas.DataFrame or ndarray): + Inputs to the pipeline. If string, it points to the data path. Returns: - ndarray: + numpy.ndarray or list: Predictions to the input data. """ - return self.modeler.predict(X) + return self._modeler.predict(X) - def fit_predict(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False): + def fit_predict(self, X: Union[np.ndarray, pd.DataFrame], + y: Union[np.ndarray, pd.Series, list], tune: bool = False, + max_evals: int = 10, scoring: str = None, + verbose: bool = False) -> Union[np.ndarray, list]: """Train a cardea pipeline then make predictions. Args: - X (pandas.DataFrame or ndarray): + X (pandas.DataFrame or numpy.ndarray): Inputs to the pipeline. - y (pandas.Series or ndarray): + y (pandas.Series, numpy.ndarray or list): Target values. tune (bool): Whether to optimize hyper-parameters of the pipelines. @@ -298,21 +299,26 @@ def fit_predict(self, X, y, tune=False, max_evals=10, scoring=None, verbose=Fals ndarray: Predictions to the input data. """ - return self.modeler.fit_predict(X, y, tune, max_evals, scoring, verbose) + return self._modeler.fit_predict(X, y, tune, max_evals, scoring, verbose) - def evaluate(self, X, y, test_size=0.2, shuffle=True, tune=False, max_evals=10, scoring=None, - metrics=None, verbose=False): + def evaluate(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series, list], + test_size: float = 0.2, shuffle: bool = True, fit: bool = False, + tune: bool = False, max_evals: int = 10, scoring: str = None, + metrics: List[str] = DEFAULT_METRICS, verbose: bool = False) -> pd.Series: """Evaluate the cardea pipeline. Args: - X (pandas.DataFrame or ndarray): + X (pandas.DataFrame or numpy.ndarray): Inputs to the pipeline. - y (pandas.Series or ndarray): + y (pandas.Series, numpy.ndarray or list): Target values. test_size (float): The proportion of the dataset to include in the test dataset. shuffle (bool): Whether or not to shuffle the data before splitting. + fit (bool): + Whether to fit the pipeline before evaluating it. + Defaults to ``False``. tune (bool): Whether to optimize hyper-parameters of the pipelines. max_evals (int): @@ -324,11 +330,30 @@ def evaluate(self, X, y, test_size=0.2, shuffle=True, tune=False, max_evals=10, with the problem type. verbose (bool): Whether to log information during processing. + + Returns: + Series: + ``pandas.Series`` containing one element for each + metric applied, with the metric name as index. """ - return self.modeler.evaluate( - X, y, test_size, shuffle, tune, max_evals, scoring, metrics, verbose) + if fit: + X_train, X_test, y_train, y_test = self.train_test_split( + X, y, test_size=test_size, shuffle=shuffle) + + self._modeler.fit( + X_train, y_train, tune=tune, max_evals=max_evals, scoring=scoring, verbose=verbose) + + else: + X_test, y_test = X, y + + scores = { + metric: self._modeler.test(X_test, y_test, scoring=metric) + for metric in metrics + } + + return pd.Series(scores) - def save(self, path): + def save(self, path: str): """Save this object using pickle. Args: @@ -342,7 +367,7 @@ def save(self, path): @classmethod def load(cls, path: str): - """Load an Orion instance from a pickle file. 
+ """Load a Cardea instance from a pickle file. Args: path (str): diff --git a/cardea/data.py b/cardea/data.py new file mode 100644 index 00000000..eb04677f --- /dev/null +++ b/cardea/data.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- + +""" +Data Management module. +This module contains functions that allow downloading demo data from Amazon S3 +The demo data is a modified version of the missed appointment data found here: +https://www.kaggle.com/joniarroba/noshowappointments +Another demo data is also available for mimic dataset: +https://physionet.org/files/mimiciii-demo/1.4/ +""" + +import logging +import os +from io import BytesIO +from urllib.request import urlopen +from zipfile import ZipFile + +import pandas as pd + +LOGGER = logging.getLogger(__name__) + +DATA_PATH = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'data' +) +BUCKET = 'dai-cardea' +S3_URL = 'https://{}.s3.amazonaws.com/{}' + +DEMO_DATA = ("kaggle", "mimic") + + +def download(name, data_path=DATA_PATH): + """Download demo data with the given name from S3. + + If the data has never been loaded before, it will be downloaded + from the [dai-cardea bucket](https://dai-cardea.s3.amazonaws.com) or + the S3 bucket specified following the `s3://{bucket}/path/to/the.csv` format, + and then cached inside the `data` folder, within the `cardea` package + directory, and then returned. + + Otherwise, if it has been downloaded and cached before, it will be directly + loaded from the `cardea/data` folder without contacting S3. + + Args: + name (str): Name of demo data + + Returns: + str: + path to the downloaded data + """ + if name not in DEMO_DATA: + raise KeyError("unknown demo data {}".format(name)) + + data_path = os.path.join(data_path, name) + + if not os.path.exists(data_path): + os.makedirs(data_path, exist_ok=True) + url = S3_URL.format(BUCKET, '{}.zip'.format(name)) + compressed = ZipFile(BytesIO(urlopen(url).read())) + + LOGGER.info('Downloading dataset %s from %s', name, url) + + for file in compressed.namelist(): + filename = os.path.join(data_path, file) + csv_file = compressed.open(file, 'r') + + data = pd.read_csv(csv_file, dtype=str, encoding="utf-8") + data.to_csv(filename, index=False) + + return data_path diff --git a/cardea/data_assembling/__init__.py b/cardea/data_assembling/__init__.py new file mode 100644 index 00000000..3e88baef --- /dev/null +++ b/cardea/data_assembling/__init__.py @@ -0,0 +1,5 @@ +# import logging + +from cardea.data_assembling.data_loader import DataLoader, Diamond +from cardea.data_assembling.entityset_loader import EntitySetLoader +from cardea.data_assembling.load_mimic import load_mimic_data diff --git a/cardea/data_loader/data_loader.py b/cardea/data_assembling/data_loader.py similarity index 100% rename from cardea/data_loader/data_loader.py rename to cardea/data_assembling/data_loader.py diff --git a/cardea/data_loader/entityset_loader.py b/cardea/data_assembling/entityset_loader.py similarity index 89% rename from cardea/data_loader/entityset_loader.py rename to cardea/data_assembling/entityset_loader.py index f0e51a05..1db4cc99 100644 --- a/cardea/data_loader/entityset_loader.py +++ b/cardea/data_assembling/entityset_loader.py @@ -3,7 +3,7 @@ import featuretools as ft import pandas as pd -from cardea.data_loader import DataLoader, Diamond +from cardea.data_assembling import DataLoader, Diamond class EntitySetLoader(DataLoader): @@ -15,16 +15,15 @@ def create_entity(self, fhir, identifiers, entity_set): """Creates an entity from fhir dataframes and add it to 
entityset. Args: - fhir (dict): - A dictionary of fhir class dataframes. - entity_set (featuretools.EntitySet): - The global entityset that the entity will be added to. + fhir: A dictionary of fhir class dataframes. + entity_set: The global entityset that the entity will be added to. """ for object_name, df in fhir.items(): id = identifiers[object_name] df = df.apply(pd.to_numeric, errors='ignore') + df.columns = map(str.lower, df.columns) if object_name == 'Period': entity_set.entity_from_dataframe(entity_id=str(object_name), @@ -60,12 +59,10 @@ def load_data_entityset(self, folder_path): Loads .csv files into pandas dataframes then loads them into featuretools' entityset. Args: - folder_path (dict): - A directory of all .csv files that should be loaded. + folder_path: A directory of all .csv files that should be loaded. Returns: - featuretools.EntitySet: - An entityset with loaded data. + An entityset with loaded data. """ fhir = self.read_csv_files(folder_path=folder_path) diff --git a/cardea/data_loader/load_mimic.py b/cardea/data_assembling/load_mimic.py similarity index 88% rename from cardea/data_loader/load_mimic.py rename to cardea/data_assembling/load_mimic.py index a9caf5b7..2364b3e6 100644 --- a/cardea/data_loader/load_mimic.py +++ b/cardea/data_assembling/load_mimic.py @@ -36,7 +36,7 @@ def get_table_properties(name): if a_type == 'timestamp': arr_time.append(column) - types[column.lower()] = d_type + types[column.upper()] = d_type return types, prim_key, arr_time @@ -80,29 +80,27 @@ def load_mimic_data(path=None, subset=None): """Returns an entityset loaded with the dataframes in the received path. Args: - path (str): - The folder path that contains the data. - subset (str): - List of tables to include. + path: The path of the data. + subset: List of tables to include. Returns: - featuretools.EntitySet: - An entityset with loaded data. + An entityset with loaded data. 
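+
+    Example:
+        A minimal sketch with a hypothetical folder; since files are matched
+        with ``glob(path + '*.csv')``, the path must end with a separator::
+
+            es = load_mimic_data('mimic-demo/', subset=['admissions', 'patients'])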
""" + es = ft.EntitySet(id="mimic") relationships = [] global_tables = [] - files = glob(path + '/*.csv') + files = glob(path + '*.csv') for tag in root.findall('tables/table'): table = tag.get('name') - file = os.path.join(path, table.upper() + '.csv') + file = table.upper() + '.csv' if subset and table not in subset: continue - if file in files: + if (path + file) in files: # table name global_tables.append(table) @@ -113,8 +111,7 @@ def load_mimic_data(path=None, subset=None): prop, key, arr_time = get_table_properties(table) # load table into a dataframe - df = pd.read_csv(file, dtype=prop, date_parser=pd.to_datetime) - + df = pd.read_csv(path + file, dtype=prop, date_parser=pd.to_datetime) df.columns = [column.lower() for column in df.columns] # check if arr_time should be None (no time index) diff --git a/cardea/data_loader/schema.xml b/cardea/data_assembling/schema.xml similarity index 100% rename from cardea/data_loader/schema.xml rename to cardea/data_assembling/schema.xml diff --git a/cardea/data_labeling/__init__.py b/cardea/data_labeling/__init__.py new file mode 100644 index 00000000..e5a4f8b3 --- /dev/null +++ b/cardea/data_labeling/__init__.py @@ -0,0 +1,7 @@ +# import logging +from cardea.data_labeling.appointment_no_show import appointment_no_show +from cardea.data_labeling.data_labeler import DataLabeler +from cardea.data_labeling.diagnosis import diagnosis_prediction +from cardea.data_labeling.length_of_stay import length_of_stay +from cardea.data_labeling.mortality import mortality_prediction +from cardea.data_labeling.readmission import readmission diff --git a/cardea/data_labeling/appointment_no_show.py b/cardea/data_labeling/appointment_no_show.py new file mode 100644 index 00000000..45e0439e --- /dev/null +++ b/cardea/data_labeling/appointment_no_show.py @@ -0,0 +1,26 @@ + + +from cardea.data_labeling import utils + + +def appointment_no_show(es): + """Defines the labeling task of appointment no show. + """ + def label(ds, **kwargs): + return True if 'noshow' in ds["status"].values else False + + if es.id == 'mimic': + raise ValueError("Problem not supported for MIMIC data.") + + meta = { + "entity": "Appointment", + "target_entity": "identifier", + "time_index": "created", + "type": "classification", + "num_examples_per_instance": 1, + "ignore_variables": {'Appointment': ['status']} + } + + df = utils.denormalize(es, entities=['Appointment']) + + return label, df, meta diff --git a/cardea/data_labeling/data_labeler.py b/cardea/data_labeling/data_labeler.py new file mode 100644 index 00000000..0636cc54 --- /dev/null +++ b/cardea/data_labeling/data_labeler.py @@ -0,0 +1,61 @@ +import composeml as cp + +from cardea.data_labeling.utils import _get_arguments + + +class DataLabeler: + """Class that defines the prediction problem. + + This class supports the generation of `label_times` which + is fundamental to the feature generation phase as well + as specifying the target labels. + + Args: + function (method): + function that defines the labeling function, it should return a + tuple of labeling function, the dataframe, and the name of the + target entity. + """ + + def __init__(self, function): + self.function = function + + def generate_label_times(self, es, subset=None, verbose=False, **kwargs): + """Searches the data to calculate label times. + + Args: + es (featuretools.EntitySet): + Entityset to extract `label_times` from. + subset (float or int): + Portion of the data to select for searching. + verbose: + An indicator to the verbosity of searching. 
+
+        Returns:
+            composeml.LabelTimes:
+                Calculated labels with cutoff times.
+        """
+        labeling_function, df, meta = self.function(es)
+
+        data = df
+        if isinstance(subset, float):
+            # a fraction of the data
+            data = data.sample(frac=subset)
+        elif isinstance(subset, int):
+            # a fixed number of instances
+            data = data.sample(n=subset)
+
+        target_entity = meta.get('target_entity')
+        time_index = meta.get('time_index')
+        window_size = meta.get('window_size')
+        thresh = meta.get('thresh')
+        pred_type = meta.get('type')
+        label_maker = cp.LabelMaker(labeling_function=labeling_function,
+                                    target_entity=target_entity,
+                                    time_index=time_index,
+                                    window_size=window_size)
+
+        kwargs = {**meta, **kwargs}
+        kwargs = _get_arguments(kwargs, label_maker.search)
+        label_times = label_maker.search(data.sort_values(time_index),
+                                         verbose=verbose, **kwargs)
+        if thresh is not None:
+            label_times = label_times.threshold(thresh)
+
+        return label_times, pred_type, meta
diff --git a/cardea/data_labeling/diagnosis.py b/cardea/data_labeling/diagnosis.py
new file mode 100644
index 00000000..842c90a0
--- /dev/null
+++ b/cardea/data_labeling/diagnosis.py
@@ -0,0 +1,42 @@
+
+from cardea.data_labeling import utils
+
+MIMIC_META = {
+    'entity': 'admissions',
+    'target_entity': 'hadm_id',
+    'time_index': 'admittime',
+}
+
+FHIR_META = {
+    'entity': 'encounter',
+    'target_entity': 'identifier',
+    'time_index': 'start',
+}
+
+
+def diagnosis_prediction(es, diag):
+    """Defines the labeling task of diagnosis prediction.
+
+    Predict whether the patient will be diagnosed with the
+    specified diagnosis code ``diag``.
+    """
+    def label(ds, **kwargs):
+        return True if diag in ds[column].values else False
+
+    if es.id == 'mimic':
+        meta = MIMIC_META
+        entities = ['admissions']
+        column = 'diagnosis'
+
+    else:
+        meta = FHIR_META
+        entities = ['encounter', 'encounter_diagnosis', 'condition',
+                    'codeableconcept', 'coding', 'period']
+        column = 'code'
+
+    meta['type'] = 'classification'
+    meta['num_examples_per_instance'] = 1
+
+    df = utils.denormalize(es, entities=entities)
+
+    return label, df, meta
diff --git a/cardea/data_labeling/length_of_stay.py b/cardea/data_labeling/length_of_stay.py
new file mode 100644
index 00000000..a0153ab4
--- /dev/null
+++ b/cardea/data_labeling/length_of_stay.py
@@ -0,0 +1,53 @@
+import pandas as pd
+
+from cardea.data_labeling import utils
+
+MIMIC_META = {
+    'entity': 'admissions',
+    'target_entity': 'hadm_id',
+    'time_index': 'admittime',
+}
+
+FHIR_META = {
+    'entity': 'encounter',
+    'target_entity': 'identifier',
+    'time_index': 'start',
+}
+
+
+def length_of_stay(es, k=None):
+    """Defines the labeling task of length of stay.
+
+    Predict how many days the patient will be in the hospital. For
+    a classification version of the problem, specify k.
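+
+    Example:
+        A minimal sketch, assuming ``es`` is a loaded entityset; ``k=7`` is a
+        hypothetical threshold that turns the task into classifying stays
+        longer than a week::
+
+            label, df, meta = length_of_stay(es, k=7)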
+    """
+    def label(ds, **kwargs):
+        return (ds['los'].dt.days).sum()
+
+    if es.id == 'mimic':
+        meta = MIMIC_META
+        entities = ['admissions']
+        start = 'admittime'
+        end = 'dischtime'
+
+    elif es.id == 'fhir':
+        meta = FHIR_META
+        entities = ['Encounter', 'Period']
+        start = 'start'
+        end = 'end'
+
+    meta['type'] = 'regression'
+    meta['num_examples_per_instance'] = 1
+
+    if k:
+        meta['type'] = 'classification'
+        meta['thresh'] = k
+
+    df = utils.denormalize(es, entities=entities)
+
+    # generate label
+    df[end] = pd.to_datetime(df[end])
+    df[start] = pd.to_datetime(df[start])
+    df['los'] = df[end] - df[start]
+
+    return label, df, meta
diff --git a/cardea/data_labeling/mortality.py b/cardea/data_labeling/mortality.py
new file mode 100644
index 00000000..1d02fb72
--- /dev/null
+++ b/cardea/data_labeling/mortality.py
@@ -0,0 +1,52 @@
+
+from cardea.data_labeling import utils
+
+MIMIC_META = {
+    'entity': 'admissions',
+    'target_entity': 'hadm_id',
+    'time_index': 'admittime',
+    'ignore_variables': {'admissions': [
+        'hospital_expire_flag',
+        'deathtime',
+        'discharge_location',
+        'dischtime'],
+        'patients': ['expire_flag'],
+        'callout': ['discharge_wardid']}
+}
+
+FHIR_META = {
+    'entity': 'encounter',
+    'target_entity': 'identifier',
+    'time_index': 'start',
+}
+
+
+def mortality_prediction(es):
+    """Defines the labeling task of mortality prediction.
+
+    Predict patient mortality from the point of admission.
+    """
+    def label(ds, **kwargs):
+        return ds['hospital_expire_flag'].sum() > 0
+
+    if es.id == 'mimic':
+        meta = MIMIC_META
+        entities = ['admissions']
+
+    else:
+        meta = FHIR_META
+        entities = ['encounter', 'encounter_diagnosis', 'condition',
+                    'codeableconcept', 'coding', 'period']
+
+    meta['type'] = 'classification'
+    meta['num_examples_per_instance'] = 1
+
+    df = utils.denormalize(es, entities=entities)
+
+    # generate label
+    if es.id == 'fhir':
+        causes_of_death = ['X60', 'X84', 'Y87.0', 'X85', 'Y09', 'Y87.1',
+                           'V02', 'V04', 'V09.0', 'V09.2', 'V12', 'V14']
+
+        df['hospital_expire_flag'] = df['code'].isin(causes_of_death).astype(int)
+
+    return label, df, meta
diff --git a/cardea/data_labeling/readmission.py b/cardea/data_labeling/readmission.py
new file mode 100644
index 00000000..2090f9a2
--- /dev/null
+++ b/cardea/data_labeling/readmission.py
@@ -0,0 +1,54 @@
+import pandas as pd
+
+from cardea.data_labeling import utils
+
+MIMIC_META = {
+    'entity': 'admissions',
+    'target_entity': 'subject_id',
+    'time_index': 'dischtime'
+}
+
+FHIR_META = {
+    'entity': 'encounter',
+    'target_entity': 'subject',
+    'time_index': 'end'
+}
+
+
+def readmission(es, k=30):
+    """Defines the labeling task of readmission.
+
+    Predict whether or not the patient will get readmitted
+    into the hospital. You can specify the number of days
+    between one visit and another using ``k``.
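+
+    Example:
+        A minimal sketch, assuming ``es`` is a loaded entityset; ``k=30``
+        labels a readmission that happens within 30 days::
+
+            label, df, meta = readmission(es, k=30)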
+ """ + def label(ds, **kwargs): + if len(ds) < 2: + return 0 + initial_discharge = min(ds.index) + second_admission = sorted(ds[start])[1] + return (second_admission - initial_discharge).days + + if es.id == 'mimic': + meta = MIMIC_META + entities = ['admissions'] + start = 'admittime' + end = 'dischtime' + + else: + meta = FHIR_META + entities = ['encounter', 'period'] + start = 'start' + end = 'end' + + meta['type'] = 'classification' + meta['thresh'] = k + meta['num_examples_per_instance'] = 2 + meta['window_size'] = 2 + + df = utils.denormalize(es, entities=entities) + + # generate label + df[end] = pd.to_datetime(df[end]) + df[start] = pd.to_datetime(df[start]) + + return label, df, meta diff --git a/cardea/data_labeling/utils.py b/cardea/data_labeling/utils.py new file mode 100644 index 00000000..f93023d2 --- /dev/null +++ b/cardea/data_labeling/utils.py @@ -0,0 +1,60 @@ +from inspect import getfullargspec + +import pandas as pd + + +def _get_arguments(arguments, function): + function_arguments = set(getfullargspec(function)[0]) + return {k: arguments.get(k) for k in function_arguments if arguments.get(k) is not None} + + +def _search_relationship(es, left, right): + for r in es.relationships: + if r.parent_entity.id in left: + if right == r.child_entity.id: + left_on = r.parent_variable.id + right_on = r.child_variable.id + + elif r.child_entity.id in left: + if right == r.parent_entity.id: + left_on = r.child_variable.id + right_on = r.parent_variable.id + + return left_on, right_on + + +def denormalize(es, entities): + """Merge a set of entities into a single dataframe. + + Convert a set of entities from the entityset into a single + dataframe by repetitively merging the selected entities. The + merge process is applied sequentially. + + Args: + entities (list): + list of strings denoting which entities to merge. + + Returns: + pandas.DataFrame: + A single dataframe containing all the information from the + selected entities. 
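+
+    Example:
+        A minimal sketch; the entity names are schema dependent and assumed
+        here to exist in ``es``::
+
+            df = denormalize(es, entities=['encounter', 'period'])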
+    """
+    k = len(entities)
+
+    # initial entity to start from (should be the target entity)
+    first = entities[0]
+    previous = [first]
+    df = es[first].df
+
+    # merge the dataframes to create a single input
+    for i in range(1, k):
+        right = entities[i]
+
+        left_on, right_on = _search_relationship(es, previous, right)
+        df = pd.merge(df, es[right].df,
+                      left_on=left_on, right_on=right_on,
+                      how='left', suffixes=('', '_y')).filter(regex='^(?!.*_y)')
+
+        previous.append(right)
+
+    return df
diff --git a/cardea/data_loader/__init__.py b/cardea/data_loader/__init__.py
deleted file mode 100644
index b8a48075..00000000
--- a/cardea/data_loader/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-"""Data loader module."""
-
-from cardea.data_loader.data_loader import DataLoader, Diamond
-from cardea.data_loader.entityset_loader import EntitySetLoader
-from cardea.data_loader.load_mimic import load_mimic_data
-
-__all__ = (
-    "DataLoader",
-    "EntitySetLoader",
-    "load_mimic_data"
-)
diff --git a/cardea/featurization/__init__.py b/cardea/featurization/__init__.py
deleted file mode 100644
index 85a20358..00000000
--- a/cardea/featurization/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from cardea.featurization.featurization import Featurization
-
-__all__ = (
-    "Featurization"
-)
diff --git a/cardea/featurizing/__init__.py b/cardea/featurizing/__init__.py
new file mode 100644
index 00000000..3a501a37
--- /dev/null
+++ b/cardea/featurizing/__init__.py
@@ -0,0 +1,7 @@
+# -*- coding: utf-8 -*-
+
+from cardea.featurizing.featurization import Featurization
+
+__all__ = (
+    "Featurization"
+)
diff --git a/cardea/featurization/featurization.py b/cardea/featurizing/featurization.py
similarity index 100%
rename from cardea/featurization/featurization.py
rename to cardea/featurizing/featurization.py
diff --git a/cardea/functional.py b/cardea/functional.py
new file mode 100644
index 00000000..94ec9153
--- /dev/null
+++ b/cardea/functional.py
@@ -0,0 +1,150 @@
+"""Cardea Functional API.
+
+This module provides a collection of simple Python functions that
+allow using Cardea in as few steps as possible. The
+API is oriented around various prediction problems.
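+
+For example, a single call can load, label, featurize and fit a no-show
+model (a minimal sketch; the data path is hypothetical)::
+
+    from cardea.functional import model_appnoshow
+
+    cardea = model_appnoshow('path/to/kaggle')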
+""" +import logging +from typing import List, Union + +import pandas as pd +from mlblocks import MLPipeline + +from cardea import Cardea +from cardea.core import DEFAULT_METRICS, DEFAULT_PIPELINE +from cardea.data_labeling import appointment_no_show + +LOGGER = logging.getLogger(__name__) + + +def _run(cls, labeler, max_depth, max_features, n_jobs, test_size, shuffle, tune, max_evals, + scoring, evaluate, metrics, return_lt, return_fm, return_pred, verbose): + output = dict() + # labeling + label_times = cls.label(labeler, verbose=verbose) + if return_lt: + output['label_times'] = label_times + + # featurizing + fm = cls.featurize(label_times, max_depth=max_depth, max_features=max_features, + n_jobs=n_jobs, verbose=verbose) + if return_fm: + output['feature_matrix'] = fm + + # modeling + y = fm.pop('label').values + X = fm.values + X_train, X_test, y_train, y_test = cls.train_test_split( + X, y, test_size=test_size, shuffle=shuffle) + + if test_size == 0.: + LOGGER.info("Setting test data equal to train data") + X_test, y_test = X_train, y_train + + cls.fit(X_train, y_train, tune=tune, max_evals=max_evals, scoring=scoring, verbose=verbose) + + if return_pred: + y_pred = cls.predict(X_test) + output['prediction'] = y_pred + + if evaluate: + result = cls.evaluate(X=X_test, y=y_test, fit=False, metrics=metrics) + output['evaluate'] = result + + if len(output) > 0: + return output + + return None + + +def model_appnoshow(data_path: str, fhir: bool = False, + pipeline: Union[str, dict, MLPipeline] = DEFAULT_PIPELINE, + hyperparameters: Union[str, pd.DataFrame] = None, max_depth: int = 1, + max_features: int = -1, n_jobs: int = 1, test_size: float = 0.2, + shuffle: bool = True, tune: bool = False, max_evals: int = 10, + scoring: str = None, evaluate: bool = False, + metrics: List[str] = DEFAULT_METRICS, return_lt: bool = False, + return_fm: bool = False, return_pred: bool = False, verbose: bool = False, + save_path: str = None) -> Cardea: + """Create and train an appointment no show cardea instance. + + Return a cardea class object that has been trained on the given + dataset. The function loads the data, extracts label times, generates + features, then trains the pipeline all in one command. + + Args: + data_path (str): + A directory of all .csv files that should be loaded. + fhir (bool): + An indicator whether FHIR or MIMIC schema is used. + pipeline (str or MLPipeline or dict): + Pipeline to use. It can be passed as: + * An ``str`` with a path to a JSON file. + * An ``str`` with the name of a registered pipeline. + * An ``str`` with the path to a pickle file. + * An ``MLPipeline`` instance. + * A ``dict`` with an ``MLPipeline`` specification. + hyperparameters (str or dict): + Hyperparameters to set to the pipeline. It can be passed as + a hyperparameters ``dict`` in the ``mlblocks`` format or as + a path to the corresponding JSON file. Defaults to ``None``. + max_depth (int): + Maximum allowed depth of features. + max_features (int): + Cap to the number of generated features. If -1, no limit. + n_jobs (int): + Number of parallel processes to use when calculating the + feature matrix. + test_size (float): + The proportion of the dataset to include in the test dataset. + shuffle (bool): + Whether or not to shuffle the data before splitting. + tune (bool): + Whether to optimize hyper-parameters of the pipelines. + max_evals (int): + Maximum number of hyper-parameter optimization iterations. + scoring (str): + The name of the scoring function used in the hyper-parameter + optimization. 
+ evaluate (bool): + Whether to evaluate the performance of the pipeline. If True, + we evaluate the performance on the test data, if not given, + evaluate on train data. + metrics (list): + A list of scoring function names. The scoring functions should + be consistent with the problem type. + return_lt (bool): + Whether to return ``label_times``. + return_fm (bool): + Whether to return the calculated feature matrix. + return_pred (bool): + Whether to return the predictions of the pipeline. + verbose (bool): + Whether to log information during processing. + save_path (str): + Path to the file where the fitted pipeline will be stored + using ``pickle``. + + Returns: + Cardea: + A fitted Cardea instance. + """ + + cardea = Cardea(data_path=data_path, + fhir=fhir, + pipeline=pipeline, + hyperparameters=hyperparameters) + + # define labeler + labeler = appointment_no_show + output = _run(cardea, labeler, max_depth, max_features, n_jobs, test_size, shuffle, tune, + max_evals, scoring, evaluate, metrics, return_lt, return_fm, return_pred, + verbose) + + if save_path: + cardea.save(save_path) + + if len(output) > 0: + return cardea, output + + return cardea diff --git a/cardea/problem_definition/__init__.py b/cardea/problem_definition/__init__.py deleted file mode 100644 index 9e39a925..00000000 --- a/cardea/problem_definition/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# import logging -from cardea.problem_definition.definition import ProblemDefinition -from cardea.problem_definition.length_of_stay import LengthOfStay -from cardea.problem_definition.mortality_prediction import MortalityPrediction -from cardea.problem_definition.predicting_diagnosis import DiagnosisPrediction -from cardea.problem_definition.prolonged_length_of_stay import ProlongedLengthOfStay -from cardea.problem_definition.readmission import Readmission -from cardea.problem_definition.show_noshow_appointment import MissedAppointment - -__all__ = ( - "ProblemDefinition", - "LengthOfStay", - "MortalityPrediction", - "DiagnosisPrediction", - "ProlongedLengthOfStay", - "Readmission", - "MissedAppointment" -) diff --git a/cardea/problem_definition/definition.py b/cardea/problem_definition/definition.py deleted file mode 100644 index df82b0e3..00000000 --- a/cardea/problem_definition/definition.py +++ /dev/null @@ -1,247 +0,0 @@ -import pandas as pd - -from cardea.data_loader import DataLoader - - -class ProblemDefinition: - """A class that defines the prediction problem - by specifying cutoff times and generating the target label if it does not exist. - """ - - def check_target_label(self, entity_set, target_entity, target_label): - """Checks if target label exists in the entity set. - - Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. - - Returns: - True if the target label exists. - """ - return DataLoader().check_column_existence(entity_set, target_entity, target_label) - - def check_for_missing_values_in_target_label( - self, entity_set, target_entity, target_label_column_name): - """Checks if there is a missing value in the target label. - - Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. - - Returns: - False is the target label does not contain a missing value. 
- """ - return DataLoader().check_for_missing_values(entity_set, - target_entity, - target_label_column_name) - - def generate_target_label(self, entity_set, target_entity, target_label): - """Generates target labels if the entityset is missing labels. - - Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. - - Returns: - Target entity with the generated label. - """ - - def generate_cutoff_times(self, entity_set): - """Generates cutoff times for the predection problem. - - Args: - entity_set: fhir entityset. - - Returns: - entity_set, target_entity, series of target_labels and a dataframe of cutoff_times. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - def unify_cutoff_times_hours_admission_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared time. - - Args: - df: cutoff_entity dataframe. - """ - df = df.sort_values(by=[cutoff_time_label]) - df = df.reset_index() - - for i in df.index: - - if i == 0: - - if df.at[i, 'checked'] is not True: - df.at[i, 'ct'] = df.at[i, cutoff_time_label] - df.at[i, 'checked'] = True - - elif df.at[i, 'checked'] is not True: - - ct_val1 = df.at[i - 1, 'ct'] - end_val1 = df.at[i - 1, 'end'] - start_val2 = df.at[i, cutoff_time_label] - df.at[i, 'end'] - - if ct_val1 < start_val2 < end_val1: - df.at[i - 1, 'ct'] = start_val2 - df.at[i, 'ct'] = start_val2 - df.at[i, 'checked'] = True - - else: - df.at[i, 'ct'] = df.at[i, cutoff_time_label] - df.at[i, 'checked'] = True - - if i + 1 == len(df): - break - return df - - def unify_cutoff_times_days_admission_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared days. - - Args: - df: cutoff_entity dataframe. - """ - - frames = [] - for d in set(df['date']): - sub_day = df[df['date'] == d] - - sub_duration_greater = sub_day[sub_day['duration'] > 0] - sub_duration_less = sub_day[sub_day['duration'] <= 0] - frames.append(sub_duration_less) - sub_duration_greater = sub_duration_greater.sort_values(by=[cutoff_time_label]) - if len(sub_duration_greater) != 0: - final_date = sub_duration_greater.iloc[-1][cutoff_time_label] - - for i in sub_duration_greater.index: - sub_duration_greater.at[i, 'ct'] = final_date - sub_duration_greater.at[i, 'checked'] = True - - frames.append(sub_duration_greater) - - for i in sub_duration_less.index: - sub_duration_less.at[i, 'ct'] = pd.NaT - sub_duration_less.at[i, 'checked'] = False - - frames.append(sub_duration_less) - - result = pd.concat(frames) - result = result.drop_duplicates() - result[cutoff_time_label] = pd.to_datetime(result.start) - result = result.sort_values(by=[cutoff_time_label]) - result = result.reset_index() - return result - - def unify_cutoff_time_admission_time(self, es, cutoff_entity, cutoff_time_label): - """Process records in the entity that contains cutoff times - based on shared days and time. - - Args: - es: fhir entityset. 
- - Returns: - processed entity - """ - - df = es[cutoff_entity].df - df[cutoff_time_label] = pd.to_datetime(df[cutoff_time_label]) - df['end'] = pd.to_datetime(df['end']) - duration = (df['end'] - df[cutoff_time_label]).dt.days - duration = duration.tolist() - df['duration'] = duration - df['date'] = df[cutoff_time_label].dt.date - df['ct'] = '' - df['checked'] = False - result1 = self.unify_cutoff_times_days_admission_time(df, cutoff_time_label) - result = self.unify_cutoff_times_hours_admission_time(result1, cutoff_time_label) - if 'level_0' in result.columns: - result = result.drop(columns=['level_0']) - return result - - def unify_cutoff_times_days_discharge_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared days. - - Args: - df: cutoff_entity dataframe. - """ - - frames = [] - for d in set(df['end_date']): - sub_day = df[df['end_date'] == d] - - sub_duration_greater = sub_day[sub_day['duration'] > 0] - sub_duration_less = sub_day[sub_day['duration'] <= 0] - frames.append(sub_duration_less) - sub_duration_greater = sub_duration_greater.sort_values(by=[cutoff_time_label]) - if len(sub_duration_greater) != 0: - first_date = sub_duration_greater.iloc[0][cutoff_time_label] - - for i in sub_duration_greater.index: - sub_duration_greater.at[i, 'ct'] = first_date - sub_duration_greater.at[i, 'checked'] = True - frames.append(sub_duration_greater) - - for i in sub_duration_less.index: - sub_duration_less.at[i, 'ct'] = pd.NaT - sub_duration_less.at[i, 'checked'] = False - frames.append(sub_duration_less) - - result = pd.concat(frames) - result = result.drop_duplicates() - result[cutoff_time_label] = pd.to_datetime(result.end) - result = result.reset_index() - return result - - def unify_cutoff_times_hours_discharge_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared time. - - Args: - df: cutoff_entity dataframe. - """ - - frames = [] - for d in set(df['end_date']): - sub_day = df[df['end_date'] == d] - for h in set(sub_day['hour']): - sub_hour = sub_day[sub_day['hour'] == h] - sub_hour = sub_hour.sort_values(by=[cutoff_time_label]) - if len(sub_hour) != 0: - first_date = sub_hour.iloc[0][cutoff_time_label] - for i in sub_hour.index: - sub_hour.at[i, 'ct'] = first_date - sub_hour.at[i, 'checked'] = True - - frames.append(sub_hour) - - result = pd.concat(frames) - result = result.drop_duplicates() - return result - - def unify_cutoff_time_discharge_time(self, es, cutoff_entity, cutoff_time_label): - """Process records in the entity that contains cutoff times - based on shared days and time. - - Args: - es: fhir entityset. 
- - Returns: - processed entity - """ - - df = es[cutoff_entity].df - df['end_date'] = df[cutoff_time_label].dt.date - df['hour'] = df.end.apply(lambda x: x.hour) - duration = (df[cutoff_time_label] - df['start']).dt.days - duration = duration.tolist() - df['duration'] = duration - df['ct'] = '' - df['checked'] = False - result1 = self.unify_cutoff_times_days_discharge_time(df, cutoff_time_label) - result = self.unify_cutoff_times_hours_discharge_time(result1, cutoff_time_label) - if 'level_0' in result.columns: - result = result.drop(columns=['level_0']) - return result diff --git a/cardea/problem_definition/length_of_stay.py b/cardea/problem_definition/length_of_stay.py deleted file mode 100644 index 834c461c..00000000 --- a/cardea/problem_definition/length_of_stay.py +++ /dev/null @@ -1,150 +0,0 @@ -import featuretools as ft -import pandas as pd - -from cardea.data_loader import DataLoader as DL -from cardea.problem_definition import ProblemDefinition - - -class LengthOfStay (ProblemDefinition): - """Defines the problem of length of stay, predicting how many days - the patient will be in the hospital. - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. - """ - - __name__ = 'los' - - updated_es = None - target_label_column_name = 'length' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - conn = 'period' - prediction_type = 'regression' - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. 
- """ - - if (self.check_target_label(es, - self.target_entity, - self.target_label_column_name) and not - self.check_for_missing_values_in_target_label(es, - self.target_entity, - self.target_label_column_name)): - if DL().check_column_existence(es, - self.cutoff_entity, - self.cutoff_time_label): - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - - cutoff_times['label'] = list( - es[self.target_entity].df[self.target_label_column_name]) - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - else: - updated_es = self.generate_target_label(es) - return self.generate_cutoff_times(updated_es) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. - """ - - generate_from = 'Period' - start = self.cutoff_time_label - end = 'end' - label_name = self.target_label_column_name - - if (DL().check_column_existence(es, - generate_from, - start) and DL().check_column_existence(es, - generate_from, - end)): - if (not DL().check_for_missing_values(es, - generate_from, - start) and not - (DL().check_for_missing_values(es, - generate_from, - end))): - - es[generate_from].df[start] = pd.to_datetime( - es[generate_from].df[start]) - es[generate_from].df[end] = pd.to_datetime( - es[generate_from].df[end]) - duration = (es[generate_from].df[end] - es[generate_from].df[start]).dt.days - duration = duration.tolist() - es[self.target_entity].df[label_name] = duration - updated_target_entity = es[self.target_entity].df - duration_df = pd.DataFrame({'object_id': duration}) - - es = es.entity_from_dataframe(entity_id='Duration', - dataframe=duration_df, - index='object_id') - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=updated_target_entity, index='identifier') - new_relationship = ft.Relationship(es['Duration']['object_id'], - es[self.target_entity][label_name]) - es = es.add_relationship(new_relationship) - - return es - - else: - raise ValueError('Can not generate target label {} in table {} \ - beacuse start or end labels in table {} contain \ - missing value.'.format(label_name, - self.target_entity, - generate_from)) - - else: - raise ValueError('Can not generate target label {} in \ - table {}.'.format(label_name, - self.target_entity)) diff --git a/cardea/problem_definition/mortality_prediction.py b/cardea/problem_definition/mortality_prediction.py deleted file mode 100644 index 41b62514..00000000 --- a/cardea/problem_definition/mortality_prediction.py +++ /dev/null @@ -1,145 +0,0 @@ -import pandas as pd - -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class MortalityPrediction (ProblemDefinition): - """Defines the 
problem of diagnosis Prediction. - - Finding whether a patient will be diagnosed with a specifed diagnosis. - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. - """ - __name__ = 'mortality' - - updated_es = None - target_label_column_name = 'diagnosis' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - prediction_type = 'classification' - conn = 'period' - causes_of_death = ['X60', 'X84', 'Y87.0', 'X85', 'Y09', - 'Y87.1', 'V02', 'V04', 'V09.0', 'V09.2', 'V12', 'V14'] - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - es = self.generate_target_label(es) - - if DataLoader().check_column_existence( - es, - self.cutoff_entity, - self.cutoff_time_label): # check the existance of the cutoff label - - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - - cutoff_times['label'] = list(es[self.target_entity].df[self.target_label_column_name]) - - for (idx, row) in cutoff_times.iterrows(): - new_val = row.loc['label'] in self.causes_of_death - cutoff_times.at[idx, 'label'] = new_val - - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. 
- """ - generate_from = 'Period' - if (self.check_target_label( - es, - self.target_entity, - self.target_label_column_name)): - - if not DataLoader().check_for_missing_values(es, - self.target_entity, - self.target_label_column_name): - entity_set_df = es[self.target_entity].df - - merging_coding = pd.merge(es['Coding'].df, es['CodeableConcept'].df, - left_on='object_id', right_on='coding', how='left') - merging_condtion = pd.merge(merging_coding, es['Condition'].df, - left_on='object_id_y', right_on='code', how='left') - merging_diagnosis = pd.merge( - merging_condtion, - es['Encounter_Diagnosis'].df, - left_on='identifier', - right_on='condition', how='left') - - merging_encouter = pd.merge(merging_diagnosis, es[self.target_entity].df, - left_on='subject', right_on='identifier', how='left') - merging_encouter['target'] = merging_encouter['code_x'] - - set(es[self.target_entity].df.identifier) - - entity_set_df[self.target_label_column_name] = list(merging_encouter['target']) - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=entity_set_df, - index='identifier') - - return es - - else: - raise ValueError( - 'Can not generate target label {} in table {} beacuse end label in \ - table {} contains missing value.'.format( - self.target_label_column_name, self. target_entity, generate_from)) - - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) diff --git a/cardea/problem_definition/predicting_diagnosis.py b/cardea/problem_definition/predicting_diagnosis.py deleted file mode 100644 index f4afeb26..00000000 --- a/cardea/problem_definition/predicting_diagnosis.py +++ /dev/null @@ -1,142 +0,0 @@ -import pandas as pd - -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class DiagnosisPrediction (ProblemDefinition): - """Defines the problem of diagnosis Prediction. - - Finding whether a patient will be diagnosed with a specifed diagnosis. - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. - """ - __name__ = 'diagnosis' - - updated_es = None - target_label_column_name = 'diagnosis' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - prediction_type = 'classification' - conn = 'period' - - def __init__(self, d): - self.diagnosis = d - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. 
- """ - - es = self.generate_target_label(es) - - if DataLoader().check_column_existence( - es, - self.cutoff_entity, - self.cutoff_time_label): # check the existance of the cutoff label - - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - cutoff_times['label'] = list(es[self.target_entity].df[self.target_label_column_name]) - cutoff_times['label'] = cutoff_times['label'] == self.diagnosis - - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. - """ - generate_from = 'Period' - if (self.check_target_label( - es, - self.target_entity, - self.target_label_column_name)): - - if not DataLoader().check_for_missing_values(es, - self.target_entity, - self.target_label_column_name): - entity_set_df = es[self.target_entity].df - - merging_coding = pd.merge(es['Coding'].df, es['CodeableConcept'].df, - left_on='object_id', right_on='coding', how='left') - merging_condtion = pd.merge(merging_coding, es['Condition'].df, - left_on='object_id_y', right_on='code', how='left') - merging_diagnosis = pd.merge( - merging_condtion, - es['Encounter_Diagnosis'].df, - left_on='identifier', - right_on='condition', how='left') - - merging_encouter = pd.merge(merging_diagnosis, es[self.target_entity].df, - left_on='subject', right_on='identifier', how='left') - merging_encouter['target'] = merging_encouter['code_x'] - - set(es[self.target_entity].df.identifier) - - entity_set_df[self.target_label_column_name] = list(merging_encouter['target']) - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=entity_set_df, - index='identifier') - - return es - - else: - raise ValueError( - 'Can not generate target label {} in table {} beacuse end label in \ - table {} contains missing value.'.format( - self.target_label_column_name, self. target_entity, generate_from)) - - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) diff --git a/cardea/problem_definition/prolonged_length_of_stay.py b/cardea/problem_definition/prolonged_length_of_stay.py deleted file mode 100644 index 0e88a374..00000000 --- a/cardea/problem_definition/prolonged_length_of_stay.py +++ /dev/null @@ -1,169 +0,0 @@ -import featuretools as ft -import pandas as pd - -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class ProlongedLengthOfStay (ProblemDefinition): - """Defines the problem of length of stay - - Predicting whether a patient stayed in the hospital more or less than - a week (by default). 
- - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. - """ - __name__ = 'plos' - - updated_es = None - target_label_column_name = 'length' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - conn = 'period' - prediction_type = 'classification' - - def __init__(self, t=7): - self.threshold = t - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - if (self.check_target_label(es, - self.target_entity, - self.target_label_column_name) and not - self.check_for_missing_values_in_target_label(es, - self.target_entity, - self.target_label_column_name)): - if DataLoader().check_column_existence( - es, - self.cutoff_entity, - self.cutoff_time_label): - - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - update_es = es[self.target_entity].df - - # threshold - update_es['length'] = (update_es['length'] >= self.threshold) - update_es['length'] = update_es['length'].astype(int) - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=update_es, - index='identifier') - - cutoff_times['label'] = list( - es[self.target_entity].df[self.target_label_column_name]) - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - else: - updated_es = self.generate_target_label(es) - return self.generate_cutoff_times(updated_es) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. 
- """ - generate_from = 'Period' - start = self.cutoff_time_label - end = 'end' - label_name = self.target_label_column_name - if (DataLoader().check_column_existence( - es, - generate_from, - start) and DataLoader().check_column_existence(es, - generate_from, - end)): - - if (not DataLoader().check_for_missing_values( - es, - generate_from, - start) and not DataLoader().check_for_missing_values(es, - generate_from, - end)): - - es[generate_from].df[start] = pd.to_datetime( - es[generate_from] - .df[start]) - es[generate_from].df[end] = pd.to_datetime( - es[generate_from].df[end]) - duration = (es[generate_from].df[end] - - es[generate_from].df[start]).dt.days - duration = duration.tolist() - es[self.target_entity].df[label_name] = duration - updated_target_entity = es[self.target_entity].df - duration_df = pd.DataFrame({'object_id': duration}) - - es = es.entity_from_dataframe( - entity_id='Duration', - dataframe=duration_df, - index='object_id') - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=updated_target_entity, - index='identifier') - new_relationship = ft.Relationship(es['Duration']['object_id'], - es[self.target_entity][label_name]) - es = es.add_relationship(new_relationship) - - return es - - else: - raise ValueError( - 'Can not generate target label {} in table {} beacuse start or end labels in \ - table {} contain missing value.'.format( - label_name, self.target_entity, generate_from)) - - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - label_name, - self.target_entity)) diff --git a/cardea/problem_definition/readmission.py b/cardea/problem_definition/readmission.py deleted file mode 100644 index 7ce99400..00000000 --- a/cardea/problem_definition/readmission.py +++ /dev/null @@ -1,169 +0,0 @@ -import pandas as pd - -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class Readmission (ProblemDefinition): - """Defines the problem of readmission. - - Predicting whether a patient will revisit the hospital within certain period of time. - - Note: - The patient visit is considered a readmission if he/she visits - the hospital again within 30 days. - - The readmission diagnosis does not have to be the same as the initial visit diagnosis, - (The patient could be diagnosed of something that is a complication - of the initial diagnosis). - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. - """ - __name__ = 'readmission' - - updated_es = None - target_label_column_name = 'readmitted' - target_entity = 'Encounter' - cutoff_time_label = 'end' - cutoff_entity = 'Period' - prediction_type = 'classification' - conn = 'period' - - def __init__(self, t=30): - self.readmission_threshold = t - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. 
- - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - self.generate_target_label(es) - - if DataLoader().check_column_existence( - es, - self.cutoff_entity, - self.cutoff_time_label): # check the existance of the cutoff label - - generated_cts = self.unify_cutoff_time_discharge_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - cutoff_times['label'] = list(es[self.target_entity].df[self.target_label_column_name]) - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. - """ - generate_from = 'Period' - end = 'end' - if (DataLoader().check_column_existence( - es, - generate_from, - end)) and (DataLoader().check_column_existence(es, - self.target_entity, - 'period')): - - if not DataLoader().check_for_missing_values( - es, - generate_from, end): - - entity_set_df = es[self.target_entity].df - generated_df = es[generate_from].df - merged_df = pd.merge(entity_set_df, generated_df, how='left', - left_on='period', right_on='object_id') - - generated_target_label = [] - encounter_identifier = [] - - for patient in set(merged_df['subject']): - patient_visits = merged_df[merged_df['subject'] == patient] - inital_date = patient_visits[end].iloc[0] - - encounter_identifier.append(patient_visits['identifier'].iloc[0]) - generated_target_label.append(False) # first visit - - if len(patient_visits) != 1: - for visit_date, encounter_id in zip(patient_visits[end][1:], - patient_visits['identifier'][1:]): - - visit_range = visit_date - inital_date - inital_date = visit_date - - if visit_range.days <= self.readmission_threshold: - generated_target_label.append(True) - encounter_identifier.append(encounter_id) - - else: - generated_target_label.append(False) - encounter_identifier.append(encounter_id) - - generated_labels = pd.DataFrame( - {self.target_label_column_name: generated_target_label, - 'identifier': encounter_identifier}) - updated_target_entity = pd.merge(entity_set_df, - generated_labels, - on='identifier') - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=updated_target_entity, - index='identifier') - - return es - - else: - raise ValueError( - 'Can not generate target label {} in table {} beacuse end label in \ - table {} contains missing value.'.format( - self.target_label_column_name, self. 
target_entity, generate_from)) - - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) diff --git a/cardea/problem_definition/show_noshow_appointment.py b/cardea/problem_definition/show_noshow_appointment.py deleted file mode 100644 index 6093802f..00000000 --- a/cardea/problem_definition/show_noshow_appointment.py +++ /dev/null @@ -1,76 +0,0 @@ - -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class MissedAppointment(ProblemDefinition): - """Defines the problem of missed appointment - - Predict whether the patient will show to the appointment or not. - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. - """ - __name__ = 'mapp' - - target_label_column_name = 'status' - target_entity = 'Appointment' - prediction_type = 'classification' - cutoff_time_label = 'created' - cutoff_entity = target_entity - - def generate_cutoff_times(self, entity_set): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. 
- """ - - if (self.check_target_label( - entity_set, - self.target_entity, - self.target_label_column_name)) and\ - not (self.check_for_missing_values_in_target_label(entity_set, - self.target_entity, - self.target_label_column_name)): - - if DataLoader().check_column_existence(entity_set, - self.target_entity, - self.cutoff_time_label): - - instance_id = list(entity_set[self.target_entity].df.index) - cutoff_times = entity_set[self.cutoff_entity].df[self.cutoff_time_label].to_frame() - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - cutoff_times['label'] = list( - entity_set[self.target_entity].df[self.target_label_column_name]) - entity_set[self.target_entity].delete_variables([self.target_label_column_name]) - return (entity_set, self.target_entity, cutoff_times) - else: - raise ValueError( - 'Cutoff time label {} in table {} does not exist'.format( - 'created', self.target_entity)) - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) diff --git a/docs/api_reference/cardea.rst b/docs/api_reference/cardea.rst index 9f968a44..057a1f06 100644 --- a/docs/api_reference/cardea.rst +++ b/docs/api_reference/cardea.rst @@ -12,12 +12,9 @@ Cardea :toctree: api/ Cardea - Cardea.load_entityset - Cardea.list_problems - Cardea.select_problem - Cardea.list_feature_primitives - Cardea.generate_features - Cardea.select_pipeline + Cardea.label + Cardea.featurize + Cardea.set_pipeline Cardea.train_test_split Cardea.fit Cardea.predict diff --git a/docs/api_reference/data_loader.rst b/docs/api_reference/data_assembling.rst similarity index 78% rename from docs/api_reference/data_loader.rst rename to docs/api_reference/data_assembling.rst index def53bfb..c02a55eb 100644 --- a/docs/api_reference/data_loader.rst +++ b/docs/api_reference/data_assembling.rst @@ -1,9 +1,9 @@ -.. _cardea.data_loader: +.. _cardea.data_assembling: cardea.data_loader ================== -.. currentmodule:: cardea.data_loader +.. currentmodule:: cardea.data_assembling EntitySet Loader ~~~~~~~~~~~~~~~~ diff --git a/docs/api_reference/featurization.rst b/docs/api_reference/featurizing.rst similarity index 72% rename from docs/api_reference/featurization.rst rename to docs/api_reference/featurizing.rst index ff3d98eb..3365cc24 100644 --- a/docs/api_reference/featurization.rst +++ b/docs/api_reference/featurizing.rst @@ -1,9 +1,9 @@ -.. _cardea.featurization: +.. _cardea.featurizing: cardea.featurization ==================== -.. currentmodule:: cardea.featurization +.. currentmodule:: cardea.featurizing Featurization ~~~~~~~~~~~~~~~~~ diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst index 11ff1b1d..ce3f56dc 100644 --- a/docs/api_reference/index.rst +++ b/docs/api_reference/index.rst @@ -9,8 +9,7 @@ In this section you will find a detailed specification of all the public functio :maxdepth: 2 cardea - data_loader - problem_definition - featurization + data_assembling + featurizing modeling fhir \ No newline at end of file diff --git a/docs/api_reference/problem_definition.rst b/docs/api_reference/problem_definition.rst deleted file mode 100644 index 7a109238..00000000 --- a/docs/api_reference/problem_definition.rst +++ /dev/null @@ -1,60 +0,0 @@ -.. _cardea.problem_definition: - -cardea.problem_definition -========================= - -.. currentmodule:: cardea.problem_definition - -Prolonged Length of Stay -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autosummary:: - :toctree: api/ - - ProlongedLengthOfStay - ProlongedLengthOfStay.generate_cutoff_times - -Length of Stay -~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - LengthOfStay - LengthOfStay.generate_cutoff_times - -Readmission -~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - Readmission - Readmission.generate_cutoff_times - -MortalityPrediction -~~~~~~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - MortalityPrediction - MortalityPrediction.generate_cutoff_times - -DiagnosisPrediction -~~~~~~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - DiagnosisPrediction - DiagnosisPrediction.generate_cutoff_times - -MissedAppointmentProblemDefinition -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - MissedAppointment - MissedAppointment.generate_cutoff_times diff --git a/docs/basic_concepts/advanced_use.rst b/docs/basic_concepts/advanced_use.rst deleted file mode 100644 index 09df92a7..00000000 --- a/docs/basic_concepts/advanced_use.rst +++ /dev/null @@ -1,55 +0,0 @@ -Advanced use -============ - -How to define a new machine learning task? ------------------------------------------- - -The definition of a new Machine Learning task in Cardea can be made in four simple steps: - -1. Go to the `problem_definition`_ directory and create a file with a class specifically for - your problem. This class should extend the `ProblemDefinition`_ class and overwrites - accordingly the necessary attributes and methods as needed. Usually, you should pay special - attention to the ``generate_target_label(...)`` and ``generate_cutoff_times(...)`` methods - as you might need to extend them or re-implemented in some cases. - -2. Expose your new class definition in the `init`_ file inside the `problem_definition`_ directory - -3. If you will be using a dataset in a different format that the expected by Cardea (CSV files), - then you will need to provide a specific loading dataset method for your data in the - `EntitySetLoader`_ class, where you will be creating your collection of entities and - relationships between them using the `featuretools.EntitySet`_ class. - -4. Finally, you need to update the `Cardea`_ class to support the new problem definition and be - able to instantiate the proper class when it is necessary in the ``Cardea.select_problem(...)`` - method. - -Features, primitives and AutoML integration -------------------------------------------- - -Once you have defined your problem, following the four steps in the previous section, you will be -able to perform featurization and run different primitives using the AutoML tool as follows: - -.. code-block:: python - - from cardea import Cardea - cardea = Cardea() - cardea.load_your_custom_data() - problem = cardea.select_problem('YourCustomProblemDefinition') - feature_matrix = cardea.generate_features(problem[:1000]) # a subset - feature_matrix = feature_matrix.sample(frac=1) # shuffle - y = list(feature_matrix.pop('label')) - X = feature_matrix.values - pipeline = [ - ['sklearn.ensemble.RandomForestClassifier'], - ['sklearn.naive_bayes.MultinomialNB'], - ['sklearn.neighbors.KNeighborsClassifier'] - ] - result = cardea.execute_model(feature_matrix=X, target=y, primitives=pipeline) - - -.. _featuretools.EntitySet: https://docs.featuretools.com/generated/featuretools.EntitySet.html#featuretools.EntitySet -.. _problem_definition: https://github.com/D3-AI/Cardea/tree/master/cardea/problem_definition -.. 
_ProblemDefinition: https://github.com/D3-AI/Cardea/blob/master/cardea/problem_definition/definition.py -.. _init: https://github.com/D3-AI/Cardea/blob/master/cardea/problem_definition/__init__.py -.. _EntitySetLoader: https://github.com/D3-AI/Cardea/blob/master/cardea/data_loader/entityset_loader.py#L9 -.. _Cardea: https://github.com/D3-AI/Cardea/blob/master/cardea/cardea.py diff --git a/docs/basic_concepts/auditing.rst b/docs/basic_concepts/auditing.rst deleted file mode 100644 index 4458f2ac..00000000 --- a/docs/basic_concepts/auditing.rst +++ /dev/null @@ -1,37 +0,0 @@ -Auditing -======== - -One element that is essential to prediction problems is the evaluation of the prediction results, -but this might come in various forms and users rely on different metrics to identify the best -model for a specific problem. Commonly, some metrics might be more representative than others -depending on problem. - -Therefore, to facilitate the auditing process, Cardea has two components designed specifically -to cover both: data and model auditing, given that prediction problems rely mainly on the data -that is being used. While Cardea provides a set of metrics that can be used as default metrics -for certain prediction problems, it also provides the means to expand them and allow users to -introduce new kind of metrics. - -Using Cardea, users have the ability to generate a data summary report describing the data through -the Data Auditor, enhancing users' understandability and engagement. Although the system includes -a set of predefined audits that are commonly applied in the literature, they can also specify special -types of audits that they want to apply on their dataset, using a dictionary of all the possible checks -that must be reported. - -These checks are divided in two categories: **data quality checks** and **data representation checks**. While -the data quality checks identifies the missing information in the data; the data representation checks -identifies data represents the users assumptions. - -Similarly, Cardea provides full report to users describing the performance and behavior of the model with -the `Model Auditor`_ component, aiming to give users more interpretability and understanding of the machine -learning model. - -Currently, prediction problems are categorized in regression or classification problems and each of them -has a wide range of metrics (e.g., accuracy, F1 scores, precision recall, AUC for classification and -mean square errors, mean absolute errors and r squared for regression). - -Additionally, given that Cardea provides the ability to run different pipelines composed of different -types of machine learning algorithms, the Model Auditor allows to compare multiple prediction -pipelines and evaluate changes in their behavior using different training and testing data sets. - -.. _Model Auditor: https://github.com/HDI-Project/ModelAudit diff --git a/docs/basic_concepts/auto_featurization.rst b/docs/basic_concepts/auto_featurization.rst deleted file mode 100644 index 725f26d4..00000000 --- a/docs/basic_concepts/auto_featurization.rst +++ /dev/null @@ -1,10 +0,0 @@ -Auto - Featurization -==================== - -Cardea automatically generates features using the `Featuretools`_ package, specifically, -the `Deep Feature Synthesis (DFS)`_ algorithm to generate a feature matrix from a given dataset. 
-Aiming to fully automate this process, it determines the focus values of the automated feature engineering -task using the **target entity**, **cutoff times**, and **label** of the prediction problem. - -.. _Featuretools: https://www.featuretools.com/ -.. _Deep Feature Synthesis (DFS): https://docs.featuretools.com/automated_feature_engineering/afe.html#deep-feature-synthesis diff --git a/docs/basic_concepts/auto_ml.rst b/docs/basic_concepts/auto_ml.rst deleted file mode 100644 index 0e244072..00000000 --- a/docs/basic_concepts/auto_ml.rst +++ /dev/null @@ -1,96 +0,0 @@ -Auto - ML -========= - -Cardea makes use of two packages to automate and simplify the modeling step in the Machine -Learning tasks: `MLPrimitives`_ and `MLBlocks`_. - -MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning -tools developed in Python, whether they are custom developments or belong to third party -libraries, and build Pipelines out of them that can be fitted and then used to make predictions. -This is achieved by providing a simple and intuitive annotation language that allows the user to -specify how to integrate with each tool, called **primitives**, in order to provide a common uniform -interface to each one of them. - -In the other hand, MLPrimitives is a repository containing primitive annotations to be used by the -MLBlocks library. - -Thanks to the use of these two packages, the Machine Learning algorithm selection and the -hyper-parameter tuning steps can be done easily using JSON annotations as follow: - -.. code-block:: python - - pipeline = [ - ['sklearn.ensemble.RandomForestClassifier'], - ['sklearn.naive_bayes.MultinomialNB'], - ['sklearn.neighbors.KNeighborsClassifier'] - ] - result = cardea.execute_model(..., primitives=pipeline) - -Where, for example, the ``sklearn.naive_bayes.MultinomialNB`` primitive is defined in the -`MLPrimitives`_ package, with the following structure: - -.. code-block:: python - - { - "name": "sklearn.naive_bayes.MultinomialNB", - "contributors": [...], - "documentation": "...", - "description": "...", - "classifiers": { - "type": "estimator", - "subtype": "classifier" - }, - "modalities": ["text"], - "primitive": "sklearn.naive_bayes.MultinomialNB", - "fit": { - "method": "fit", - "args": [ - { - "name": "X", - "type": "ndarray" - }, - { - "name": "y", - "type": "array" - } - ] - }, - "produce": { - "method": "predict", - "args": [ - { - "name": "X", - "type": "ndarray" - } - ], - "output": [ - { - "name": "y", - "type": "array" - } - ] - }, - "hyperparameters": { - "fixed": { - "fit_prior": { - "type": "bool", - "default": true - }, - "class_prior": { - "type": "iterable", - "default": null - } - }, - "tunable": { - "alpha": { - "type": "float", - "default": 1.0, - "range": [0.0, 1.0] - } - } - } - } - - -.. _MLPrimitives: https://hdi-project.github.io/MLPrimitives/ -.. _MLBlocks: https://hdi-project.github.io/MLBlocks/ diff --git a/docs/basic_concepts/data_loading.rst b/docs/basic_concepts/data_loading.rst deleted file mode 100644 index 73e06efd..00000000 --- a/docs/basic_concepts/data_loading.rst +++ /dev/null @@ -1,66 +0,0 @@ -Data Loading -============ - -Cardea makes use of a module to plugin the user's data and automatically organize it into the framework. -It expects data in Fast Healthcare Interoperability Resources (FHIR), a standard for health care data -exchange, published by HL7®. 
Among the advantages of FHIR over other standards are: - -* Fast and easy to implement -* Specification is free for use with no restrictions -* Strong foundation in Web standards: XML, JSON, HTTP, OAuth, etc. -* Support for RESTful architectures -* Concise and easily understood specifications -* A human-readable serialization format for ease of use by developers - -By default, Cardea loads a dataset hosted in `Amazon S3`_, representing a formatted version of the -Kaggle dataset: `Medical Appointment No Shows`_, but it also allows user to load datasets providing a -local path with CSV files, using the ``load_data_entityset(...)`` method. As an example, the following piece -of code will load the default Kaggle dataset: - -.. ipython:: python - - from cardea import Cardea - cardea = Cardea() - cardea.load_entityset(data='kaggle') - -While local files can be loaded using the same method with a ``data`` parameter: - -.. code-block:: python - - cardea.load_entityset(data="your/local/path/") - -Cardea handles datasets as a collection of entities and the relationships between them because they -are useful for preparing raw, structured datasets for feature engineering. For this, it uses -the `featuretools.EntitySet`_ class. - -Using the following command, you will be able to summarize the dataset: - -.. code-block:: python - - cardea.es - Entityset: fhir - Entities: - Address [Rows: 81, Columns: 2] - Appointment_Participant [Rows: 6100, Columns: 2] - Appointment [Rows: 110527, Columns: 5] - CodeableConcept [Rows: 4, Columns: 2] - Coding [Rows: 3, Columns: 2] - Identifier [Rows: 227151, Columns: 1] - Observation [Rows: 110527, Columns: 3] - Patient [Rows: 6100, Columns: 4] - Reference [Rows: 6100, Columns: 1] - Relationships: - Appointment_Participant.actor -> Reference.identifier - Appointment.participant -> Appointment_Participant.object_id - CodeableConcept.coding -> Coding.object_id - Observation.code -> CodeableConcept.object_id - Observation.subject -> Reference.identifier - Patient.address -> Address.object_id - -Showing, in this case, the resources that were loaded into the framework (**Entities** section) -and the relationship between the resources (**Relationships** section). - - -.. _Amazon S3: https://s3.amazonaws.com/dai-cardea/ -.. _Medical Appointment No Shows: https://www.kaggle.com/joniarroba/noshowappointments -.. _featuretools.EntitySet: https://docs.featuretools.com/generated/featuretools.EntitySet.html#featuretools.EntitySet diff --git a/docs/basic_concepts/index.rst b/docs/basic_concepts/index.rst deleted file mode 100644 index cf0ebc8d..00000000 --- a/docs/basic_concepts/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. _concepts: - -Basic Concepts -============== - -Before diving into advanced usage and contributions, let's review the basic concepts of the -library to help you get started. - - -.. toctree:: - :maxdepth: 3 - - data_loading - machine_learning_tasks - auto_featurization - auto_ml - auditing - advanced_use diff --git a/docs/basic_concepts/machine_learning_tasks.rst b/docs/basic_concepts/machine_learning_tasks.rst deleted file mode 100644 index f3757aa5..00000000 --- a/docs/basic_concepts/machine_learning_tasks.rst +++ /dev/null @@ -1,66 +0,0 @@ -Machine Learning Tasks -====================== - -The Problem Definition is considered a fundamental component that formulates the task for -Machine Learning models. It includes generating and identifying two main concepts: -the **target variable** and the **cutoff times**. 
- -Therefore, the first step to work with Cardea is defining a Machine Learning Task (or using one -of the already defined tasks). For example, **Missed Appointment** is a common task that aims -to predict whether the patient showed to the appointment or not, helping hospitals to optimize -their scheduling policies and resources efficiently. - -Outcome to predict ------------------- - -Following with the previous example, the **Missed Appointment** task is currently defined as -a binary classification task in the system, determining whether a patient showed to the appointment -or not from the point of appointment scheduling. - -Usually, the outcome is defined over the FHIR data schema, using the resource id values for -references between instances. - -Cutoff times and Labels ------------------------ - -As it was stated before, the success of the Problem Definition step and its outcome depends on -two main concepts: the **target variable** and the **cutoff times**. The target variable is -generated automatically by Cardea if it does not exist in the dataset and its objective is to -set the definition of the model output. In the other hand, the objective of cutoff times is to -split the data in such manner that any events before the cutoff time are used for training while -events after the cutoff time are used for testing. The following code shows the format for these -values in the **Missed Appointment** task: - -.. ipython:: python - - from cardea import Cardea - cardea = Cardea() - cardea.load_entityset(data='kaggle') - cardea.select_problem('MissedAppointment') - -Current Prediction Problems ---------------------------- - -Cardea encapsulates six different prediction problems for users to explore easily, -these are described as follows: - -1. Diagnosis Prediction: - a. Predicts whether a patient will be diagnosed with a specified diagnosis. -2. Length of Stay: - a. Predicts how many days the patient will be in the hospital. -3. Missed Appointment: - a. Predicts whether the patient showed to the appointment or not. -4. Mortality Prediction: - a. Predicts whether a patient will suffer from mortality. -5. Prolonged Length of Stay: - a. Predicts whether a patient stayed in the hospital more or less than a period of time (a week by default). -6. Readmission: - a. Predicts whether a patient will revisit the hospital within certain period of time (a month by default). - -You can see the list of problems using the ``list_problems(...)`` method, example: - -.. ipython:: python - - from cardea import Cardea - cardea = Cardea() - cardea.list_problems() diff --git a/docs/community/index.rst b/docs/community/index.rst deleted file mode 100644 index 3c34c3a3..00000000 --- a/docs/community/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -Community -================ - -Cardea is a community driven effort, so it relies on contributions from the community. Therefore, every contribution is welcome, and they are greatly appreciated! Every little bit helps, and credit will always be given. - -.. 
toctree::
-   :maxdepth: 2
-
-   welcome
-   contributing
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index 8465a602..6de35f01 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -91,7 +91,7 @@ copyright = u"2018, MIT Data To AI Lab"
 author = u"MIT Data To AI Lab"
 description = 'Automated Machine Learning on Electronic Health Records'
 
-user = 'DAI-Lab'
+user = 'MLBazaar'
 
 # The version info for the project you're documenting, acts as replacement
 # for |version| and |release|, also used in various other places throughout
diff --git a/docs/community/contributing.rst b/docs/developer_guides/contributing.rst
similarity index 97%
rename from docs/community/contributing.rst
rename to docs/developer_guides/contributing.rst
index f73491fc..25722afa 100644
--- a/docs/community/contributing.rst
+++ b/docs/developer_guides/contributing.rst
@@ -5,8 +5,7 @@ Contributing Guidelines
 
 Ready to contribute with your own code? Great!
 
-Before diving deeper into the contributing guidelines, please make sure to having read
-the :ref:`concepts` section and to have gone through the development guide.
+Before diving deeper into the contributing guidelines, please make sure to have read the :ref:`user_guides` section and to have gone through the :ref:`development` setup.
 
 Afterwards, please make sure to read the following contributing guidelines carefully, and
 later on head to the step-by-step guides for each possible type of contribution.
diff --git a/docs/developer_guides/index.rst b/docs/developer_guides/index.rst
new file mode 100644
index 00000000..e2f7567d
--- /dev/null
+++ b/docs/developer_guides/index.rst
@@ -0,0 +1,13 @@
+.. _developer_guides:
+
+================
+Developer Guides
+================
+
+In the Developer Guides we discuss the architecture of the Cardea project and the related libraries in depth, and provide clear instructions on how to extend their development, whether to better adapt them to your needs or to contribute back to the libraries.
+
+.. toctree::
+   :maxdepth: 2
+
+   welcome
+   contributing
\ No newline at end of file
diff --git a/docs/community/welcome.rst b/docs/developer_guides/welcome.rst
similarity index 97%
rename from docs/community/welcome.rst
rename to docs/developer_guides/welcome.rst
index e67ac144..12f58a02 100644
--- a/docs/community/welcome.rst
+++ b/docs/developer_guides/welcome.rst
@@ -19,7 +19,7 @@ Reporting Issues
 ~~~~~~~~~~~~~~~~
 
 If there is something that you would like to see changed in the project, or that you just want
-to ask, please create an issue at https://github.com/D3-AI/Cardea/issues
+to ask, please create an issue at https://github.com/MLBazaar/Cardea/issues
 
 If you do so, please:
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
index db24350c..9ffb5c68 100644
--- a/docs/getting_started/quickstart.rst
+++ b/docs/getting_started/quickstart.rst
@@ -7,35 +7,65 @@
 The first step to use Cardea is to follow the :ref:`installation` instructions. After
 having a working environment, you can start using the Cardea library in a Python console
 using the following steps:
 
-First, load the core class to work with:
+First, we need to have a ``data_path`` referencing the data we will be working with. This data
+can be in either FHIR or MIMIC format. In this quickstart, we will use a pre-processed version of
+the Kaggle dataset: `Medical Appointment No Shows`_, using the following command:
 
 .. ipython:: python
 
-    from cardea import Cardea
-    cardea = Cardea()
-
-Second, load a dataset. By default, if no path is given, Cardea automatically loads a
-pre-processed version of the Kaggle dataset: `Medical Appointment No Shows`_, using the
-following command:
+    from cardea.data import download
+
+    data_path = download('kaggle')
 
-.. ipython:: python
+Alternatively, we can manually download this dataset from the s3 bucket using:
 
-    cardea.load_entityset(data='kaggle')
-    cardea.es
+::
+
+    curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip -d kaggle kaggle.zip
 
-You can see the list of problem definitions and select one with the following commands:
+Then, we load the core class to work with:
 
 .. ipython:: python
 
-    cardea.list_problems()
+    from cardea import Cardea
 
-From there, you can select the prediction problem you aim to solve by specifying the name of the class, which in return gives us the ``label_times`` of the problem.
+    cardea = Cardea(data_path=data_path,
+                    fhir=True)
+
+To verify that the data has been loaded, you can find the loaded entityset by viewing ``cardea.entityset`` which should output the following:
+
+::
+
+    Entityset: kaggle
+      Entities:
+        Address [Rows: 81, Columns: 2]
+        Appointment_Participant [Rows: 6100, Columns: 2]
+        Appointment [Rows: 110527, Columns: 5]
+        CodeableConcept [Rows: 4, Columns: 2]
+        Coding [Rows: 3, Columns: 2]
+        Identifier [Rows: 227151, Columns: 1]
+        Observation [Rows: 110527, Columns: 3]
+        Patient [Rows: 6100, Columns: 4]
+        Reference [Rows: 6100, Columns: 1]
+      Relationships:
+        Appointment_Participant.actor -> Reference.identifier
+        Appointment.participant -> Appointment_Participant.object_id
+        CodeableConcept.coding -> Coding.object_id
+        Observation.code -> CodeableConcept.object_id
+        Observation.subject -> Reference.identifier
+        Patient.address -> Address.object_id
+
+After that, we need to select the specific prediction problem we are interested in; you
+can use the command ``cardea.list_labelers`` to view the readily available functions. Once
+that has been determined, we pass the function of interest to the data labeler, which in
+return gives us the ``label_times`` of the problem.
 
 .. ipython:: python
+
+    from cardea.data_labeling import appointment_no_show
 
-    label_times = cardea.select_problem('MissedAppointment')
+    label_times = cardea.label(appointment_no_show, subset=100)
     label_times.head()
 
+``label_times`` summarizes, for each instance in the dataset, (1) its corresponding label and (2) the time index that indicates the timespan allowed for calculating the features that pertain to that instance.
+
 Then, you can perform the AutoML steps and take advantage of Cardea.
 
 Cardea extracts features through automated feature engineering by supplying the ``label_times`` pertaining to the problem you aim to solve, using the following commands:
 
@@ -43,7 +73,7 @@
 .. ipython:: python
    :okwarning:
 
-    feature_matrix = cardea.generate_features(label_times[:1000]) # a subset
+    feature_matrix = cardea.featurize(label_times)
     feature_matrix.head()
 
 Once we have the features, we can now split the data into training and testing
 
 .. ipython:: python
    :okwarning:
 
-    y = list(feature_matrix.pop('label'))
+    y = feature_matrix.pop('label').values
     X = feature_matrix.values
 
     X_train, X_test, y_train, y_test = cardea.train_test_split(
@@ -63,7 +93,7 @@ Now that we have our feature matrix properly divided, we can use it to train our machine
 
 .. ipython:: python
    :okwarning:
 
-    cardea.select_pipeline('Random Forest')
+    cardea.set_pipeline('Random Forest')
     cardea.fit(X_train, y_train)
     y_pred = cardea.predict(X_test)
 
@@ -73,7 +103,7 @@ Finally, you can see accuracy results using the following commands:
 
 .. ipython:: python
    :okwarning:
 
-    cardea.evaluate(X, y, test_size=0.2, metrics=['Accuracy', 'F1 Macro'])
+    cardea.evaluate(X_test, y_test, metrics=['Accuracy', 'F1 Macro'])
 
 .. _Medical Appointment No Shows: https://www.kaggle.com/joniarroba/noshowappointments
diff --git a/docs/index.rst b/docs/index.rst
index 98f68b93..e30fc4ca 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -5,7 +5,7 @@ An open source project from Data to AI Lab at MIT.
-|Development Status| |PyPi Shield| |Run Tests| |Downloads| |Binder|
+|Development Status| |PyPi Shield| |Run Tests Shield| |Downloads| |Binder|
 
 Welcome to Cardea
 ==================
@@ -34,28 +34,41 @@ Cardea is a machine learning library built on top of *schemas* that support elec
 Our goal is to provide an easy to use library to develop machine learning models from
 electronic health records. A typical usage of this library will involve interacting with our
 API to develop prediction models.
 
+Machine Learning Process
+~~~~~~~~~~~~~~~~~~~~~~~~
+Cardea is composed of a series of sequential processes that are applied to organize, structure, and build machine learning models on electronic health records datasets. These processes are visualized in the following diagram, where each block represents a process and the output of that process is used by the succeeding block.
+
 .. figure:: images/cardea-process.png
    :width: 600 px
    :alt: Cardea Process
 
-A series of sequential processes are applied to build a machine learning model. These processes are triggered using our following APIs to perform the following:
+Diving into this diagram more thoroughly:
+
+* we first load the desired data using the **data assembler** to generate an entityset representation of the data. The entityset data structure contains the entities (tables) and the relationships that occur between these tables. Read more about the :ref:`data_assembler`.
+
+* next, you can investigate the given entityset and decide which prediction problem you wish to solve by using the **data labeler**. Based on the desired prediction problem, cardea creates ``label_times``, a ``pandas.DataFrame`` that contains three columns:
+
+  * an *instance id* that is unique per row.
+  * a *time index* that indicates the timespan within which data may be used to generate the features for the associated instance.
+  * a *label* that denotes what the framework is trying to predict given the selected problem.
+
+You can read more about the :ref:`data_labeler`. It is important to note that ``label_times`` is an essential input to the featurization process.
 
-* loading data using the automatic **data assembler**, where we capture data from its raw format into an entityset representation.
-
-* **data labeling** where we create label times that generates (1) the time index that indicates the timespan for which I create my features (2) the encoded labels of the prediction task. this is essential for our feature engineering phase.
+* then we can automatically engineer features of our entityset using the **featurizer** by supplying ``label_times``. This will generate a ``feature_matrix`` that contains each instance, its extracted features, and its label. Visit :ref:`featurizer` for more information.
 
-* **featurization** for which we automatically feature engineer our data to generate a feature matrix.
+* lastly comes the **modeling** process. In this block, we use the generated ``feature_matrix`` to train our model, tune it, and then assess its performance. More on pipeline training and hyperparameter tuning is provided in the :ref:`modeler` section.
 
-* lastly, we build, train, and tune our machine learning model using the **modeling component**.
+This was a quick overview of how we designed the cardea framework. For further details on each process and the data structures in each block, please visit the page of the corresponding process. A minimal end-to-end sketch of the full flow is shown below.
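To make the walkthrough above concrete, the following minimal sketch strings the four blocks together end to end. It is illustrative only and assumes the pre-processed Kaggle demo dataset used throughout these docs; every call shown (``download``, ``label``, ``featurize``, ``train_test_split``, ``set_pipeline``, ``fit``, ``evaluate``) appears in the quickstart.

.. code-block:: python

    # Illustrative end-to-end sketch, assuming the Kaggle demo data.
    from cardea import Cardea
    from cardea.data import download
    from cardea.data_labeling import appointment_no_show

    data_path = download('kaggle')                    # data assembler input
    cardea = Cardea(data_path=data_path, fhir=True)   # builds the entityset

    label_times = cardea.label(appointment_no_show, subset=1000)  # data labeler
    feature_matrix = cardea.featurize(label_times)                # featurizer

    # modeler: split, train, and evaluate
    y = feature_matrix.pop('label').values
    X = feature_matrix.values
    X_train, X_test, y_train, y_test = cardea.train_test_split(
        X, y, test_size=0.2, shuffle=True)

    cardea.set_pipeline('Random Forest')
    cardea.fit(X_train, y_train)
    print(cardea.evaluate(X_test, y_test))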
Explore Cardea
--------------
 
 * `Getting Started <getting_started/index.html>`_
-* `Basic Concepts <basic_concepts/index.html>`_
+* `User Guides <user_guides/index.html>`_
 * `API Reference <api_reference/index.html>`_
-* `Community <community/index.html>`_
+* `Developer Guides <developer_guides/index.html>`_
 * `Release Notes <history.html>`_
 
--------------
 
@@ -64,7 +77,7 @@ Explore Cardea
    :target: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha
 .. |PyPi Shield| image:: https://img.shields.io/pypi/v/cardea.svg
    :target: https://pypi.python.org/pypi/cardea
-.. |Run Tests| image:: https://github.com/MLBazaar/Cardea/workflows/Run%20Tests/badge.svg
+.. |Run Tests Shield| image:: https://github.com/MLBazaar/Cardea/workflows/Run%20Tests/badge.svg
    :target: https://github.com/MLBazaar/Cardea/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster
 .. |Downloads| image:: https://pepy.tech/badge/cardea
    :target: https://pepy.tech/project/cardea
@@ -78,9 +91,9 @@ Explore Cardea
    :titlesonly:
 
    getting_started/index
-   basic_concepts/index
+   user_guides/index
    api_reference/index
-   community/index
+   developer_guides/index
    Release Notes
 
 .. _FHIR: https://www.hl7.org/fhir/
diff --git a/docs/user_guides/auto_featurization.rst b/docs/user_guides/auto_featurization.rst
new file mode 100644
index 00000000..d2bc27a7
--- /dev/null
+++ b/docs/user_guides/auto_featurization.rst
@@ -0,0 +1,25 @@
+.. _featurizer:
+
+==========
+Featurizer
+==========
+
+Cardea automatically generates a feature matrix from a given dataset using the `Featuretools`_ package, specifically the `Deep Feature Synthesis (DFS)`_ algorithm. Aiming to fully automate this process, it determines the focus of the automated feature engineering
+task using the **target entity** and **label times** of the prediction problem created by the :ref:`data_labeler`.
+
+Once you featurize the data, you will obtain a feature matrix, where each row pertains to a specific ``instance_id`` defined in the ``label_times``, together with a collection of calculated features.
+
+Featurizing Demo
+----------------
+
+We can continue our example walkthrough and generate features on the Missed Appointment dataset.
+
+.. code-block:: python
+
+    feature_matrix = cardea.featurize(label_times)
+
+.. note::
+    the last column in the feature matrix is the ``label`` column, which denotes the value we want to predict based on the selected prediction task.
+
+.. _Featuretools: https://www.featuretools.com/
+.. _Deep Feature Synthesis (DFS): https://docs.featuretools.com/automated_feature_engineering/afe.html#deep-feature-synthesis
diff --git a/docs/user_guides/auto_ml.rst b/docs/user_guides/auto_ml.rst
new file mode 100644
index 00000000..6cf2ad24
--- /dev/null
+++ b/docs/user_guides/auto_ml.rst
@@ -0,0 +1,52 @@
+.. _modeler:
+
+=======
+Modeler
+=======
+
+Cardea makes use of two packages to automate and simplify the modeling step in the Machine
+Learning tasks: `MLPrimitives`_ and `MLBlocks`_.
+
+MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning tools developed in Python, whether they are custom developments or belong to third party libraries, and building pipelines out of them that can be fitted and then used to make predictions.
+This is achieved by providing a simple and intuitive annotation language that allows the user to specify how to integrate with each tool, called a **primitive**, in order to provide a common uniform interface to each one of them.
+
+On the other hand, *MLPrimitives* is a repository containing primitive annotations to be used by the *MLBlocks* library.
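For instance, a pipeline can be assembled directly from primitive names. The following is a minimal sketch, not part of the Cardea API itself; it assumes ``mlblocks`` and ``mlprimitives`` are installed and that a feature matrix split (``X_train``, ``y_train``, ``X_test``) already exists:

.. code-block:: python

    # Minimal MLBlocks sketch: primitives are referenced by name and
    # resolved through the MLPrimitives annotations.
    from mlblocks import MLPipeline

    pipeline = MLPipeline(['sklearn.impute.SimpleImputer',
                           'sklearn.ensemble.RandomForestClassifier'])

    pipeline.fit(X_train, y_train)          # assumes an existing train split
    predictions = pipeline.predict(X_test)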
+ +Thanks to the use of these two packages, the Machine Learning algorithm selection and the hyper-parameter tuning steps can be done easily. + +Modeling Demo +------------- + +Continuing from the previous example of *Missed Appointments*, let’s first divide our ``feature_matrix`` into training and testing portions. + +.. code-block:: python + + y = feature_matrix.pop('label').values + X = feature_matrix.values + X_train, X_test, y_train, y_test = cardea.train_test_split( + X, y, test_size=0.2, shuffle=True) + +Second, we specify the pipeline we want to use for our prediction. Cardea has a number of pre-created pipelines which you can find in `cardea pipelines `__. We can then use the modeler component to help us train, tune, and select the best version of the pipeline. + +.. code-block:: python + + cardea.set_pipeline('Random Forest') + cardea.fit(X_train, y_train) + y_pred = cardea.predict(X_test) + +.. note:: + you can set ``tune=True`` to optimize the hyperparameters of the pipeline during the ``fit`` process. + + +Additionally, you can use ``fit_predict`` to train the pipeline and then make predictions directly on the same dataset. + +We can also evaluate the performance of the pipeline using ``cardea.evaluate``, which compares the predicted labels against the ground truth according to a list of given metrics. + +.. code-block:: python + + cardea.evaluate(X_test, y_test) + +Metrics are computed using `sklearn `__. By default, classification metrics include accuracy, f1 score, precision, and recall, while regression metrics include variance score, mean absolute error, mean squared error, mean squared log error, median absolute error, and r2 score. + +.. _MLPrimitives: https://MLBazaar.github.io/MLPrimitives/ +.. _MLBlocks: https://MLBazaar.github.io/MLBlocks/ diff --git a/docs/user_guides/data_assembler.rst b/docs/user_guides/data_assembler.rst new file mode 100644 index 00000000..4e4eb4b5 --- /dev/null +++ b/docs/user_guides/data_assembler.rst @@ -0,0 +1,85 @@ +.. _data_assembler: + +============== +Data Assembler +============== + +Cardea makes use of a module to plug in the user’s data and automatically organize it into the framework. It is built on top of schemas that support electronic health records (EHR). One of the schemas we support is the Fast Healthcare Interoperability Resources (FHIR) schema. You can read more about the supported :ref:`schemas`. + +Cardea expects the raw data to be a folder containing the data in ``.csv`` format. Each table/resource should correspond to a single ``.csv`` file, which is then fed directly into Cardea using ``load_entityset``. + +Entityset +--------- + +The entityset represents the data structure produced by the data assembler module. The process organizes the data into its corresponding table/resource within the schema and produces an entityset. Generally, the entityset is a collection of entities and relationships: + +* **entities** are used to prepare the data (tables) into a structured input for later usage. Each entity contains a ``pandas.DataFrame`` at its core, with meta information indicating the index column, time columns, and other information. +* **relationships** indicate the connections between two entities. A relationship represents how, for an entity *A* with primary key *1*, there is another entity *B* with foreign key *2* that references it. The relationship ``B.2 -> A.1`` ties them together. This parent-child relationship is embedded within the entityset. + +To read more about entitysets, visit `EntitySet`_.
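+For illustration, here is a toy sketch of this structure built by hand with the pre-1.0 ``featuretools`` API that Cardea relies on (the two tables below are invented and only mimic the FHIR resource names used later in this guide):
+
+.. code-block:: python
+
+    import featuretools as ft
+    import pandas as pd
+
+    # toy parent table (primary key: object_id)
+    participants = pd.DataFrame({'object_id': [0, 1]})
+
+    # toy child table (foreign key: participant)
+    appointments = pd.DataFrame({'identifier': [10, 11, 12],
+                                 'participant': [0, 1, 0]})
+
+    es = ft.EntitySet(id='toy')
+    es.entity_from_dataframe(entity_id='Appointment_Participant',
+                             dataframe=participants, index='object_id')
+    es.entity_from_dataframe(entity_id='Appointment',
+                             dataframe=appointments, index='identifier')
+
+    # Appointment.participant -> Appointment_Participant.object_id
+    es.add_relationship(ft.Relationship(
+        es['Appointment_Participant']['object_id'],  # parent (primary key)
+        es['Appointment']['participant']))           # child (foreign key)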
+ + +Data Assembling Demo +-------------------- + +Let's first look at some raw data. In this example, we have the Kaggle dataset `Medical Appointment No Shows`_, already preprocessed to be representative of the FHIR schema. You can download the dataset directly from `Amazon S3`_ or you can run the following command to download it and unzip it: + +.. code-block:: console + + curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip -d kaggle kaggle.zip + +You should then have the following directory: + +.. code-block:: console + + kaggle + ├── Coding.csv + ├── Appointment_Participant.csv + ├── Address.csv + ├── CodeableConcept.csv + ├── Reference.csv + ├── Observation.csv + ├── Identifier.csv + └── Appointment.csv + +.. note:: + notice how the file names correspond to the resource names in FHIR. + +Then you can load the dataset directly into Cardea by supplying the folder path to the initializer. + + +.. code-block:: python + + from cardea import Cardea + cardea = Cardea(data_path='path/to/data', fhir=True) + +We can investigate what the entityset looks like by simply displaying it through ``cardea.entityset``. + +.. code-block:: python + + cardea.entityset + +.. note:: + you can use ``cardea.entityset.plot()`` to visualize your entityset. + +The output shows two sections: + +* **Entities** where we can see the resources loaded. For example, *Appointment* is a resource in FHIR that holds most of the records of this dataset and contains 5 columns. +* **Relationships** where we can see the parent-child relationships. For example, ``Appointment.participant`` is the column ``participant`` in table ``Appointment`` that references the primary key ``object_id`` in table ``Appointment_Participant``. + + +We will utilize this structure to develop our :ref:`data_labeler` and :ref:`featurizer`. + + +FAQ +--- + +1. **What schemas do we support right now?** +We currently support two :ref:`schemas`: Fast Healthcare Interoperability Resources (FHIR) and Medical Information Mart for Intensive Care III (MIMIC-III). +2. **What if I only have a subset of tables?** +Cardea seamlessly integrates the available data, dropping missing variables and links. Having only a subset of the data does not preclude you from solving a prediction problem if all the necessary information is still present. + + +.. _EntitySet: https://featuretools.alteryx.com/en/stable/api_reference.html#entityset-entity-relationship-variable-types +.. _Amazon S3: https://dai-cardea.s3.amazonaws.com/kaggle.zip +.. _Medical Appointment No Shows: https://www.kaggle.com/joniarroba/noshowappointments diff --git a/docs/user_guides/data_labeler.rst b/docs/user_guides/data_labeler.rst new file mode 100644 index 00000000..178abfee --- /dev/null +++ b/docs/user_guides/data_labeler.rst @@ -0,0 +1,51 @@ +.. _data_labeler: + +============ +Data Labeler +============ + +The data labeler is a fundamental component that formulates the prediction task for Machine Learning models by generating and identifying **label times**. On this page, we detail what label times are. + +Label Times +----------- + +After loading the data, you will need to define the prediction task you want to solve (or use one of the already defined tasks). For example, *Missed Appointment* is a common task that aims to predict whether the patient showed up to the appointment or not, helping hospitals to optimize their scheduling policies and resources efficiently. So how do we formulate a prediction problem?
+ +First, you will need to articulate the **label** (outcome) you want to predict. Following the previous example, the *Missed Appointment* task is currently defined as a binary classification task in the system, determining whether a patient showed up to the appointment or not. + +Next, you will need to determine the **time** window over which you define your features. Continuing with the previous example of *Missed Appointments*, I would like to predict whether the patient will show up to the appointment or not from the point of scheduling the appointment. In other words, I would use all the data available up until the time the appointment was scheduled for featurization. + +Lastly, you will determine the entity that contains this piece of information, the **target entity**. Combining this information, we get ``label_times``. + + +Available Prediction Problems +----------------------------- +There are currently six readily available prediction problems for users to explore, described as follows: + +* **Diagnosis prediction**: predicts whether a patient will be diagnosed with a given ICD diagnosis code. +* **Length of Stay (LOS) prediction**: predicts how many days the patient will be in the hospital. +* **Prolonged Length of Stay (PLOS) prediction**: predicts whether a patient stayed in the hospital more or less than a period of time (a week by default). +* **Missed Appointment prediction**: predicts whether the patient will show up to the appointment or not. +* **Mortality prediction**: predicts a patient’s mortality. +* **Readmission prediction**: predicts whether a patient will revisit the hospital within a certain period of time (a month by default). + + +Data Labeling Demo +------------------ + +Continuing from :ref:`data_assembler`, we can now use ``label`` with the desired prediction problem to generate ``label_times``. + +.. code-block:: python + + from cardea.data_labeling import appointment_no_show + + label_times = cardea.label(appointment_no_show) + +.. note:: + you can use ``cardea.list_labelers()`` to view available prediction problems. + + +Creating New Prediction Problems +-------------------------------- + +Coming Soon. diff --git a/docs/user_guides/index.rst b/docs/user_guides/index.rst new file mode 100644 index 00000000..8e468d30 --- /dev/null +++ b/docs/user_guides/index.rst @@ -0,0 +1,16 @@ +.. _user_guides: + +=========== +User Guides +=========== + +In the user guide, we go through some of the main concepts needed to understand how the framework is built and the underlying data structures that make this framework possible. + +.. toctree:: + :maxdepth: 3 + + data_assembler + data_labeler + auto_featurization + auto_ml + schemas diff --git a/docs/user_guides/schemas.rst b/docs/user_guides/schemas.rst new file mode 100644 index 00000000..7ef2046e --- /dev/null +++ b/docs/user_guides/schemas.rst @@ -0,0 +1,28 @@ +.. _schemas: + +======= +Schemas +======= + +Cardea is built on top of schemas that support electronic health records (EHR). On this page, we list the currently supported schemas. + + +Fast Healthcare Interoperability Resources (FHIR) +------------------------------------------------- + +`Fast Healthcare Interoperability Resources (FHIR) `__ is a standard for health care data exchange, published by HL7®. Among the advantages of FHIR over other standards are: + +* Fast and easy to implement +* Specification is free for use with no restrictions +* Strong foundation in Web standards: XML, JSON, HTTP, OAuth, etc.
+* Support for RESTful architectures +* Concise and easily understood specifications +* A human-readable serialization format for ease of use by developers + +MIMIC-III +--------- + +`MIMIC-III `__, a freely accessible critical care database. + +Coming soon. + diff --git a/notebooks/appointment_noshow_tutorial.ipynb b/notebooks/appointment_noshow_tutorial.ipynb index e5ddac68..8b20c92b 100644 --- a/notebooks/appointment_noshow_tutorial.ipynb +++ b/notebooks/appointment_noshow_tutorial.ipynb @@ -20,41 +20,20 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bF4XlPH8UPPO", - "outputId": "aca5874e-a449-4ec5-f111-78a2fc7a29c2" - }, - "outputs": [], - "source": [ - "# if you are running from Google Colab, uncomment the following commands to \n", - "# install cardea.\n", - "\n", - "# ! pip install cardea\n", - "# ! pip install 'urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1'" - ] - }, - { - "cell_type": "code", - "execution_count": 2, "metadata": { "id": "uqYRyFYLVfBK" }, "outputs": [], "source": [ - "# imports \n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.metrics import accuracy_score\n", + "%load_ext autoreload\n", + "%autoreload 2\n", "\n", "from cardea import Cardea" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "id": "rA1hkWm3VkpI" }, @@ -76,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "id": "bN9h70jQVm6V" }, @@ -97,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -112,14 +91,13 @@ "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "100 2983k 100 2983k 0 0 4618k 0 --:--:-- --:--:-- --:--:-- 4611k\n", + "100 2988k 100 2988k 0 0 6345k 0 --:--:-- --:--:-- --:--:-- 6332k\n", "Archive: kaggle.zip\n", - " creating: kaggle/\n", " inflating: kaggle/Patient.csv \n", " inflating: kaggle/Coding.csv \n", " inflating: kaggle/Appointment_Participant.csv \n", " inflating: kaggle/Address.csv \n", - " inflating: kaggle/CodeableConcept.csv \n", + " extracting: kaggle/CodeableConcept.csv \n", " inflating: kaggle/Reference.csv \n", " inflating: kaggle/Observation.csv \n", " inflating: kaggle/Identifier.csv \n", @@ -128,12 +106,12 @@ } ], "source": [ - "! curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip kaggle.zip" + "! 
curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip -d kaggle kaggle.zip" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -165,13 +143,13 @@ " Appointment.participant -> Appointment_Participant.object_id" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cd.load_data_entityset(folder_path='kaggle')\n", + "cd.load_entityset(data_path='kaggle', fhir=True)\n", "\n", "# to view the loaded entityset\n", "cd.es" @@ -198,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -210,21 +188,20 @@ { "data": { "text/plain": [ - "{'DiagnosisPrediction',\n", - " 'LengthOfStay',\n", - " 'MissedAppointmentProblemDefinition',\n", - " 'MortalityPrediction',\n", - " 'ProlongedLengthOfStay',\n", - " 'Readmission'}" + "{'appointment_no_show',\n", + " 'diagnosis_prediction',\n", + " 'length_of_stay',\n", + " 'mortality',\n", + " 'readmission'}" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cd.list_problems()" + "cd.list_labelers()" ] }, { @@ -249,6 +226,13 @@ "outputId": "0281f75b-9e89-4c90-84ae-774415e10d11" }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elapsed: 01:50 | Remaining: 00:00 | Progress: 100%|██████████| identifier: 110527/110527 \n" + ] + }, { "data": { "text/html": [ @@ -270,53 +254,53 @@ " \n", " \n", " \n", - " cutoff_time\n", - " instance_id\n", - " label\n", + " identifier\n", + " time\n", + " missed\n", " \n", " \n", " \n", " \n", - " 5030230\n", - " 2015-11-10 07:13:56\n", + " 0\n", " 5030230\n", - " noshow\n", + " 2015-11-10 07:13:56\n", + " True\n", " \n", " \n", - " 5122866\n", - " 2015-12-03 08:17:28\n", + " 1\n", " 5122866\n", - " fulfilled\n", + " 2015-12-03 08:17:28\n", + " False\n", " \n", " \n", - " 5134197\n", - " 2015-12-07 10:40:59\n", + " 2\n", " 5134197\n", - " fulfilled\n", + " 2015-12-07 10:40:59\n", + " False\n", " \n", " \n", - " 5134220\n", - " 2015-12-07 10:42:42\n", + " 3\n", " 5134220\n", - " noshow\n", + " 2015-12-07 10:42:42\n", + " True\n", " \n", " \n", - " 5134223\n", - " 2015-12-07 10:43:01\n", + " 4\n", " 5134223\n", - " noshow\n", + " 2015-12-07 10:43:01\n", + " True\n", " \n", " \n", "\n", "" ], "text/plain": [ - " cutoff_time instance_id label\n", - "5030230 2015-11-10 07:13:56 5030230 noshow\n", - "5122866 2015-12-03 08:17:28 5122866 fulfilled\n", - "5134197 2015-12-07 10:40:59 5134197 fulfilled\n", - "5134220 2015-12-07 10:42:42 5134220 noshow\n", - "5134223 2015-12-07 10:43:01 5134223 noshow" + " identifier time missed\n", + "0 5030230 2015-11-10 07:13:56 True\n", + "1 5122866 2015-12-03 08:17:28 False\n", + "2 5134197 2015-12-07 10:40:59 False\n", + "3 5134220 2015-12-07 10:42:42 True\n", + "4 5134223 2015-12-07 10:43:01 True" ] }, "execution_count": 8, @@ -326,7 +310,7 @@ ], "source": [ "# select problem\n", - "label_times = cd.select_problem('MissedAppointmentProblemDefinition')\n", + "label_times = cd.create_label_times()\n", "label_times.head(5)" ] }, @@ -347,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -361,8 +345,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Built 13 features\n", - "Elapsed: 00:52 | Remaining: 00:00 | Progress: 100%|██████████| 
Calculated: 10/10 chunks\n" + "Built 14 features\n", + "Elapsed: 00:26 | Progress: 100%|██████████\n" ] }, { @@ -386,392 +370,267 @@ " \n", " \n", " \n", - " participant = 4159901403\n", - " participant = 3856467788\n", - " participant = 3807024061\n", - " participant = 3215247433\n", - " participant = 2872476717\n", - " participant = 2417505282\n", - " participant = 4162690658\n", - " participant = 3562155678\n", - " participant = 3488625302\n", - " participant = 3418939447\n", - " ...\n", - " Appointment_Participant.actor = 27200000000000\n", - " Appointment_Participant.actor = 9740000000000\n", - " Appointment_Participant.actor = 8460000000000\n", - " Appointment_Participant.actor = 923000000000000\n", - " Appointment_Participant.actor = 795000000000000\n", - " Appointment_Participant.actor = 724000000000000\n", - " Appointment_Participant.actor = 659000000000000\n", - " Appointment_Participant.actor is unknown\n", + " status\n", + " participant\n", + " DAY(created)\n", + " DAY(start)\n", + " IS_WEEKEND(created)\n", + " IS_WEEKEND(start)\n", + " MONTH(created)\n", + " MONTH(start)\n", + " WEEKDAY(created)\n", + " WEEKDAY(start)\n", + " YEAR(created)\n", + " YEAR(start)\n", + " Appointment_Participant.actor\n", " Appointment_Participant.COUNT(Appointment)\n", - " label\n", + " missed\n", + " \n", + " \n", + " identifier\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " 5030230\n", + " noshow\n", + " 3353377007\n", + " 10\n", + " 4\n", + " False\n", + " False\n", + " 11\n", + " 5\n", " 1\n", + " 2\n", + " 2015\n", + " 2016\n", + " 832000000000000\n", " 56\n", - " noshow\n", + " True\n", " \n", " \n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 55\n", + " 5122866\n", " fulfilled\n", + " 486500845\n", + " 3\n", + " 2\n", + " False\n", + " False\n", + " 12\n", + " 5\n", + " 3\n", + " 0\n", + " 2015\n", + " 2016\n", + " 91600000000000\n", + " 55\n", + " False\n", " \n", " \n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 33\n", + " 5134197\n", " fulfilled\n", + " 64062658\n", + " 7\n", + " 3\n", + " False\n", + " False\n", + " 12\n", + " 6\n", + " 0\n", + " 4\n", + " 2015\n", + " 2016\n", + " 1220000000000\n", + " 33\n", + " False\n", " \n", " \n", - " 3\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 48\n", + " 5134220\n", " noshow\n", + " 207195819\n", + " 7\n", + " 3\n", + " False\n", + " False\n", + " 12\n", + " 6\n", + " 0\n", + " 4\n", + " 2015\n", + " 2016\n", + " 31900000000000\n", + " 48\n", + " True\n", " \n", " \n", - " 4\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 38\n", + " 5134223\n", " noshow\n", + " 1089855247\n", + " 7\n", + " 3\n", + " False\n", + " 
False\n", + " 12\n", + " 6\n", + " 0\n", + " 4\n", + " 2015\n", + " 2016\n", + " 9580000000000\n", + " 38\n", + " True\n", " \n", " \n", "\n", - "

5 rows × 75 columns

\n", "" ], "text/plain": [ - " participant = 4159901403 participant = 3856467788 \\\n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "\n", - " participant = 3807024061 participant = 3215247433 \\\n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "\n", - " participant = 2872476717 participant = 2417505282 \\\n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "\n", - " participant = 4162690658 participant = 3562155678 \\\n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "\n", - " participant = 3488625302 participant = 3418939447 ... \\\n", - "0 0 0 ... \n", - "1 0 0 ... \n", - "2 0 0 ... \n", - "3 0 0 ... \n", - "4 0 0 ... \n", - "\n", - " Appointment_Participant.actor = 27200000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 9740000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 8460000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 923000000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 795000000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 724000000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", + " status participant DAY(created) DAY(start) \\\n", + "identifier \n", + "5030230 noshow 3353377007 10 4 \n", + "5122866 fulfilled 486500845 3 2 \n", + "5134197 fulfilled 64062658 7 3 \n", + "5134220 noshow 207195819 7 3 \n", + "5134223 noshow 1089855247 7 3 \n", "\n", - " Appointment_Participant.actor = 659000000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", + " IS_WEEKEND(created) IS_WEEKEND(start) MONTH(created) \\\n", + "identifier \n", + "5030230 False False 11 \n", + "5122866 False False 12 \n", + "5134197 False False 12 \n", + "5134220 False False 12 \n", + "5134223 False False 12 \n", "\n", - " Appointment_Participant.actor is unknown \\\n", - "0 1 \n", - "1 1 \n", - "2 1 \n", - "3 1 \n", - "4 1 \n", + " MONTH(start) WEEKDAY(created) WEEKDAY(start) YEAR(created) \\\n", + "identifier \n", + "5030230 5 1 2 2015 \n", + "5122866 5 3 0 2015 \n", + "5134197 6 0 4 2015 \n", + "5134220 6 0 4 2015 \n", + "5134223 6 0 4 2015 \n", "\n", - " Appointment_Participant.COUNT(Appointment) label \n", - "0 56 noshow \n", - "1 55 fulfilled \n", - "2 33 fulfilled \n", - "3 48 noshow \n", - "4 38 noshow \n", + " YEAR(start) Appointment_Participant.actor \\\n", + "identifier \n", + "5030230 2016 832000000000000 \n", + "5122866 2016 91600000000000 \n", + "5134197 2016 1220000000000 \n", + "5134220 2016 31900000000000 \n", + "5134223 2016 9580000000000 \n", "\n", - "[5 rows x 75 columns]" + " Appointment_Participant.COUNT(Appointment) missed \n", + "identifier \n", + "5030230 56 True \n", + "5122866 55 False \n", + "5134197 33 False \n", + "5134220 48 True \n", + "5134223 38 True " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# feature engineering\n", - "feature_matrix = cd.generate_features(label_times[:1000]) # takes a while for the full dataset\n", + "feature_matrix = cd.generate_features(label_times[:1000], verbose=True) # takes a while for the full dataset\n", "feature_matrix.head(5)" ] }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "Once we have the features, we can now split the data into training and testing" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": { "id": "xGmr2hXEWw9T" }, "outputs": [], "source": [ - "# shuffle the dataframe\n", - "feature_matrix = feature_matrix.sample(frac=1)\n", - "\n", "# pop the target labels\n", - "y = list(feature_matrix.pop('label'))\n", - "X = feature_matrix.values" + "y = feature_matrix.pop('missed').values\n", + "X = feature_matrix.values\n", + "\n", + "X_train, X_test, y_train, y_test = cd.train_test_split(\n", + " X, y, test_size=0.2, shuffle=True)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Kr5EsvQUW4Yo" - }, + "metadata": {}, "source": [ - "The pipeline variable represents the order in which machine learning algorithms are executed. It can be used to compare models together by specifying multiple algorithms in different lists. Such as:\n", - "\n", - "```\n", - "pipeline = [['sklearn.ensemble.RandomForestClassifier'], \n", - " ['sklearn.neighbors.KNeighborsClassifier']]\n", - "```\n", - "\n", - "Here we execute two different models, the first one being Random Forest and the second is K-Nearest Neighbor (KNN). In addition, you can use the pipeline to create your own encoding and modeling pipeline where the data crosses several algorithms to create the prediction model. For example, I can use a sequence of primitives that allow me to (1) normalize my data (2) use Random Forest. This can be modeled as:\n", - "```\n", - "pipeline = [['sklearn.preprocessing.StandardScaler', 'sklearn.ensemble.RandomForestClassifier'], \n", - " ['sklearn.neighbors.KNeighborsClassifier']]\n", - "```\n", - "Here there are two different models, the first one composes of two primitives (preprocessing through normalization then applying Random Forest) and the second is basic KNN. 
More on machine learning algorithms and MLPrimitives can be found here: https://HDI-Project.github.io/MLPrimitives" + "Now that we have our feature matrix properly divided, we can use to train our machine learning pipeline, Modeling, optimizing hyperparameters and finding the most optimal model" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JNvYIt-dXb7G", - "outputId": "b3081349-c2ac-4334-a02a-61f942f2bfd6" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], + "execution_count": 15, + "metadata": {}, + "outputs": [], "source": [ - "# modeling\n", - "pipeline = [['sklearn.ensemble.RandomForestClassifier'], \n", - " ['sklearn.naive_bayes.MultinomialNB'], \n", - " ['sklearn.neighbors.KNeighborsClassifier']]\n", - "\n", - "exe = cd.execute_model(feature_matrix=X,\n", - " target=y, \n", - " primitives=pipeline)" + "cd.set_pipeline('Random Forest')\n", + "cd.fit(X_train, y_train)\n", + "y_pred = cd.predict(X_test)" ] }, { "cell_type": "markdown", "metadata": { - "id": "Or-EtJ3_XguM" + "id": "Kr5EsvQUW4Yo" }, "source": [ - "## Visualize Results\n", - "\n", - "After executing the pipelines, the method returns a list composing of each pipeline with each fold representing three main results:\n", - "\n", - "* The list of primitives used.\n", - "* The actual label vector.\n", - "* The predicted label vector.\n", - "* The tuned hyperparameters (if given).\n", - "\n", - "In order to perceive the results and look at the performance of each pipeline we can view it's training process by merely plotting the confusion matrix." + "Finally, you can evaluate the performance of the model" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 281 + "base_uri": "https://localhost:8080/" }, - "id": "WXNtz9VvXxcm", - "outputId": "0711adcf-ca8d-4269-8803-f16497367798" + "id": "JNvYIt-dXb7G", + "outputId": "b3081349-c2ac-4334-a02a-61f942f2bfd6" }, "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAARUAAAEICAYAAABxpmCnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAYMElEQVR4nO3debCcVZ3G8e8jgUQRyYaAQAIosokEyICyBkGWqIQSFBAhWFBxAa0RNxgsYAI4gVlQawSMGAkgi6JolCCGJW4YNCohspiEoEAIEJIQQCAQ+M0f51znTdN915Pu29fnU9V1u99z3rd/7+3bT79b36OIwMyslNe1ugAzG1gcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUekDS6yX9RNIqSd/vw3KOl/TzkrW1gqSbJU3s5bybSHpA0utL12WNSRqcf++brKvnGJChIukjkuZKek7S0vzHv2+BRR8NbAqMiIgP9XYhEfHdiDikQD1rkTROUki6sWb6rnn67G4u51xJV3fVLyIOj4jpvSz3DOCKiHihl/MPOJI+K+lxSc9ImiZpcCd93yDpEklP5Q+5X1baDpR0R57+1+p8EbEamEb6/a8TAy5UJJ0OfBX4CikARgGXABMKLH40sCAi1hRY1rqyDHi3pBGVaROBBaWeQEmv/3bym2Ui0GVwNYuk9Vr8/IeS3ugHkf7OtgX+vZNZpgLDgR3zz89W2v5OCo4vNJj3GmBiZ6HVJxExYG7AxsBzwIc66TOYFDqP5dtXgcG5bRzwKPA54ElgKfCx3PbvwEvAy/k5TgbOBa6uLHtrIIBB+fFJwGLgWeAh4PjK9F9X5tsb+D2wKv/cu9I2GzgP+E1ezs+BkQ3WraP+y4BT87T1gCXA2cDsSt+vAY8AzwB/APbL0w+rWc95lTouyHW8ALwtTzslt18K/KCy/AuB2wDVqXN/YFHNtI8B9+d1XAx8vKZ9AnB3rvdB4LA8fTjwnfxargR+VO93nKcF8LZ8/4pc80zSm/Bg4H3An/JzPAKcWzP/vsCdwNO5/STgX4AngPUq/T7Y8Xvrwd/uNcBXKo8PAh5v0HeHXOObuljmwcBfG7QtBA5YJ+/DZr3hm3HLb4g15Dd1gz6TgTnAm4FN8h/JebltXJ5/MrA+MB54HhiW289l7RCpfbx1/sMdBGyYX/jtc9vmwM61f/D5TbESOCHPd1x+PCK3z85vorcDr8+PpzRYt3GkUNkbuCtPGw/cApzC2qHyUWBEfs7PAY8DQ+qtV6WOh4Gd8zzrs3aovIG0NXQSsB/wFLBlgzpPBW6qmfY+4K2AgAPy73333LYnKXDfS9q63gLYIbfdBFwPDMs1HVD7O648R22orAL2ycsckn9/u+TH7ySFxZG5/2hS4B2Xn2cEMCa33QccXnmeG4HP5fsfIYVQo9uo3G8ecExlGSNzvSPq/P5OBOYDF+ff83zgqDr9OguVGcBn1sX7cKDt/owAnorOd0+OByZHxJMRsYy0BXJCpf3l3P5yRMwkfVpv38t6XgXeIen1EbE0Iu6t0+d9wMKIuCoi1kTEtcADwAcqfb4TEQsiHX/4HjCmsyeNiDuB4ZK2J/0BXlmnz9URsTw/53+TtuC6Ws8rIuLePM/LNct7nvR7/B/Sbs2nI+LRBssZSnqDVue/KSIejOQXpC2y/XLzycC0iJgVEa9GxJKIeEDS5sDhwCciYmV+zX7RxTpU/TgifpOX+WJEzI6I+fnxPcC1pICDFA63RsS1+XmWR8TduW06KaSRNBw4lLTlQURcExFDO7k9nJfxRlLIdei4v1GdurcE3pH7vAU4DZguaccerPuzpNehuIEWKsuBkZIGddLnLcDfKo//lqf9Yxk1ofQ86QXvkYj4O3AM8AlgqaSbJO3QjXo6atqi8vjxXtRzFemP7UDSJ+daJH1e0v35YN7TpF3HkV0s85HOGiPiLtKui0jh18hKat4skg6XNEfSilzP+Eo9W5G21mptBayIiJVd1N3IWusjaa98gHOZpFWk166rGiCF6AckbQh8GPhVRCztYS3PAW+qPO64/2ydvi+QPvzOj4iXcpDeAfTk4P9GpC2l4gZaqPwWWA0c2Umfx0ibsh1G5Wm98XfSZn+HzaqNEXFLRLyXtOvzAPCtbtTTUdOSXtbU4SrgU8DMvBXxD5L2A75IegMMi4ihpE89dZTeYJmdfqVd0qmkLZ7H8vIbuYe0O9cx32DgB8B/AZvmemZW6nmEtGtU6xHSFtnQOm1rvTaSNqvTp3Z9riHtFmwVERuTjk11VQMRsYT0t/dB0tbaVZXnPT6fhWx0G5W73gvsWlnsrsATEbG8zlPe04116cqOpF2u4gZUqETEKtIByW9IOjKfdls/fwpelLtdC3w5XycxMvfv7VmIu4H9JY2StDFwZkeDpE0lTcifXqtJn0Sv1lnGTODt+TT4IEnHADsBP+1lTQBExEOkTfez6jRvRDp2tAwYJOls1v6UfALYuidneCS9HTiftBtwAvBFSWMadP8dMFRSx9bYBqQwWgaskXQ4a3/qfhv4mKSDJL1O0haSdshbAzcDl0gall/r/fM884CdJY2RNIR0nKgrG5G2fF6UtCdpl6fDd4GDJX04v04jatbvSlKQ7gL8sGNipMsH3tjJ7eHK/CdL2imH5JdJx33q+SXp+NaZuZZ9SFuktwDk39EQ0rEfSRoiaYOOmfPvfTjp2GJxAypUAPLxgdNJL8oy0ifMacCPcpfzgbmktJ8P/DFP681zzSIdJLyHdAalGgSvy3U8BqwgvcE/WWcZy4H3kw6WLif9Yb4/Ip7qTU01y/51RNTbCrsF+BnpwOrfgBdZe1eg48K+5ZL+2NXz5N3Nq4ELI2JeRCwE/g24qt5py4h4ifSG+Wh+/CzwGdIu00rSm3lGpf/vSGeHLiZtUf2C/9+6O4G0K/AA6Yzdv+Z5FpAOuN9KOtPx667Wg7RlN1nSs6QPm3/swuU3/3jS67SC9IFS3bK4Mdd0Y+2WYXdExM+Ai0i7MQ+TXpdzOtol3Svp+Nz3ZdLZsPGk38e3gBMj4oHcfX/SLtJM0lbvC6RjVB0+AkyPdM1KccpHgs2aSumKzl8Bu8UAuQBO0oOkU+G3trqWRnLIzwP2j4gn18lzOFTM+k7SUaRrc94eEfV2c/9p9Gn3R9JwSbMkLcw/hzXo94qku/NtRmX6NpLukrRI0vXV/T6zdqH09YdLSRcc/lMHCvRxSyUf/FwREVMknUE6k/ClOv2ei4jXnAaV9D3ghxFxnaTLSFchXtrrgsys5foaKn8BxkXE0nwh0uyIeM0FVPVCRZJIB1I3i4g1kt5Nuiz60F4XZGYt19lFYt2xaeUin8dJX+CrZ4ikuaTTmFMi4kekq1+frlxo9ihrX/C1FkmTgEkAG2644R7bb1/vOjLrr5Y+u05ONNg68vQTS3h+1Qp13fO1ugwVSbdSc1FXttb1DxERkhpt9oyOiCWStgVulzSftS9J7lJETCV9M5M99hgbv7lrbk9mtxa78PaFrS7BeuCbp32w1/N2GSoRcXCjNklPSNq8svtT9xRVvuKQiF
icD2rtRrqCcqikQXlrZUv6fhWpmbVYXy9+m0H6vxjknz+u7ZCvdByc748kfSv0vkgHc+4g/eOjhvObWXvpa6hMAd4raSHpa9ZTACSNlXR57rMjMFfSPFKITImI+3Lbl4DTJS0iHWP5dh/rMbMW69OB2nyJ+UF1ps8l/f+Ojq/h79Jg/sWk/5VhZgPEgPvuj5m1lkPFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFeVQMbOiHCpmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMilrnw55KGiPpt3nU+nskHVNpu0LSQ5UhUcf0pR4za72+bqmcAdwWEdsBt+XHtZ4HToyInYHDgK9KGlpp/0JEjMm3u/tYj5m1WF9DZQIwPd+fDhxZ2yEiFkTEwnz/MdLYQJv08XnNrJ/qa6h0d9hTACTtCWwAPFiZfEHeLbq4Y3wgM2tfzRr2lDyC4VXAxIh4NU8+kxRGG5CGNP0SMLnB/P8YS3mrUaO6KtvMWqQpw55KehNwE3BWRMypLLtjK2e1pO8An++kjrXGUu6qbjNrjWYMe7oBcCNwZUTcUNO2ef4p0vGYP/exHjNrsWYMe/phYH/gpDqnjr8raT4wHxgJnN/HesysxZox7OnVwNUN5n9PX57fzPofX1FrZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFVUkVCQdJukvkhZJes3Qp5IGS7o+t98laetK25l5+l8kHVqiHjNrnT6HiqT1gG8AhwM7AcdJ2qmm28nAyoh4G3AxcGGedyfgWKBjnOVL8vLMrE2V2FLZE1gUEYsj4iXgOtIYy1XVMZdvAA7KY/1MAK6LiNUR8RCwKC/PzNpUiVDZAnik8vjRPK1un4hYA6wCRnRzXiANeypprqS5y55aVqBsM1sX2uZAbURMjYixETF2k5GbtLocM2ugRKgsAbaqPN4yT6vbR9IgYGNgeTfnNbM2UiJUfg9sJ2mbPG7ysaQxlquqYy4fDdweEZGnH5vPDm0DbAf8rkBNZtYifRr2FNIxEkmnAbcA6wHTIuJeSZOBuRExA/g2cJWkRcAKUvCQ+30PuA9YA5waEa/0tSYza50+hwpARMwEZtZMO7ty/0XgQw3mvQC4oEQdZtZ6bXOg1szag0PFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFeVQMbOiHCpmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMimrWsKenS7pP0j2SbpM0utL2iqS78632H2abWZvp8/+orQx7+l7SYGC/lzQjIu6rdPsTMDYinpf0SeAi4Jjc9kJEjOlrHWbWPzRl2NOIuCMins8P55DG9zGzAahZw55WnQzcXHk8JA9nOkfSkY1m8rCnZu2hyBAd3SXpo8BY4IDK5NERsUTStsDtkuZHxIO180bEVGAqwB57jI2mFGxmPdasYU+RdDBwFnBERKzumB4RS/LPxcBsYLcCNZlZizRl2FNJuwHfJAXKk5XpwyQNzvdHAvuQRis0szbVrGFP/xN4I/B9SQAPR8QRwI7ANyW9Sgq4KTVnjcyszTRr2NODG8x3J7BLiRrMrH/wFbVmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWlEPFzIpq1rCnJ0laVhne9JRK20RJC/NtYol6zKx1mjXsKcD1EXFazbzDgXNIYwEF8Ic878q+1mVmrdGUYU87cSgwKyJW5CCZBRxWoCYza5ES/02/3rCne9Xpd5Sk/YEFwGcj4pEG89YdMlXSJGASwBZbbsWyZ1bX62b91JQvfa3VJVgPrH70iV7P26wDtT8Bto6Id5K2Rqb3dAERMTUixkbE2OEjNileoJmV0ZRhTyNieWWo08uBPbo7r5m1l2YNe7p55eERwP35/i3AIXn402HAIXmambWpZg17+hlJRwBrgBXASXneFZLOIwUTwOSIWNHXmsysdZo17OmZwJkN5p0GTCtRh5m1nq+oNbOiHCpmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWVLOGPb24MuTpAklPV9peqbTNqJ3XzNpLU4Y9jYjPVvp/GtitsogXImJMX+sws/6hFcOeHgdcW+B5zawfKhEqPRm6dDSwDXB7ZfIQSXMlzZF0ZKMnkTQp95u7YvmyAmWb2brQ7AO1xwI3RMQrlWmjI2Is8BHgq5LeWm9GD3tq1h6aMuxpxbHU7PpExJL8czEwm7WPt5hZm2nKsKcAknYAhgG/rUwbJmlwvj8S2Ae4r3ZeM2sfzRr2FFLYXBcRUZl9R+Cbkl4lBdyU6lkjM2s/TRn2ND8+t858dwK7lKjBzPoHX1FrZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUzKyoUsOeTpP0pKQ/N2iXpK/nYVHvkbR7pW2ipIX5NrFEPWbWOqW2VK4ADuuk/XBgu3ybBFwKIGk4cA6wF2mkw3MkDStUk5m1QJFQiYhfAis66TIBuDKSOcBQSZsDhwKzImJFRKwEZtF5OJlZP9esYyqNhkbtyZCpHvbUrA20zYFaD3tq1h6aFSqNhkbtyZCpZtYGmhUqM4AT81mgdwGrImIpaVTDQ/Lwp8OAQ/I0M2tTRUYolHQtMA4YKelR0hmd9QEi4jLS6IXjgUXA88DHctsKSeeRxmMGmBwRnR3wNbN+rtSwp8d10R7AqQ3apgHTStRhZq3XNgdqzaw9OFTMrCiHipkV5VAxs6IcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFeVQMbOiHCpmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysqGYNe3p8Hu50vqQ7Je1aaftrnn63pLkl6jGz1mnWsKcPAQdExC7AecDUmvYDI2JMRIwtVI+ZtUipf3z9S0lbd9J+Z+XhHNL4PmY2ALXimMrJwM2VxwH8XNIfJE1qQT1mVlCRLZXuknQgKVT2rUzeNyKWSHozMEvSA3nA99p5JwGTALbYcqvaZjPrJ5q2pSLpncDlwISIWN4xPSKW5J9PAjcCe9ab32Mpm7WHpoSKpFHAD4ETImJBZfqGkjbquE8a9rTuGSQzaw/NGvb0bGAEcIkkgDX5TM+mwI152iDgmoj4WYmazKw1mjXs6SnAKXWmLwZ2fe0cZtaufEWtmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFeVQM
bOiHCpmVlSzxlIeJ2lVHi/5bklnV9oOk/QXSYsknVGiHjNrnWaNpQzwqzxe8piImAwgaT3gG8DhwE7AcZJ2KlSTmbVAkVDJIwqu6MWsewKLImJxRLwEXAdMKFGTmbVGM4c9fbekecBjwOcj4l5gC+CRSp9Hgb3qzVwd9hRYPWrEkIE46NhI4KlWF7GODNR1G6jrtX1vZ2xWqPwRGB0Rz0kaD/wI2K4nC4iIqcBUAElz82BkA8pAXS8YuOs2kNert/M25exPRDwTEc/l+zOB9SWNBJYA1dHWt8zTzKxNNWss5c2UxzaVtGd+3uXA74HtJG0jaQPgWGBGM2oys3WjWWMpHw18UtIa4AXg2IgIYI2k04BbgPWAaflYS1emlqi7Hxqo6wUDd928XjWU3ttmZmX4ilozK8qhYmZFtUWoSBouaZakhfnnsAb9Xql8FaDfHvDt6qsJkgZLuj633yVp6xaU2WPdWK+TJC2rvEantKLOnurG11Ak6et5ve+RtHuza+yNvny9plMR0e9vwEXAGfn+GcCFDfo91+pau7Eu6wEPAtsCGwDzgJ1q+nwKuCzfPxa4vtV1F1qvk4D/bXWtvVi3/YHdgT83aB8P3AwIeBdwV6trLrRe44Cf9nS5bbGlQrp0f3q+Px04snWl9Fl3vppQXd8bgIM6Tsn3YwP2KxfR9ddQJgBXRjIHGCpp8+ZU13vdWK9eaZdQ2TQilub7jwObNug3RNJcSXMkHdmc0nqs3lcTtmjUJyLWAKuAEU2prve6s14AR+VdhBskbVWnvR11d93b0bslzZN0s6SduzNDM7/70ylJtwKb1Wk6q/ogIkJSo/PgoyNiiaRtgdslzY+IB0vXar32E+DaiFgt6eOkrbH3tLgma6xXX6/pN6ESEQc3apP0hKTNI2Jp3qx8ssEyluSfiyXNBnYj7ef3J935akJHn0clDQI2Jl2B3J91uV4RUV2Hy0nHygaCAfl1k4h4pnJ/pqRLJI2MiE6/QNkuuz8zgIn5/kTgx7UdJA2TNDjfHwnsA9zXtAq7rztfTaiu79HA7ZGPnPVjXa5XzXGGI4D7m1jfujQDODGfBXoXsKqyu962Ovl6TedafQS6m0epRwC3AQuBW4HhefpY4PJ8f29gPumsw3zg5FbX3cn6jAcWkLaizsrTJgNH5PtDgO8Di4DfAdu2uuZC6/UfwL35NboD2KHVNXdzva4FlgIvk46XnAx8AvhEbhfpn409mP/2xra65kLrdVrl9ZoD7N2d5foyfTMrql12f8ysTThUzKwoh4qZFeVQMbOiHCpmVpRDxcyKcqiYWVH/BxC2WzKmvdkuAAAAAElFTkSuQmCC\n", "text/plain": [ - "
" + "Accuracy 1.0\n", + "F1 Macro 1.0\n", + "Precision 1.0\n", + "Recall 1.0\n", + "dtype: float64" ] }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "y_test = []\n", - "y_pred = []\n", - "for i in range(0, 10):\n", - " y_test.extend(exe['pipeline0']['folds'][str(i)]['Actual'])\n", - " y_pred.extend(exe['pipeline0']['folds'][str(i)]['predicted'])\n", - "\n", - "y_test = pd.Categorical(pd.Series(y_test)).codes\n", - "y_pred = pd.Categorical(pd.Series(y_pred)).codes\n", - "\n", - "plt.title(\"Confusion Matrix (accuracy=%.2f)\" % accuracy_score(y_test, y_pred))\n", - "plt.imshow(pd.crosstab(y_test, y_pred), cmap=\"Blues\")\n", - "\n", - "plt.show()" + "cd.evaluate(X, y, fit=True, test_size=0.2, shuffle=True)" ] } ], @@ -797,7 +656,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.7.10" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index 87d1b912..7e759ffa 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,9 @@ 'baytune>=0.4.0,<0.5', 'pyCLI==2.0.3', 'scikit-learn>=0.21,<0.22', - 'featuretools>=0.20.0,<0.25' + 'featuretools>=0.20.0,<0.25', + 'composeml', + 'jedi==0.17.2' ] setup_requires = [ @@ -68,9 +70,9 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', ], description="Cardea", entry_points={ diff --git a/tests/cardea/problem_definition/__init__.py b/tests/cardea/problem_definition/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/cardea/problem_definition/test_definition.py b/tests/cardea/problem_definition/test_definition.py deleted file mode 100644 index 05914672..00000000 --- a/tests/cardea/problem_definition/test_definition.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition import ProblemDefinition - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def problem_definition(): - return ProblemDefinition() - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]},) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['1/1/2000 20:00', '2/1/2000 5:00', '3/1/2000 22:00'], - "end": ['1/2/2000 21:10', '2/2/2000 18:00', '3/3/2000 20:00']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - return [encounter, period, patient] - - -@pytest.fixture() -def entityset(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - 
es_loader.create_relationships(relationships, entity_set=es) - - return es - - -def test_check_target_label_true(entityset, problem_definition): - assert problem_definition.check_target_label(entityset, 'Patient', 'gender') is True - - -def test_check_target_label_false(entityset, problem_definition): - - assert problem_definition.check_target_label( - entityset, 'Encounter', 'class') is False - - -def test_check_target_label_values_true(entityset, problem_definition): - - assert problem_definition.check_for_missing_values_in_target_label( - entityset, 'Patient', 'active') is True - - -def test_check_target_label_values_false(entityset, problem_definition): - assert problem_definition.check_for_missing_values_in_target_label( - entityset, 'Patient', 'gender') is False - - -def test_check_target_label_values_error(entityset, problem_definition): - assert problem_definition.check_for_missing_values_in_target_label( - entityset, 'Encounter', 'class') is False diff --git a/tests/cardea/problem_definition/test_length_of_stay.py b/tests/cardea/problem_definition/test_length_of_stay.py deleted file mode 100644 index bd23ed90..00000000 --- a/tests/cardea/problem_definition/test_length_of_stay.py +++ /dev/null @@ -1,306 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition import LengthOfStay - - -@pytest.fixture() -def length_of_stay(): - return LengthOfStay() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['9/19/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "label": [2, 1, 7] - }) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - duration_df = pd.DataFrame({"object_id": [2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return 
[encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_label(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018', '9/19/2018', '9/20/2018']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, patient] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', nan]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def objects_missing_cutoff_label(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - duration_df = pd.DataFrame({"object_id": [2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - 
es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_label(objects_missing_generation_label, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_label) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_label) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_label, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_error_missing_cutoff_label(objects_missing_cutoff_label, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_cutoff_label) - - fhir_dict = es_loader.get_dataframes(objects_missing_cutoff_label) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_cutoff_label, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, length_of_stay, cutoff_times): - _, _, generated_df = length_of_stay.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_target_label(entityset_fail, length_of_stay, cutoff_times): - _, _, generated_df = length_of_stay.generate_cutoff_times( - entityset_fail) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_generation_label( - entityset_fail_missing_generation_label, length_of_stay): - 
with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times( - entityset_fail_missing_generation_label) - - -def test_generate_cutoff_times_with_missing_cutoff_label( - entityset_error_missing_cutoff_label, length_of_stay): - entityset_error_missing_cutoff_label['Period'].delete_variables(['start']) - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times( - entityset_error_missing_cutoff_label) - - -def test_generate_label_with_missing_values( - entityset_fail_missing_generation_value, length_of_stay): - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times(entityset_fail_missing_generation_value) diff --git a/tests/cardea/problem_definition/test_mortality_prediction.py b/tests/cardea/problem_definition/test_mortality_prediction.py deleted file mode 100644 index bb33eaeb..00000000 --- a/tests/cardea/problem_definition/test_mortality_prediction.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition.mortality_prediction import MortalityPrediction - - -@pytest.fixture() -def mortality_prediction(): - return MortalityPrediction() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "label": [True, False, True]}) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7], - "diagnosis": [1, 2, 3]}) - - encounter_diagnosis_df = pd.DataFrame({"object_id": [1, 2, 3], - "condition": [10, 11, 12]}) - - condition_df = pd.DataFrame({"identifier": [10, 11, 12], - "code": [1, 2, 3], - "subject": [10, 11, 12]}) - - cc_df = pd.DataFrame({"object_id": [1, 2, 3], - "coding": [100, 111, 112], - "subject": [10, 11, 12]}) - - coding_df = pd.DataFrame({"object_id": [100, 111, 112], - "code": ["X60", "C12", "V02"]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "end": ['9/22/2018 00:10', '9/21/2018 00:10', '10/4/2018 00:10']}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - encounter_diagnosis = es_loader.create_object(encounter_diagnosis_df, 'Encounter_Diagnosis') - condition = es_loader.create_object(condition_df, 'Condition') - cc = es_loader.create_object(cc_df, 'CodeableConcept') - coding = es_loader.create_object(coding_df, 'Coding') - - objects = [encounter, period, patient, duration, encounter_diagnosis, condition, cc, coding] - return objects - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": 
['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "end": ['9/22/2018 00:10', '9/21/2018 00:10', '10/4/2018 00:10']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12, 13, 14, 15], - "subject": [0, 1, 2, 0, 0, 0], - "length": [2, 1, 7, 0, 0, 0]}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, patient, duration] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7], - "diagnosis": [1, 2, 3]}) - - encounter_diagnosis_df = pd.DataFrame({"object_id": [1, 2, 3], - "condition": [10, 11, 12]}) - - condition_df = pd.DataFrame({"identifier": [10, 11, 12], - "code": [1, 2, 3], - "subject": [10, 11, 12]}) - - cc_df = pd.DataFrame({"object_id": [1, 2, 3], - "coding": [100, 111, 112], - "subject": [10, 11, 12]}) - - coding_df = pd.DataFrame({"object_id": [100, 111, 112], - "code": ["Z10", "C12", "A10"]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/20/2018 21:10', '9/20/2018 18:00', '9/27/2018 20:00'], - "end": ['9/22/2018 20:00', '9/21/2018 5:00', '10/4/2018 22:00'] - }) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - encounter_diagnosis = es_loader.create_object(encounter_diagnosis_df, 'Encounter_Diagnosis') - condition = es_loader.create_object(condition_df, 'Condition') - cc = es_loader.create_object(cc_df, 'CodeableConcept') - coding = es_loader.create_object(coding_df, 'Coding') - - objects = [encounter, period, patient, duration, encounter_diagnosis, condition, cc, coding] - return objects - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - 
es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, mortality_prediction, cutoff_times): - _, _, generated_df = mortality_prediction.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - generated_df['label'] = generated_df['label'].astype(bool) # same data type - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_generation_label(entityset_success, mortality_prediction): - entityset_success['Period'].delete_variables(['start']) - with pytest.raises(ValueError): - mortality_prediction.generate_cutoff_times( - entityset_success) - - -def test_generate_label_with_missing_label(entityset_success, mortality_prediction): - entityset_success['Encounter'].delete_variables(['diagnosis']) - with pytest.raises(ValueError): - mortality_prediction.generate_cutoff_times(entityset_success) - - -def test_generate_label_with_missing_values( - entityset_fail_missing_generation_value, mortality_prediction): - es_fail = entityset_fail_missing_generation_value - temp = es_fail['Encounter'].df - temp['diagnosis'] = [nan, nan, nan] - es = es_fail.entity_from_dataframe(entity_id='Encounter', - dataframe=temp, - index='identifier') - with pytest.raises(ValueError): - mortality_prediction.generate_cutoff_times(es) diff --git a/tests/cardea/problem_definition/test_predicting_diagnosis.py b/tests/cardea/problem_definition/test_predicting_diagnosis.py deleted file mode 100644 index f4fac436..00000000 --- a/tests/cardea/problem_definition/test_predicting_diagnosis.py +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition.predicting_diagnosis import DiagnosisPrediction - 
- -@pytest.fixture() -def diagnosis_prediction(): - return DiagnosisPrediction("Z10") - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "label": [True, False, False]}) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7], - "diagnosis": [1, 2, 3]}) - - encounter_diagnosis_df = pd.DataFrame({"object_id": [1, 2, 3], - "condition": [10, 11, 12]}) - - condition_df = pd.DataFrame({"identifier": [10, 11, 12], - "code": [1, 2, 3], - "subject": [10, 11, 12]}) - - cc_df = pd.DataFrame({"object_id": [1, 2, 3], - "coding": [100, 111, 112], - "subject": [10, 11, 12]}) - - coding_df = pd.DataFrame({"object_id": [100, 111, 112], - "code": ["Z10", "C12", "A10"]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "end": ['9/22/2018 00:10', '9/21/2018 00:10', '10/4/2018 00:10']}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - encounter_diagnosis = es_loader.create_object(encounter_diagnosis_df, 'Encounter_Diagnosis') - condition = es_loader.create_object(condition_df, 'Condition') - cc = es_loader.create_object(cc_df, 'CodeableConcept') - coding = es_loader.create_object(coding_df, 'Coding') - - objects = [encounter, period, patient, duration, encounter_diagnosis, condition, cc, coding] - return objects - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:00'], - "end": ['9/20/2018 00:00', '9/20/2018 00:10', '9/27/2018 00:10']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12, 13, 14, 15], - "subject": [0, 1, 2, 0, 0, 0], - "length": [2, 1, 7, 0, 0, 0]}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - 
return [encounter, patient, duration] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7], - "diagnosis": [1, 2, 3]}) - - encounter_diagnosis_df = pd.DataFrame({"object_id": [1, 2, 3], - "condition": [10, 11, 12]}) - - condition_df = pd.DataFrame({"identifier": [10, 11, 12], - "code": [1, 2, 3], - "subject": [10, 11, 12]}) - - cc_df = pd.DataFrame({"object_id": [1, 2, 3], - "coding": [100, 111, 112], - "subject": [10, 11, 12]}) - - coding_df = pd.DataFrame({"object_id": [100, 111, 112], - "code": ["Z10", "C12", "A10"]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/20/2018 21:10', '9/20/2018 18:00', '9/27/2018 20:00'], - "end": ['9/22/2018 20:00', '9/21/2018 5:00', '10/4/2018 22:00'] - }) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - encounter_diagnosis = es_loader.create_object(encounter_diagnosis_df, 'Encounter_Diagnosis') - condition = es_loader.create_object(condition_df, 'Condition') - cc = es_loader.create_object(cc_df, 'CodeableConcept') - coding = es_loader.create_object(coding_df, 'Coding') - - objects = [encounter, period, patient, duration, encounter_diagnosis, condition, cc, coding] - return objects - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, 
es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, diagnosis_prediction, cutoff_times): - _, _, generated_df = diagnosis_prediction.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_generation_label(entityset_success, diagnosis_prediction): - entityset_success['Period'].delete_variables(['start']) - with pytest.raises(ValueError): - diagnosis_prediction.generate_cutoff_times( - entityset_success) - - -def test_generate_label_with_missing_label(entityset_success, diagnosis_prediction): - entityset_success['Encounter'].delete_variables(['diagnosis']) - with pytest.raises(ValueError): - diagnosis_prediction.generate_cutoff_times(entityset_success) - - -def test_generate_label_with_missing_values( - entityset_fail_missing_generation_value, diagnosis_prediction): - es_fail = entityset_fail_missing_generation_value - temp = es_fail['Encounter'].df - temp['diagnosis'] = [nan, nan, nan] - es = es_fail.entity_from_dataframe(entity_id='Encounter', - dataframe=temp, - index='identifier') - with pytest.raises(ValueError): - diagnosis_prediction.generate_cutoff_times(es) diff --git a/tests/cardea/problem_definition/test_prolonged_length_of_stay.py b/tests/cardea/problem_definition/test_prolonged_length_of_stay.py deleted file mode 100644 index c2f418a2..00000000 --- a/tests/cardea/problem_definition/test_prolonged_length_of_stay.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition import ProlongedLengthOfStay - - -@pytest.fixture() -def length_of_stay(): - return ProlongedLengthOfStay() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['9/19/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "label": [0, 0, 1] - }) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - duration_df = pd.DataFrame({"object_id": [2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, 
patient, duration] - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_label(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018', '9/19/2018', '9/20/2018']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, patient] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', nan]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def objects_missing_cutoff_label(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - duration_df = pd.DataFrame({"object_id": [2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = 
es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_label(objects_missing_generation_label, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_label) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_label) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_label, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_error_missing_cutoff_label(objects_missing_cutoff_label, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_cutoff_label) - - fhir_dict = es_loader.get_dataframes(objects_missing_cutoff_label) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_cutoff_label, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, length_of_stay, cutoff_times): - _, _, 
generated_df = length_of_stay.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_target_label(entityset_fail, length_of_stay, cutoff_times): - _, _, generated_df = length_of_stay.generate_cutoff_times( - entityset_fail) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_generation_label( - entityset_fail_missing_generation_label, length_of_stay): - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times( - entityset_fail_missing_generation_label) - - -def test_generate_cutoff_times_with_missing_cutoff_label( - entityset_error_missing_cutoff_label, length_of_stay): - entityset_error_missing_cutoff_label['Period'].delete_variables(['start']) - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times( - entityset_error_missing_cutoff_label) - - -def test_generate_label_with_missing_values( - entityset_fail_missing_generation_value, length_of_stay): - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times(entityset_fail_missing_generation_value) - - -def test_generate_cutoff_times_with_threshold(entityset_success): - los = ProlongedLengthOfStay(t=2) - values_should_be = [1, 0, 1] - es, _, generated_df = los.generate_cutoff_times( - entityset_success) - generated_labels = list(generated_df['label']) - assert values_should_be == generated_labels diff --git a/tests/cardea/problem_definition/test_readmission.py b/tests/cardea/problem_definition/test_readmission.py deleted file mode 100644 index 032bf794..00000000 --- a/tests/cardea/problem_definition/test_readmission.py +++ /dev/null @@ -1,235 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition import Readmission - - -@pytest.fixture() -def readmission(): - return Readmission() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12, 13, 14, 15], - "time": ['9/22/2018', '9/21/2018', '10/4/2018', - '9/28/2018', '10/30/2018', '11/18/2018'], - "label": [False, False, False, True, False, True] - }) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12, 13, 14, 15], - "subject": [0, 1, 2, 0, 0, 0], - "period": [120, 121, 122, 125, 123, 124], - "length": [2, 1, 7, 0, 0, 0]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122, 125, 123, 124], - "start": ['9/20/2018', '9/20/2018', '9/27/2018', - '9/28/2018', '10/30/2018', '11/18/2018'], - "end": ['9/22/2018', '9/21/2018', '10/4/2018', - '9/28/2018', '10/30/2018', '11/18/2018'] - }) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = 
es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018', '9/19/2018', '9/20/2018'], - "end": ['9/20/2018', '9/20/2018', '9/27/2018']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12, 13, 14, 15], - "subject": [0, 1, 2, 0, 0, 0], - "length": [2, 1, 7, 0, 0, 0]}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, patient, duration] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018', '9/19/2018', '9/20/2018'], - "end": ['9/18/2018', '9/19/2018', nan]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - 
-@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, readmission, cutoff_times): - _, _, generated_df = readmission.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_labels_success(entityset_success, readmission, cutoff_times): - es, _, generated_df = readmission.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - - labels = list(generated_df['label']) - - assert labels == [False, False, False, True, False, True] - - -def test_generate_labels_success_threshold(entityset_success, cutoff_times): - - es, _, generated_df = Readmission(6).generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - - labels = list(generated_df['label']) - assert labels == [False, False, False, True, False, False] - - -def test_generate_cutoff_times_missing_generation_label(entityset_success, readmission): - entityset_success['Period'].delete_variables(['end']) - with pytest.raises(ValueError): - readmission.generate_cutoff_times( - entityset_success) - - -def test_generate_label_with_missing_values(entityset_fail_missing_generation_value, readmission): - with pytest.raises(ValueError): - readmission.generate_cutoff_times(entityset_fail_missing_generation_value) diff --git a/tests/cardea/problem_definition/test_show_noshow_appointment.py b/tests/cardea/problem_definition/test_show_noshow_appointment.py deleted file mode 100644 index 1a39df32..00000000 --- a/tests/cardea/problem_definition/test_show_noshow_appointment.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition import MissedAppointment - - -@pytest.fixture() -def missed_appointment(): - return MissedAppointment() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - return pd.DataFrame( - {"instance_id": [10, 11, 12], - "time": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018], - "label": ['noshow', 'noshow', 'fulfilled'] - }) - - -@pytest.fixture() -def objects(es_loader): - - appointment_df = pd.DataFrame({"identifier": [10, 11, 12], - "status": ['noshow', 'noshow', 'fulfilled'], - "start": [7 / 
22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018], - "participant": [120, 121, 122], - "created": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018]}) - - participant_df = pd.DataFrame({"object_id": [120, 121, 122], - "actor": [0, 1, 2]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - appointment = es_loader.create_object(appointment_df, 'Appointment') - participant = es_loader.create_object(participant_df, 'Appointment_Participant') - patient = es_loader.create_object(patient_df, 'Patient') - - return [appointment, participant, patient] - - -@pytest.fixture() -def es_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def object_error_missing_label(es_loader): - - appointment_df = pd.DataFrame({"identifier": [10, 11, 12], - "start": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018], - "participant": [120, 121, 122], - "created": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018]}) - - appointment = es_loader.create_object(appointment_df, 'Appointment') - - return appointment - - -@pytest.fixture() -def objects_error_missing_cutoff_label(es_loader): - - appointment_df = pd.DataFrame({"identifier": [10, 11, 12], - "start": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018], - "status": ['noshow', 'noshow', 'fulfilled'], - "participant": [120, 121, 122]}) - - appointment = es_loader.create_object(appointment_df, 'Appointment') - return appointment - - -@pytest.fixture() -def entityset_error_missing_label(objects, object_error_missing_label, es_loader): - es = ft.EntitySet(id="test") - - objects.extend([object_error_missing_label]) - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_error_missing_cutoff_label(objects, objects_error_missing_cutoff_label, es_loader): - es = ft.EntitySet(id="test") - - for object in objects: - es_loader.create_entity(object, entity_set=es) - - for object in objects: - es_loader.create_relationships(object, entity_set=es) - - es_loader.create_entity(objects_error_missing_cutoff_label, entity_set=es) - es_loader.create_relationships(objects_error_missing_cutoff_label, entity_set=es) - return es - - -def test_generate_cutoff_times_success( - es_success, missed_appointment, cutoff_times): - _, _, generated_df = missed_appointment.generate_cutoff_times(es_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_error( - entityset_error_missing_label, missed_appointment): - with pytest.raises(ValueError): - missed_appointment.generate_cutoff_times( - entityset_error_missing_label) - - -def test_generate_cutoff_times_error_value(es_success, missed_appointment): - 
es_success['Appointment'].df.loc[len(es_success['Appointment'].df)] = [ - nan, nan, nan, nan, nan] - with pytest.raises(ValueError): - missed_appointment.generate_cutoff_times( - es_success) - - -def test_generate_cutoff_times_missing_cutoff_time( - es_success, missed_appointment): - es_success['Appointment'].delete_variables(['created']) - with pytest.raises(ValueError): - missed_appointment.generate_cutoff_times( - es_success) diff --git a/tests/cardea/__init__.py b/tests/data_assembling/__init__.py similarity index 100% rename from tests/cardea/__init__.py rename to tests/data_assembling/__init__.py diff --git a/tests/cardea/data_loader/test_data_loader.py b/tests/data_assembling/test_data_loader.py similarity index 99% rename from tests/cardea/data_loader/test_data_loader.py rename to tests/data_assembling/test_data_loader.py index f83c4d66..370b4597 100644 --- a/tests/cardea/data_loader/test_data_loader.py +++ b/tests/data_assembling/test_data_loader.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from cardea.data_loader import DataLoader, Diamond +from cardea.data_assembling import DataLoader, Diamond @pytest.fixture() diff --git a/tests/cardea/data_loader/test_entityset_loader.py b/tests/data_assembling/test_entityset_loader.py similarity index 97% rename from tests/cardea/data_loader/test_entityset_loader.py rename to tests/data_assembling/test_entityset_loader.py index 2f15a475..84d9c833 100644 --- a/tests/cardea/data_loader/test_entityset_loader.py +++ b/tests/data_assembling/test_entityset_loader.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from cardea.data_loader import EntitySetLoader +from cardea.data_assembling import EntitySetLoader @pytest.fixture() diff --git a/tests/cardea/data_loader/test_load_mimic.py b/tests/data_assembling/test_load_mimic.py similarity index 83% rename from tests/cardea/data_loader/test_load_mimic.py rename to tests/data_assembling/test_load_mimic.py index 4ccd4e69..912b1571 100644 --- a/tests/cardea/data_loader/test_load_mimic.py +++ b/tests/data_assembling/test_load_mimic.py @@ -3,7 +3,7 @@ import pytest -from cardea.data_loader.load_mimic import get_table_properties, get_table_relationships +from cardea.data_assembling.load_mimic import get_table_properties, get_table_relationships @pytest.fixture() @@ -25,7 +25,7 @@ def relationships(admission): def test_get_table_properties_types(properties): types = properties[0] - assert len(types) == 19 and types['language'] == str + assert len(types) == 19 and types['LANGUAGE'] == str def test_get_table_properties_primkey(properties): diff --git a/tests/cardea/data_loader/__init__.py b/tests/data_labeling/__init__.py similarity index 100% rename from tests/cardea/data_loader/__init__.py rename to tests/data_labeling/__init__.py diff --git a/tests/data_labeling/test_appointment_no_show.py b/tests/data_labeling/test_appointment_no_show.py new file mode 100644 index 00000000..3942b1bc --- /dev/null +++ b/tests/data_labeling/test_appointment_no_show.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from types import FunctionType +from unittest.mock import Mock, patch + +import featuretools as ft +import pandas as pd + +from cardea.data_labeling import appointment_no_show + + +@patch('cardea.data_labeling.utils.denormalize') +def test_appointment_no_show(denormalize_mock): + es = Mock(autospec=ft.EntitySet) + df = Mock(autospec=pd.DataFrame) + denormalize_mock.return_value = df + + returned = appointment_no_show(es) + + assert isinstance(returned, tuple) + assert len(returned) 
== 3
+
+    function, dataframe, meta = returned
+
+    assert isinstance(function, FunctionType)
+    assert isinstance(meta, dict)
diff --git a/tests/data_labeling/test_data_labeler.py b/tests/data_labeling/test_data_labeler.py
new file mode 100644
index 00000000..bb8c5a34
--- /dev/null
+++ b/tests/data_labeling/test_data_labeler.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from unittest.mock import Mock
+
+import featuretools as ft
+
+from cardea.data_labeling import DataLabeler
+
+
+class TestDataLabeler:
+
+    @classmethod
+    def setup_class(cls):
+        cls.function = lambda x: x
+        cls.es = Mock(autospec=ft.EntitySet)
+        cls.subset = None
+        cls.verbose = False
+
+    def test_data_labeler(self):
+        def function(x):
+            return x
+
+        DataLabeler(function)
diff --git a/tests/data_labeling/test_diagnosis.py b/tests/data_labeling/test_diagnosis.py
new file mode 100644
index 00000000..b0e56543
--- /dev/null
+++ b/tests/data_labeling/test_diagnosis.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from types import FunctionType
+from unittest.mock import Mock, patch
+
+import featuretools as ft
+import pandas as pd
+
+from cardea.data_labeling import diagnosis_prediction
+
+
+@patch('cardea.data_labeling.utils.denormalize')
+def test_diagnosis_prediction_mimic(denormalize_mock):
+    es = Mock(autospec=ft.EntitySet, id='mimic')
+    df = Mock(autospec=pd.DataFrame)
+    denormalize_mock.return_value = df
+
+    returned = diagnosis_prediction(es, 'disease')
+
+    assert isinstance(returned, tuple)
+    assert len(returned) == 3
+
+    function, dataframe, meta = returned
+
+    assert isinstance(function, FunctionType)
+    assert isinstance(meta, dict)
diff --git a/tests/data_labeling/test_length_of_stay.py b/tests/data_labeling/test_length_of_stay.py
new file mode 100644
index 00000000..db2d882c
--- /dev/null
+++ b/tests/data_labeling/test_length_of_stay.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from types import FunctionType
+from unittest.mock import Mock, patch
+
+import featuretools as ft
+import pandas as pd
+
+from cardea.data_labeling import length_of_stay
+
+
+@patch('cardea.data_labeling.utils.denormalize')
+def test_length_of_stay_fhir(denormalize_mock):
+    es = Mock(autospec=ft.EntitySet, id="fhir")
+
+    df = pd.DataFrame({
+        'col 1': range(5),
+        'start': range(5),
+        'end': range(5)
+    })
+    denormalize_mock.return_value = df
+
+    returned = length_of_stay(es)
+
+    assert isinstance(returned, tuple)
+    assert len(returned) == 3
+
+    function, dataframe, meta = returned
+
+    assert isinstance(function, FunctionType)
+    assert isinstance(dataframe, pd.DataFrame)
+    assert isinstance(meta, dict)
+
+
+@patch('cardea.data_labeling.utils.denormalize')
+def test_length_of_stay_mimic(denormalize_mock):
+    es = Mock(autospec=ft.EntitySet, id="mimic")
+
+    df = pd.DataFrame({
+        'col 1': range(5),
+        'admittime': range(5),
+        'dischtime': range(5)
+    })
+    denormalize_mock.return_value = df
+
+    returned = length_of_stay(es)
+
+    assert isinstance(returned, tuple)
+    assert len(returned) == 3
+
+    function, dataframe, meta = returned
+
+    assert isinstance(function, FunctionType)
+    assert isinstance(dataframe, pd.DataFrame)
+    assert isinstance(meta, dict)
+
+
+@patch('cardea.data_labeling.utils.denormalize')
+def test_length_of_stay_classification(denormalize_mock):
+    es = Mock(autospec=ft.EntitySet, id="mimic")
+
+    df = pd.DataFrame({
+        'col 1': range(5),
+        'admittime': range(5),
+        'dischtime': range(5)
+    })
+    denormalize_mock.return_value = df
+
+    returned = length_of_stay(es, k=7)
+ + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(dataframe, pd.DataFrame) + assert isinstance(meta, dict) + assert meta['type'] == 'classification' + assert meta['thresh'] == 7 diff --git a/tests/data_labeling/test_mortality.py b/tests/data_labeling/test_mortality.py new file mode 100644 index 00000000..2f971c21 --- /dev/null +++ b/tests/data_labeling/test_mortality.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from types import FunctionType +from unittest.mock import Mock, patch + +import featuretools as ft +import pandas as pd + +from cardea.data_labeling import mortality_prediction + + +@patch('cardea.data_labeling.utils.denormalize') +def test_mortality_prediction_mimic(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id='mimic') + df = Mock(autospec=pd.DataFrame) + denormalize_mock.return_value = df + + returned = mortality_prediction(es) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(meta, dict) diff --git a/tests/data_labeling/test_readmission.py b/tests/data_labeling/test_readmission.py new file mode 100644 index 00000000..f27fc3ce --- /dev/null +++ b/tests/data_labeling/test_readmission.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from types import FunctionType +from unittest.mock import Mock, patch + +import featuretools as ft +import pandas as pd + +from cardea.data_labeling import readmission + + +@patch('cardea.data_labeling.utils.denormalize') +def test_readmission_fhir(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id="fhir") + + df = pd.DataFrame({ + 'col 1': range(5), + 'start': range(5), + 'end': range(5) + }) + denormalize_mock.return_value = df + + returned = readmission(es) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(dataframe, pd.DataFrame) + assert isinstance(meta, dict) + + +@patch('cardea.data_labeling.utils.denormalize') +def test_readmission_mimic(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id="mimic") + + df = pd.DataFrame({ + 'col 1': range(5), + 'admittime': range(5), + 'dischtime': range(5) + }) + denormalize_mock.return_value = df + + returned = readmission(es) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(dataframe, pd.DataFrame) + assert isinstance(meta, dict) diff --git a/tests/cardea/featurization/__init__.py b/tests/featurizing/__init__.py similarity index 100% rename from tests/cardea/featurization/__init__.py rename to tests/featurizing/__init__.py diff --git a/tests/cardea/featurization/test_featurization.py b/tests/featurizing/test_featurization.py similarity index 74% rename from tests/cardea/featurization/test_featurization.py rename to tests/featurizing/test_featurization.py index 697cf0a5..3056c8bd 100644 --- a/tests/cardea/featurization/test_featurization.py +++ b/tests/featurizing/test_featurization.py @@ -5,8 +5,8 @@ import pandas as pd import pytest -from cardea.data_loader import EntitySetLoader -from cardea.featurization import Featurization +from cardea.data_assembling import EntitySetLoader +from cardea.featurizing import Featurization @pytest.fixture() @@ -46,12 
+46,12 @@ def entityset(objects, es_loader): @pytest.fixture() -def cutoff(): - cutoff = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['1/1/2000 20:00', '2/1/2000 5:00', '3/1/2000 22:00']}) +def label_times(): + label_times = pd.DataFrame({"instance_id": [10, 11, 12], + "time": ['1/1/2000 20:00', '2/1/2000 5:00', '3/1/2000 22:00']}) - cutoff['time'] = pd.to_datetime(cutoff['time']) - return cutoff + label_times['time'] = pd.to_datetime(label_times['time']) + return label_times @pytest.fixture() @@ -59,13 +59,13 @@ def featurization(): return Featurization() -def test_generate_feature_matrix(featurization, entityset, cutoff): +def test_generate_feature_matrix(featurization, entityset, label_times): feature_matrix, features_defs = featurization.generate_feature_matrix( - entityset, "Encounter", cutoff, encode=False) + entityset, "Encounter", label_times, encode=False) assert len(feature_matrix) == 3 and len(feature_matrix.columns) == 12 -def test_generate_feature_matrix_encoded(featurization, entityset, cutoff): +def test_generate_feature_matrix_encoded(featurization, entityset, label_times): fm_encoded, features_encoded = featurization.generate_feature_matrix( - entityset, "Encounter", cutoff, encode=True) + entityset, "Encounter", label_times, encode=True) assert len(fm_encoded) == 3 and len(fm_encoded.columns) == 32 diff --git a/tests/cardea/fhir/test_fhirbase.py b/tests/fhir/test_fhirbase.py similarity index 100% rename from tests/cardea/fhir/test_fhirbase.py rename to tests/fhir/test_fhirbase.py diff --git a/tests/cardea/modeling/__init__.py b/tests/modeling/__init__.py similarity index 100% rename from tests/cardea/modeling/__init__.py rename to tests/modeling/__init__.py diff --git a/tests/cardea/modeling/test_modeler.py b/tests/modeling/test_modeler.py similarity index 100% rename from tests/cardea/modeling/test_modeler.py rename to tests/modeling/test_modeler.py diff --git a/tests/test_core.py b/tests/test_core.py new file mode 100644 index 00000000..7d16fc74 --- /dev/null +++ b/tests/test_core.py @@ -0,0 +1,92 @@ +import os + +import pandas as pd +from sklearn.datasets import load_iris + +from cardea.core import Cardea +from cardea.data import download + + +def prediction_problem_function(es): + def label(ds): + return False + + meta = { + "entity": "Appointment", + "target_entity": "identifier", + "time_index": "created", + "type": "classification", + "num_examples_per_instance": 1 + } + + df = es['Appointment'].df.iloc[:100] + + return label, df, meta + + +class TestCardea: + + @classmethod + def setup_class(cls): + cls.X, cls.y = load_iris(return_X_y=True) + + def setup(self): + data_path = download('kaggle') + self.cardea = Cardea(data_path, True) + self.label_times = self.cardea.label(prediction_problem_function) + self.cardea.fit(self.X, self.y) + + def test__load_entityset(self): + es = self.cardea.entityset + assert len(es.entities) == 9 + assert len(es.relationships) == 6 + + def test_list_labelers(self): + labelers = self.cardea.list_labelers() + assert isinstance(labelers, set) + + def test_label(self): + assert len(self.label_times) == 100 + + def test_featurize(self): + label_times = self.label_times.iloc[:10] + feature_matrix = self.cardea.featurize(label_times) + assert len(feature_matrix) == 10 + + def test_set_pipeline(self): + pipeline = "Random Forest" + self.cardea.set_pipeline(pipeline) + + def test_fit(self): + self.cardea.fit(self.X, self.y) + + def test_predict(self): + y = self.cardea.predict(self.X) + assert self.y.shape == y.shape + + def 
test_fit_predict(self): + y = self.cardea.fit_predict(self.X, self.y) + assert self.y.shape == y.shape + + def test_train_test_split(self): + X_train, X_test, y_train, y_test = self.cardea.train_test_split(self.X, self.y) + assert X_train.shape[1] == X_test.shape[1] + assert len(X_train) == len(y_train) + assert len(X_test) == len(y_test) + + def test_evaluate(self): + results = self.cardea.evaluate(self.X, self.y) + assert isinstance(results, pd.Series) + assert len(results) == 4 + + def test_evaluate_fit(self): + results = self.cardea.evaluate(self.X, self.y, fit=True) + assert isinstance(results, pd.Series) + assert len(results) == 4 + + def test_save_load(self, tmpdir): + path = os.path.join(tmpdir, 'some/path.pkl') + self.cardea.save(path) + + new_cardea = Cardea.load(path) + assert new_cardea.entityset == self.cardea.entityset diff --git a/tests/test_something.py b/tests/test_something.py deleted file mode 100644 index f5d0f9f0..00000000 --- a/tests/test_something.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - - -def test_something(): - pass diff --git a/tox.ini b/tox.ini index 6895ed00..5e7a7fdc 100644 --- a/tox.ini +++ b/tox.ini @@ -27,3 +27,9 @@ commands = skipsdist = true commands = /usr/bin/env make docs + + +[testenv:readme] +skipsdist = true +commands = + /usr/bin/env make test-readme \ No newline at end of file