From 877904238059916345983370b4554a8e923c24e3 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 1 Dec 2020 23:58:53 -0500 Subject: [PATCH 01/13] initial composeml refactor --- cardea/problem_definition/definition.py | 296 +++++--------------- cardea/problem_definition/length_of_stay.py | 163 +++-------- 2 files changed, 114 insertions(+), 345 deletions(-) diff --git a/cardea/problem_definition/definition.py b/cardea/problem_definition/definition.py index 5c03554b..bc43127f 100644 --- a/cardea/problem_definition/definition.py +++ b/cardea/problem_definition/definition.py @@ -1,247 +1,99 @@ +import composeml as cp import pandas as pd -from cardea.data_loader import DataLoader - class ProblemDefinition: - """A class that defines the prediction problem - by specifying cutoff times and generating the target label if it does not exist. + """Class that defines the prediction problem. + + This class supports the generation of `label_times` which + is fundamental to the feature generation phase as well + as specifying the target labels. + + Args: + target_entity (str): + The instance id of the target entity. + time_index (str): + The time index specifying at what point to start the prediction. + prediction_type (str): + The type of the machine learning prediction; classification or + regression. + es (featuretools.EntitySet): + An entityset representation of the data. """ - def check_target_label(self, entity_set, target_entity, target_label): - """Checks if target label exists in the entity set. - - Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. + def __init__(self, target_entity, time_index, prediction_type, es): + self.target_entity = target_entity + self.time_index = time_index + self.prediction_type = prediction_type + self.es = es - Returns: - True if the target label exists. 
- """ - return DataLoader().check_column_existence(entity_set, target_entity, target_label) + def _search_relationship(self, left, right): + for r in self.es.relationships: + if r.parent_entity.id in left: + if right == r.child_entity.id: + left_on = r.parent_variable.id + right_on = r.child_variable.id - def check_for_missing_values_in_target_label( - self, entity_set, target_entity, target_label_column_name): - """Checks if there is a missing value in the target label. + elif r.child_entity.id in left: + if right == r.parent_entity.id: + left_on = r.child_variable.id + right_on = r.parent_variable.id - Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. + return left_on, right_on - Returns: - False is the target label does not contain a missing value. - """ - return DataLoader().check_for_missing_values(entity_set, - target_entity, - target_label_column_name) + def denormalize(self, entities): + """Merge a set of entities into a single dataframe. - def generate_target_label(self, entity_set, target_entity, target_label): - """Generates target labels if the entityset is missing labels. + Convert a set of entities from the entityset into a single + dataframe by repetitively merging the selected entities. The + merge process is applied sequentially. Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. + entities (list): + list of strings denoting which entities to merge. Returns: - Target entity with the generated label. - """ - - def generate_cutoff_times(self, entity_set): - """Generates cutoff times for the predection problem. - - Args: - entity_set: fhir entityset. - - Returns: - entity_set, target_entity, series of target_labels and a dataframe of cutoff_times. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. 
+ pandas.DataFrame: + A single dataframe containing all the information from the + selected entities. """ + k = len(entities) + assert k > 0 - def unify_cutoff_times_hours_admission_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared time. + # initial entity to start from (should be the target entity) + first = entities[0] + previous = [first] + df = self.es[first].df - Args: - df: cutoff_entity dataframe. - """ - df = df.sort_values(by=[cutoff_time_label]) - df = df.reset_index() - - for i in df.index: - - if i == 0: + # merge the dataframes to create a single input + for i in range(1, k): + right = entities[i] - if df.get_value(i, 'checked') is not True: - df.set_value(i, 'ct', df.get_value(i, cutoff_time_label)) - df.set_value(i, 'checked', True) + left_on, right_on = self._search_relationship(previous, right) + df = pd.merge(df, es[right].df, + left_on=left_on, right_on=right_on, + how='left', suffixes=('', '_y')).filter(regex='^(?!.*_y)') - elif df.get_value(i, 'checked') is not True: + previous.append(right) - ct_val1 = df.get_value(i - 1, 'ct') - end_val1 = df.get_value(i - 1, 'end') - start_val2 = df.get_value(i, cutoff_time_label) - df.get_value(i, 'end') - - if ct_val1 < start_val2 < end_val1: - df.set_value(i - 1, 'ct', start_val2) - df.set_value(i, 'ct', start_val2) - df.set_value(i, 'checked', True) - - else: - df.set_value(i, 'ct', df.get_value(i, cutoff_time_label)) - df.set_value(i, 'checked', True) - - if i + 1 == len(df): - break return df - def unify_cutoff_times_days_admission_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared days. - - Args: - df: cutoff_entity dataframe. 
- """ - - frames = [] - for d in set(df['date']): - sub_day = df[df['date'] == d] - - sub_duration_greater = sub_day[sub_day['duration'] > 0] - sub_duration_less = sub_day[sub_day['duration'] <= 0] - frames.append(sub_duration_less) - sub_duration_greater = sub_duration_greater.sort_values(by=[cutoff_time_label]) - if len(sub_duration_greater) != 0: - final_date = sub_duration_greater.iloc[-1][cutoff_time_label] - - for i in sub_duration_greater.index: - sub_duration_greater.set_value(i, 'ct', final_date) - sub_duration_greater.set_value(i, 'checked', True) - - frames.append(sub_duration_greater) - - for i in sub_duration_less.index: - sub_duration_less.set_value(i, 'ct', pd.NaT) - sub_duration_less.set_value(i, 'checked', False) - - frames.append(sub_duration_less) - - result = pd.concat(frames) - result = result.drop_duplicates() - result[cutoff_time_label] = pd.to_datetime(result.start) - result = result.sort_values(by=[cutoff_time_label]) - result = result.reset_index() - return result - - def unify_cutoff_time_admission_time(self, es, cutoff_entity, cutoff_time_label): - """Process records in the entity that contains cutoff times - based on shared days and time. - - Args: - es: fhir entityset. - - Returns: - processed entity - """ - - df = es[cutoff_entity].df - df[cutoff_time_label] = pd.to_datetime(df[cutoff_time_label]) - df['end'] = pd.to_datetime(df['end']) - duration = (df['end'] - df[cutoff_time_label]).dt.days - duration = duration.tolist() - df['duration'] = duration - df['date'] = df[cutoff_time_label].dt.date - df['ct'] = '' - df['checked'] = False - result1 = self.unify_cutoff_times_days_admission_time(df, cutoff_time_label) - result = self.unify_cutoff_times_hours_admission_time(result1, cutoff_time_label) - if 'level_0' in result.columns: - result = result.drop(columns=['level_0']) - return result - - def unify_cutoff_times_days_discharge_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared days. 
- - Args: - df: cutoff_entity dataframe. - """ - - frames = [] - for d in set(df['end_date']): - sub_day = df[df['end_date'] == d] - - sub_duration_greater = sub_day[sub_day['duration'] > 0] - sub_duration_less = sub_day[sub_day['duration'] <= 0] - frames.append(sub_duration_less) - sub_duration_greater = sub_duration_greater.sort_values(by=[cutoff_time_label]) - if len(sub_duration_greater) != 0: - first_date = sub_duration_greater.iloc[0][cutoff_time_label] - - for i in sub_duration_greater.index: - sub_duration_greater.set_value(i, 'ct', first_date) - sub_duration_greater.set_value(i, 'checked', True) - frames.append(sub_duration_greater) - - for i in sub_duration_less.index: - sub_duration_less.set_value(i, 'ct', pd.NaT) - sub_duration_less.set_value(i, 'checked', False) - frames.append(sub_duration_less) - - result = pd.concat(frames) - result = result.drop_duplicates() - result[cutoff_time_label] = pd.to_datetime(result.end) - result = result.reset_index() - return result - - def unify_cutoff_times_hours_discharge_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared time. - - Args: - df: cutoff_entity dataframe. - """ - - frames = [] - for d in set(df['end_date']): - sub_day = df[df['end_date'] == d] - for h in set(sub_day['hour']): - sub_hour = sub_day[sub_day['hour'] == h] - sub_hour = sub_hour.sort_values(by=[cutoff_time_label]) - if len(sub_hour) != 0: - first_date = sub_hour.iloc[0][cutoff_time_label] - for i in sub_hour.index: - sub_hour.set_value(i, 'ct', first_date) - sub_hour.set_value(i, 'checked', True) - - frames.append(sub_hour) - - result = pd.concat(frames) - result = result.drop_duplicates() - return result - - def unify_cutoff_time_discharge_time(self, es, cutoff_entity, cutoff_time_label): - """Process records in the entity that contains cutoff times - based on shared days and time. - - Args: - es: fhir entityset. 
- - Returns: - processed entity + def generate_label_times(self, df, *args, **kwargs): + """Searches the data to calculate label times. + + Args: + df (pandas.DataFrame): + Data frame to search and extract labels. + *args: + Positional arguments for label maker. + **kwargs: + Keyword arguments for label maker. + Returns: + composeml.LabelTimes: + Calculated labels with cutoff times. """ + label_maker = cp.LabelMaker(*args, **kwargs) + label_times = label_maker.search(df.sort_values(self.time_index), + num_examples_per_instance=1) - df = es[cutoff_entity].df - df['end_date'] = df[cutoff_time_label].dt.date - df['hour'] = df.end.apply(lambda x: x.hour) - duration = (df[cutoff_time_label] - df['start']).dt.days - duration = duration.tolist() - df['duration'] = duration - df['ct'] = '' - df['checked'] = False - result1 = self.unify_cutoff_times_days_discharge_time(df, cutoff_time_label) - result = self.unify_cutoff_times_hours_discharge_time(result1, cutoff_time_label) - if 'level_0' in result.columns: - result = result.drop(columns=['level_0']) - return result + return label_times diff --git a/cardea/problem_definition/length_of_stay.py b/cardea/problem_definition/length_of_stay.py index 42ccad96..3bd2ac5e 100644 --- a/cardea/problem_definition/length_of_stay.py +++ b/cardea/problem_definition/length_of_stay.py @@ -1,141 +1,58 @@ -import featuretools as ft import pandas as pd -from cardea.data_loader import DataLoader as DL from cardea.problem_definition import ProblemDefinition -class LengthOfStay (ProblemDefinition): - """Defines the problem of length of stay, predicting how many days - the patient will be in the hospital. +class LengthOfStay(ProblemDefinition): + """Defines the problem of length of stay. - Attributes: - target_label_column_name: The target label of the prediction problem. - target_entity: Name of the entity containing the target label. - cutoff_time_label: The cutoff time label of the prediction problem. 
- cutoff_entity: Name of the entity containing the cutoff time label. - prediction_type: The type of the machine learning prediction. + Predict how many days the patient will be in the hospital. For + a classification version of the problem, refer to ProlongedLengthOfStay. """ - __name__ = 'los' + def __init__(self, es): + target_entity = "hadm_id" + time_index = "admittime" + prediction_type = "regression" + ProblemDefinition.__init__(self, target_entity, time_index, prediction_type, es) - updated_es = None - target_label_column_name = 'length' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - conn = 'period' - prediction_type = 'regression' + self.label = "los" - def generate_cutoff_times(self, es): - """Generates cutoff times for the predection problem. + def los(self, ds): + return (ds[self.label].dt.days).sum() - Args: - es: fhir entityset. + def _generate_label(self, df): + start = 'admittime' + end = 'dischtime' + df[end] = pd.to_datetime(df[end]) + df[start] = pd.to_datetime(df[start]) - Returns: - entity_set, target_entity, and a dataframe of cutoff_times and target_labels. + df[self.label] = df[end] - df[start] - Raises: - ValueError: An error occurs if the cutoff variable does not exist. 
- """ + def generate_label_times(self, *args, **kwargs): + df = self.denormalize(entities=['admissions']) + self._generate_label(df) + label_times = super().generate_label_times(df, + target_entity=self.target_entity, + time_index=self.time_index, + labeling_function=self.los, + *args, **kwargs) + return label_times - if (self.check_target_label(es, - self.target_entity, - self.target_label_column_name) and not - self.check_for_missing_values_in_target_label(es, - self.target_entity, - self.target_label_column_name)): - if DL().check_column_existence(es, - self.cutoff_entity, - self.cutoff_time_label): - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') +class ProlongedLengthOfStay(LengthOfStay): + """Defines the problem of length of stay in a classification context. - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['cutoff_time', 'instance_id'] - - cutoff_times['label'] = list( - es[self.target_entity].df[self.target_label_column_name]) - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - else: - updated_es = self.generate_target_label(es) - return self.generate_cutoff_times(updated_es) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. 
- - Raises: - ValueError: An error occurs if the target label cannot be generated. - """ - - generate_from = 'Period' - start = self.cutoff_time_label - end = 'end' - label_name = self.target_label_column_name - - if (DL().check_column_existence(es, - generate_from, - start) and DL().check_column_existence(es, - generate_from, - end)): - if (not DL().check_for_missing_values(es, - generate_from, - start) and not - (DL().check_for_missing_values(es, - generate_from, - end))): - - es[generate_from].df[start] = pd.to_datetime( - es[generate_from].df[start]) - es[generate_from].df[end] = pd.to_datetime( - es[generate_from].df[end]) - duration = (es[generate_from].df[end] - es[generate_from].df[start]).dt.days - duration = duration.tolist() - es[self.target_entity].df[label_name] = duration - updated_target_entity = es[self.target_entity].df - duration_df = pd.DataFrame({'object_id': duration}) - - es = es.entity_from_dataframe(entity_id='Duration', - dataframe=duration_df, - index='object_id') - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=updated_target_entity, index='identifier') - new_relationship = ft.Relationship(es['Duration']['object_id'], - es[self.target_entity][label_name]) - es = es.add_relationship(new_relationship) - - return es + Predict whether the patient will stay more than a number of days in the + hospital. For a regression version of the problem, refer to LengthOfStay. 
+ """ - else: - raise ValueError('Can not generate target label {} in table {} \ - beacuse start or end labels in table {} contain \ - missing value.'.format(label_name, - self.target_entity, - generate_from)) + def __init__(self, es, thresh=7): + LengthOfStay.__init__(self, es) + self.prediction_type = "classification" + self.thresh = thresh - else: - raise ValueError('Can not generate target label {} in \ - table {}.'.format(label_name, - self.target_entity)) + def generate_label_times(self, *args, **kwargs): + label_times = super().generate_label_times(*args, **kwargs) + label_times = label_times.threshold(self.thresh) + return label_times From 5edadaf8478b51061b14fb0b881d9ea5eb09c53f Mon Sep 17 00:00:00 2001 From: sarahmish Date: Mon, 7 Dec 2020 10:57:00 -0500 Subject: [PATCH 02/13] updated structure --- .../__init__.py | 0 cardea/data_labeling/definition.py | 52 ++++++++++ cardea/data_labeling/length_of_stay.py | 35 +++++++ .../mortality_prediction.py | 0 .../predicting_diagnosis.py | 0 .../prolonged_length_of_stay.py | 0 .../readmission.py | 0 .../data_labeling/show_noshow_appointment.py | 24 +++++ cardea/data_labeling/utils.py | 0 cardea/problem_definition/definition.py | 99 ------------------- cardea/problem_definition/length_of_stay.py | 58 ----------- .../show_noshow_appointment.py | 66 ------------- 12 files changed, 111 insertions(+), 223 deletions(-) rename cardea/{problem_definition => data_labeling}/__init__.py (100%) create mode 100644 cardea/data_labeling/definition.py create mode 100644 cardea/data_labeling/length_of_stay.py rename cardea/{problem_definition => data_labeling}/mortality_prediction.py (100%) rename cardea/{problem_definition => data_labeling}/predicting_diagnosis.py (100%) rename cardea/{problem_definition => data_labeling}/prolonged_length_of_stay.py (100%) rename cardea/{problem_definition => data_labeling}/readmission.py (100%) create mode 100644 cardea/data_labeling/show_noshow_appointment.py create mode 100644 
cardea/data_labeling/utils.py delete mode 100644 cardea/problem_definition/definition.py delete mode 100644 cardea/problem_definition/length_of_stay.py delete mode 100644 cardea/problem_definition/show_noshow_appointment.py diff --git a/cardea/problem_definition/__init__.py b/cardea/data_labeling/__init__.py similarity index 100% rename from cardea/problem_definition/__init__.py rename to cardea/data_labeling/__init__.py diff --git a/cardea/data_labeling/definition.py b/cardea/data_labeling/definition.py new file mode 100644 index 00000000..9daa8bf3 --- /dev/null +++ b/cardea/data_labeling/definition.py @@ -0,0 +1,52 @@ +import composeml as cp +import pandas as pd + + +class DataLabeler: + """Class that defines the prediction problem. + + This class supports the generation of `label_times` which + is fundamental to the feature generation phase as well + as specifying the target labels. + + Args: + clf (function): + function that defines the labeling function, it should return a + tuple of labeling function, the dataframe, and the name of the + target entity. + """ + def __init__(self, clf): + self.clf = clf + + def generate_label_times(self, es, *args, **kwargs): + """Searches the data to calculate label times. + + Args: + df (pandas.DataFrame): + Data frame to search and extract labels. + *args: + Positional arguments for label maker. + **kwargs: + Keyword arguments for label maker. + Returns: + composeml.LabelTimes: + Calculated labels with cutoff times. 
+ """ + labeling_function, df, meta = self.clf(es) + kwargs = {**meta, **kwargs} + target_entity = kwargs.get('target_entity') + time_index = kwargs.get('time_index') + window_size = kwargs.get('window_size') + thresh = kwargs.get('thresh') + label_maker = cp.LabelMaker(labeling_function=labeling_function, + target_entity=kwargs.get('target_entity'), + time_index=kwargs.get('time_index'), + window_size=kwargs.get('window_size')) + + label_times = label_maker.search(df.sort_values(time_index), + *args, + **kwargs) + if thresh is not None: + label_times.threshold(thresh) + + return label_times, kwargs.get('entity') \ No newline at end of file diff --git a/cardea/data_labeling/length_of_stay.py b/cardea/data_labeling/length_of_stay.py new file mode 100644 index 00000000..9bdd7202 --- /dev/null +++ b/cardea/data_labeling/length_of_stay.py @@ -0,0 +1,35 @@ +import pandas as pd + +from cardea.data_labeling.utils import denormalize + + +def length_of_stay(es): + """Defines the labeling task of length of stay. + + Predict how many days the patient will be in the hospital. For + a classification version of the problem, refer to ProlongedLengthOfStay. 
+ """ + def los(ds, **kwargs): + return (ds["los"].dt.days).sum() + + label = "los" + + meta = { + "entity": "admissions", + "target_entity": "hadm_id", + "time_index": "admittime", + "type": "regression", + "num_examples_per_instance": 1, + "thresh": 7 + } + + df = denormalize(es, entities=['admissions', ]) + + # generate label + start = 'admittime' + end = 'dischtime' + df[end] = pd.to_datetime(df[end]) + df[start] = pd.to_datetime(df[start]) + df[label] = df[end] - df[start] + + return los, df, meta \ No newline at end of file diff --git a/cardea/problem_definition/mortality_prediction.py b/cardea/data_labeling/mortality_prediction.py similarity index 100% rename from cardea/problem_definition/mortality_prediction.py rename to cardea/data_labeling/mortality_prediction.py diff --git a/cardea/problem_definition/predicting_diagnosis.py b/cardea/data_labeling/predicting_diagnosis.py similarity index 100% rename from cardea/problem_definition/predicting_diagnosis.py rename to cardea/data_labeling/predicting_diagnosis.py diff --git a/cardea/problem_definition/prolonged_length_of_stay.py b/cardea/data_labeling/prolonged_length_of_stay.py similarity index 100% rename from cardea/problem_definition/prolonged_length_of_stay.py rename to cardea/data_labeling/prolonged_length_of_stay.py diff --git a/cardea/problem_definition/readmission.py b/cardea/data_labeling/readmission.py similarity index 100% rename from cardea/problem_definition/readmission.py rename to cardea/data_labeling/readmission.py diff --git a/cardea/data_labeling/show_noshow_appointment.py b/cardea/data_labeling/show_noshow_appointment.py new file mode 100644 index 00000000..52adcdc5 --- /dev/null +++ b/cardea/data_labeling/show_noshow_appointment.py @@ -0,0 +1,24 @@ + +import pandas as pd + +from cardea.data_labeling.utils import denormalize + +def appointment_no_show(es): + """Defines the labeling task of appointment no show. 
+ """ + def missed(ds, **kwargs): + return (ds["status"]).sum() + + meta = { + "entity": "Appointment", + "target_entity": "identifier", + "time_index": "created", + "type": "classification", + "num_examples_per_instance": 1, + "thresh": 1 + } + + df = denormalize(es, entities=['Appointment']) + df['status'] = pd.Categorical(df['status']).codes + + return missed, df, meta diff --git a/cardea/data_labeling/utils.py b/cardea/data_labeling/utils.py new file mode 100644 index 00000000..e69de29b diff --git a/cardea/problem_definition/definition.py b/cardea/problem_definition/definition.py deleted file mode 100644 index bc43127f..00000000 --- a/cardea/problem_definition/definition.py +++ /dev/null @@ -1,99 +0,0 @@ -import composeml as cp -import pandas as pd - - -class ProblemDefinition: - """Class that defines the prediction problem. - - This class supports the generation of `label_times` which - is fundamental to the feature generation phase as well - as specifying the target labels. - - Args: - target_entity (str): - The instance id of the target entity. - time_index (str): - The time index specifying at what point to start the prediction. - prediction_type (str): - The type of the machine learning prediction; classification or - regression. - es (featuretools.EntitySet): - An entityset representation of the data. 
- """ - - def __init__(self, target_entity, time_index, prediction_type, es): - self.target_entity = target_entity - self.time_index = time_index - self.prediction_type = prediction_type - self.es = es - - def _search_relationship(self, left, right): - for r in self.es.relationships: - if r.parent_entity.id in left: - if right == r.child_entity.id: - left_on = r.parent_variable.id - right_on = r.child_variable.id - - elif r.child_entity.id in left: - if right == r.parent_entity.id: - left_on = r.child_variable.id - right_on = r.parent_variable.id - - return left_on, right_on - - def denormalize(self, entities): - """Merge a set of entities into a single dataframe. - - Convert a set of entities from the entityset into a single - dataframe by repetitively merging the selected entities. The - merge process is applied sequentially. - - Args: - entities (list): - list of strings denoting which entities to merge. - - Returns: - pandas.DataFrame: - A single dataframe containing all the information from the - selected entities. - """ - k = len(entities) - assert k > 0 - - # initial entity to start from (should be the target entity) - first = entities[0] - previous = [first] - df = self.es[first].df - - # merge the dataframes to create a single input - for i in range(1, k): - right = entities[i] - - left_on, right_on = self._search_relationship(previous, right) - df = pd.merge(df, es[right].df, - left_on=left_on, right_on=right_on, - how='left', suffixes=('', '_y')).filter(regex='^(?!.*_y)') - - previous.append(right) - - return df - - def generate_label_times(self, df, *args, **kwargs): - """Searches the data to calculate label times. - - Args: - df (pandas.DataFrame): - Data frame to search and extract labels. - *args: - Positional arguments for label maker. - **kwargs: - Keyword arguments for label maker. - Returns: - composeml.LabelTimes: - Calculated labels with cutoff times. 
- """ - label_maker = cp.LabelMaker(*args, **kwargs) - label_times = label_maker.search(df.sort_values(self.time_index), - num_examples_per_instance=1) - - return label_times diff --git a/cardea/problem_definition/length_of_stay.py b/cardea/problem_definition/length_of_stay.py deleted file mode 100644 index 3bd2ac5e..00000000 --- a/cardea/problem_definition/length_of_stay.py +++ /dev/null @@ -1,58 +0,0 @@ -import pandas as pd - -from cardea.problem_definition import ProblemDefinition - - -class LengthOfStay(ProblemDefinition): - """Defines the problem of length of stay. - - Predict how many days the patient will be in the hospital. For - a classification version of the problem, refer to ProlongedLengthOfStay. - """ - - def __init__(self, es): - target_entity = "hadm_id" - time_index = "admittime" - prediction_type = "regression" - ProblemDefinition.__init__(self, target_entity, time_index, prediction_type, es) - - self.label = "los" - - def los(self, ds): - return (ds[self.label].dt.days).sum() - - def _generate_label(self, df): - start = 'admittime' - end = 'dischtime' - df[end] = pd.to_datetime(df[end]) - df[start] = pd.to_datetime(df[start]) - - df[self.label] = df[end] - df[start] - - def generate_label_times(self, *args, **kwargs): - df = self.denormalize(entities=['admissions']) - self._generate_label(df) - label_times = super().generate_label_times(df, - target_entity=self.target_entity, - time_index=self.time_index, - labeling_function=self.los, - *args, **kwargs) - return label_times - - -class ProlongedLengthOfStay(LengthOfStay): - """Defines the problem of length of stay in a classification context. - - Predict whether the patient will stay more than a number of days in the - hospital. For a regression version of the problem, refer to LengthOfStay. 
- """ - - def __init__(self, es, thresh=7): - LengthOfStay.__init__(self, es) - self.prediction_type = "classification" - self.thresh = thresh - - def generate_label_times(self, *args, **kwargs): - label_times = super().generate_label_times(*args, **kwargs) - label_times = label_times.threshold(self.thresh) - return label_times diff --git a/cardea/problem_definition/show_noshow_appointment.py b/cardea/problem_definition/show_noshow_appointment.py deleted file mode 100644 index 54250da0..00000000 --- a/cardea/problem_definition/show_noshow_appointment.py +++ /dev/null @@ -1,66 +0,0 @@ - -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class MissedAppointmentProblemDefinition (ProblemDefinition): - """Defines the problem of missed appointments, - whether the patient showed to the appointment or not. - - Attributes: - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. - cutoff_time_label: The cutoff time label of the prediction problem. - cutoff_entity: Name of the entity containing the cutoff time label. - prediction_type: The type of the machine learning prediction. - """ - __name__ = 'mapp' - - target_label_column_name = 'status' - target_entity = 'Appointment' - prediction_type = 'classification' - cutoff_time_label = 'created' - cutoff_entity = target_entity - - def generate_cutoff_times(self, entity_set): - """Generates cutoff times for the predection problem. - - Args: - entity_set: fhir entityset. - - Returns: - entity_set, target_entity and a dataframe of cutoff_times and target_labels - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. 
- """ - - if (self.check_target_label( - entity_set, - self.target_entity, - self.target_label_column_name)) and\ - not (self.check_for_missing_values_in_target_label(entity_set, - self.target_entity, - self.target_label_column_name)): - - if DataLoader().check_column_existence(entity_set, - self.target_entity, - self.cutoff_time_label): - - instance_id = list(entity_set[self.target_entity].df.index) - cutoff_times = entity_set[self.cutoff_entity].df[self.cutoff_time_label].to_frame() - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['cutoff_time', 'instance_id'] - cutoff_times['label'] = list( - entity_set[self.target_entity].df[self.target_label_column_name]) - entity_set[self.target_entity].delete_variable(self.target_label_column_name) - return (entity_set, self.target_entity, cutoff_times) - else: - raise ValueError( - 'Cutoff time label {} in table {} does not exist'.format( - 'created', self.target_entity)) - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) From 6b554eb694f778a5f93875d42675c8991a3ac679 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 23 Mar 2021 19:46:15 -0400 Subject: [PATCH 03/13] update docs --- .gitignore | 2 +- README.md | 6 +- docs/basic_concepts/advanced_use.rst | 55 ----------- docs/basic_concepts/auditing.rst | 37 ------- docs/basic_concepts/auto_featurization.rst | 10 -- docs/basic_concepts/auto_ml.rst | 96 ------------------- docs/basic_concepts/data_loading.rst | 66 ------------- docs/basic_concepts/index.rst | 18 ---- .../basic_concepts/machine_learning_tasks.rst | 66 ------------- docs/community/index.rst | 10 -- docs/conf.py | 2 +- .../contributing.rst | 3 +- docs/developer_guides/index.rst | 13 +++ .../welcome.rst | 2 +- docs/index.rst | 35 ++++--- docs/user_guides/auto_featurization.rst | 31 ++++++ docs/user_guides/auto_ml.rst | 60 ++++++++++++ docs/user_guides/data_assembler.rst | 86 +++++++++++++++++ 
docs/user_guides/data_labeler.rst | 53 ++++++++++ docs/user_guides/index.rst | 16 ++++ docs/user_guides/schemas.rst | 28 ++++++ 21 files changed, 318 insertions(+), 377 deletions(-) delete mode 100644 docs/basic_concepts/advanced_use.rst delete mode 100644 docs/basic_concepts/auditing.rst delete mode 100644 docs/basic_concepts/auto_featurization.rst delete mode 100644 docs/basic_concepts/auto_ml.rst delete mode 100644 docs/basic_concepts/data_loading.rst delete mode 100644 docs/basic_concepts/index.rst delete mode 100644 docs/basic_concepts/machine_learning_tasks.rst delete mode 100644 docs/community/index.rst rename docs/{community => developer_guides}/contributing.rst (97%) create mode 100644 docs/developer_guides/index.rst rename docs/{community => developer_guides}/welcome.rst (97%) create mode 100644 docs/user_guides/auto_featurization.rst create mode 100644 docs/user_guides/auto_ml.rst create mode 100644 docs/user_guides/data_assembler.rst create mode 100644 docs/user_guides/data_labeler.rst create mode 100644 docs/user_guides/index.rst create mode 100644 docs/user_guides/schemas.rst diff --git a/.gitignore b/.gitignore index 1d1447de..8a4abd9e 100644 --- a/.gitignore +++ b/.gitignore @@ -67,7 +67,7 @@ docs/_build/ docs/cardea.rst docs/cardea.*.rst docs/modules.rst -docs/api +docs/api_reference # PyBuilder target/ diff --git a/README.md b/README.md index 3acba885..3cb4ee5a 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ cardea.load_entityset(data='kaggle') To verify that the data has been loaded, you can find the loaded entityset by viewing ``cardea.es`` which should output the following: -```bash +``` Entityset: kaggle Entities: Address [Rows: 81, Columns: 2] @@ -110,7 +110,7 @@ label_times = cardea.select_problem('MissedAppointment') ``label_times`` summarizes for each instance in the dataset (1) what is its corresponding label of the instance and (2) what is the time index that indicates the timespan allowed for calculating features that pertain 
to each instance in the dataset. ```bash - cutoff_time instance_id label + time instance_id label 0 2015-11-10 07:13:56 5030230 noshow 1 2015-12-03 08:17:28 5122866 fulfilled 2 2015-12-07 10:40:59 5134197 fulfilled @@ -165,7 +165,7 @@ If you use Cardea for your research, please consider citing the following paper: Sarah Alnegheimish; Najat Alrashed; Faisal Aleissa; Shahad Althobaiti; Dongyu Liu; Mansour Alsaleh; Kalyan Veeramachaneni. [Cardea: An Open Automated Machine Learning Framework for Electronic Health Records](https://arxiv.org/abs/2010.00509). [IEEE DSAA 2020](https://ieeexplore.ieee.org/document/9260104). -```bash +``` @inproceedings{alnegheimish2020cardea, title={Cardea: An Open Automated Machine Learning Framework for Electronic Health Records}, author={Alnegheimish, Sarah and Alrashed, Najat and Aleissa, Faisal and Althobaiti, Shahad and Liu, Dongyu and Alsaleh, Mansour and Veeramachaneni, Kalyan}, diff --git a/docs/basic_concepts/advanced_use.rst b/docs/basic_concepts/advanced_use.rst deleted file mode 100644 index 09df92a7..00000000 --- a/docs/basic_concepts/advanced_use.rst +++ /dev/null @@ -1,55 +0,0 @@ -Advanced use -============ - -How to define a new machine learning task? ------------------------------------------- - -The definition of a new Machine Learning task in Cardea can be made in four simple steps: - -1. Go to the `problem_definition`_ directory and create a file with a class specifically for - your problem. This class should extend the `ProblemDefinition`_ class and overwrites - accordingly the necessary attributes and methods as needed. Usually, you should pay special - attention to the ``generate_target_label(...)`` and ``generate_cutoff_times(...)`` methods - as you might need to extend them or re-implemented in some cases. - -2. Expose your new class definition in the `init`_ file inside the `problem_definition`_ directory - -3. 
If you will be using a dataset in a different format that the expected by Cardea (CSV files), - then you will need to provide a specific loading dataset method for your data in the - `EntitySetLoader`_ class, where you will be creating your collection of entities and - relationships between them using the `featuretools.EntitySet`_ class. - -4. Finally, you need to update the `Cardea`_ class to support the new problem definition and be - able to instantiate the proper class when it is necessary in the ``Cardea.select_problem(...)`` - method. - -Features, primitives and AutoML integration -------------------------------------------- - -Once you have defined your problem, following the four steps in the previous section, you will be -able to perform featurization and run different primitives using the AutoML tool as follows: - -.. code-block:: python - - from cardea import Cardea - cardea = Cardea() - cardea.load_your_custom_data() - problem = cardea.select_problem('YourCustomProblemDefinition') - feature_matrix = cardea.generate_features(problem[:1000]) # a subset - feature_matrix = feature_matrix.sample(frac=1) # shuffle - y = list(feature_matrix.pop('label')) - X = feature_matrix.values - pipeline = [ - ['sklearn.ensemble.RandomForestClassifier'], - ['sklearn.naive_bayes.MultinomialNB'], - ['sklearn.neighbors.KNeighborsClassifier'] - ] - result = cardea.execute_model(feature_matrix=X, target=y, primitives=pipeline) - - -.. _featuretools.EntitySet: https://docs.featuretools.com/generated/featuretools.EntitySet.html#featuretools.EntitySet -.. _problem_definition: https://github.com/D3-AI/Cardea/tree/master/cardea/problem_definition -.. _ProblemDefinition: https://github.com/D3-AI/Cardea/blob/master/cardea/problem_definition/definition.py -.. _init: https://github.com/D3-AI/Cardea/blob/master/cardea/problem_definition/__init__.py -.. _EntitySetLoader: https://github.com/D3-AI/Cardea/blob/master/cardea/data_loader/entityset_loader.py#L9 -.. 
_Cardea: https://github.com/D3-AI/Cardea/blob/master/cardea/cardea.py diff --git a/docs/basic_concepts/auditing.rst b/docs/basic_concepts/auditing.rst deleted file mode 100644 index 4458f2ac..00000000 --- a/docs/basic_concepts/auditing.rst +++ /dev/null @@ -1,37 +0,0 @@ -Auditing -======== - -One element that is essential to prediction problems is the evaluation of the prediction results, -but this might come in various forms and users rely on different metrics to identify the best -model for a specific problem. Commonly, some metrics might be more representative than others -depending on problem. - -Therefore, to facilitate the auditing process, Cardea has two components designed specifically -to cover both: data and model auditing, given that prediction problems rely mainly on the data -that is being used. While Cardea provides a set of metrics that can be used as default metrics -for certain prediction problems, it also provides the means to expand them and allow users to -introduce new kind of metrics. - -Using Cardea, users have the ability to generate a data summary report describing the data through -the Data Auditor, enhancing users' understandability and engagement. Although the system includes -a set of predefined audits that are commonly applied in the literature, they can also specify special -types of audits that they want to apply on their dataset, using a dictionary of all the possible checks -that must be reported. - -These checks are divided in two categories: **data quality checks** and **data representation checks**. While -the data quality checks identifies the missing information in the data; the data representation checks -identifies data represents the users assumptions. - -Similarly, Cardea provides full report to users describing the performance and behavior of the model with -the `Model Auditor`_ component, aiming to give users more interpretability and understanding of the machine -learning model. 
- -Currently, prediction problems are categorized in regression or classification problems and each of them -has a wide range of metrics (e.g., accuracy, F1 scores, precision recall, AUC for classification and -mean square errors, mean absolute errors and r squared for regression). - -Additionally, given that Cardea provides the ability to run different pipelines composed of different -types of machine learning algorithms, the Model Auditor allows to compare multiple prediction -pipelines and evaluate changes in their behavior using different training and testing data sets. - -.. _Model Auditor: https://github.com/HDI-Project/ModelAudit diff --git a/docs/basic_concepts/auto_featurization.rst b/docs/basic_concepts/auto_featurization.rst deleted file mode 100644 index 725f26d4..00000000 --- a/docs/basic_concepts/auto_featurization.rst +++ /dev/null @@ -1,10 +0,0 @@ -Auto - Featurization -==================== - -Cardea automatically generates features using the `Featuretools`_ package, specifically, -the `Deep Feature Synthesis (DFS)`_ algorithm to generate a feature matrix from a given dataset. -Aiming to fully automate this process, it determines the focus values of the automated feature engineering -task using the **target entity**, **cutoff times**, and **label** of the prediction problem. - -.. _Featuretools: https://www.featuretools.com/ -.. _Deep Feature Synthesis (DFS): https://docs.featuretools.com/automated_feature_engineering/afe.html#deep-feature-synthesis diff --git a/docs/basic_concepts/auto_ml.rst b/docs/basic_concepts/auto_ml.rst deleted file mode 100644 index 0e244072..00000000 --- a/docs/basic_concepts/auto_ml.rst +++ /dev/null @@ -1,96 +0,0 @@ -Auto - ML -========= - -Cardea makes use of two packages to automate and simplify the modeling step in the Machine -Learning tasks: `MLPrimitives`_ and `MLBlocks`_. 
- -MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning -tools developed in Python, whether they are custom developments or belong to third party -libraries, and build Pipelines out of them that can be fitted and then used to make predictions. -This is achieved by providing a simple and intuitive annotation language that allows the user to -specify how to integrate with each tool, called **primitives**, in order to provide a common uniform -interface to each one of them. - -In the other hand, MLPrimitives is a repository containing primitive annotations to be used by the -MLBlocks library. - -Thanks to the use of these two packages, the Machine Learning algorithm selection and the -hyper-parameter tuning steps can be done easily using JSON annotations as follow: - -.. code-block:: python - - pipeline = [ - ['sklearn.ensemble.RandomForestClassifier'], - ['sklearn.naive_bayes.MultinomialNB'], - ['sklearn.neighbors.KNeighborsClassifier'] - ] - result = cardea.execute_model(..., primitives=pipeline) - -Where, for example, the ``sklearn.naive_bayes.MultinomialNB`` primitive is defined in the -`MLPrimitives`_ package, with the following structure: - -.. 
code-block:: python - - { - "name": "sklearn.naive_bayes.MultinomialNB", - "contributors": [...], - "documentation": "...", - "description": "...", - "classifiers": { - "type": "estimator", - "subtype": "classifier" - }, - "modalities": ["text"], - "primitive": "sklearn.naive_bayes.MultinomialNB", - "fit": { - "method": "fit", - "args": [ - { - "name": "X", - "type": "ndarray" - }, - { - "name": "y", - "type": "array" - } - ] - }, - "produce": { - "method": "predict", - "args": [ - { - "name": "X", - "type": "ndarray" - } - ], - "output": [ - { - "name": "y", - "type": "array" - } - ] - }, - "hyperparameters": { - "fixed": { - "fit_prior": { - "type": "bool", - "default": true - }, - "class_prior": { - "type": "iterable", - "default": null - } - }, - "tunable": { - "alpha": { - "type": "float", - "default": 1.0, - "range": [0.0, 1.0] - } - } - } - } - - -.. _MLPrimitives: https://hdi-project.github.io/MLPrimitives/ -.. _MLBlocks: https://hdi-project.github.io/MLBlocks/ diff --git a/docs/basic_concepts/data_loading.rst b/docs/basic_concepts/data_loading.rst deleted file mode 100644 index 73e06efd..00000000 --- a/docs/basic_concepts/data_loading.rst +++ /dev/null @@ -1,66 +0,0 @@ -Data Loading -============ - -Cardea makes use of a module to plugin the user's data and automatically organize it into the framework. -It expects data in Fast Healthcare Interoperability Resources (FHIR), a standard for health care data -exchange, published by HL7®. Among the advantages of FHIR over other standards are: - -* Fast and easy to implement -* Specification is free for use with no restrictions -* Strong foundation in Web standards: XML, JSON, HTTP, OAuth, etc. 
-* Support for RESTful architectures -* Concise and easily understood specifications -* A human-readable serialization format for ease of use by developers - -By default, Cardea loads a dataset hosted in `Amazon S3`_, representing a formatted version of the -Kaggle dataset: `Medical Appointment No Shows`_, but it also allows user to load datasets providing a -local path with CSV files, using the ``load_data_entityset(...)`` method. As an example, the following piece -of code will load the default Kaggle dataset: - -.. ipython:: python - - from cardea import Cardea - cardea = Cardea() - cardea.load_entityset(data='kaggle') - -While local files can be loaded using the same method with a ``data`` parameter: - -.. code-block:: python - - cardea.load_entityset(data="your/local/path/") - -Cardea handles datasets as a collection of entities and the relationships between them because they -are useful for preparing raw, structured datasets for feature engineering. For this, it uses -the `featuretools.EntitySet`_ class. - -Using the following command, you will be able to summarize the dataset: - -.. 
code-block:: python - - cardea.es - Entityset: fhir - Entities: - Address [Rows: 81, Columns: 2] - Appointment_Participant [Rows: 6100, Columns: 2] - Appointment [Rows: 110527, Columns: 5] - CodeableConcept [Rows: 4, Columns: 2] - Coding [Rows: 3, Columns: 2] - Identifier [Rows: 227151, Columns: 1] - Observation [Rows: 110527, Columns: 3] - Patient [Rows: 6100, Columns: 4] - Reference [Rows: 6100, Columns: 1] - Relationships: - Appointment_Participant.actor -> Reference.identifier - Appointment.participant -> Appointment_Participant.object_id - CodeableConcept.coding -> Coding.object_id - Observation.code -> CodeableConcept.object_id - Observation.subject -> Reference.identifier - Patient.address -> Address.object_id - -Showing, in this case, the resources that were loaded into the framework (**Entities** section) -and the relationship between the resources (**Relationships** section). - - -.. _Amazon S3: https://s3.amazonaws.com/dai-cardea/ -.. _Medical Appointment No Shows: https://www.kaggle.com/joniarroba/noshowappointments -.. _featuretools.EntitySet: https://docs.featuretools.com/generated/featuretools.EntitySet.html#featuretools.EntitySet diff --git a/docs/basic_concepts/index.rst b/docs/basic_concepts/index.rst deleted file mode 100644 index cf0ebc8d..00000000 --- a/docs/basic_concepts/index.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. _concepts: - -Basic Concepts -============== - -Before diving into advanced usage and contributions, let's review the basic concepts of the -library to help you get started. - - -.. 
toctree:: - :maxdepth: 3 - - data_loading - machine_learning_tasks - auto_featurization - auto_ml - auditing - advanced_use diff --git a/docs/basic_concepts/machine_learning_tasks.rst b/docs/basic_concepts/machine_learning_tasks.rst deleted file mode 100644 index f3757aa5..00000000 --- a/docs/basic_concepts/machine_learning_tasks.rst +++ /dev/null @@ -1,66 +0,0 @@ -Machine Learning Tasks -====================== - -The Problem Definition is considered a fundamental component that formulates the task for -Machine Learning models. It includes generating and identifying two main concepts: -the **target variable** and the **cutoff times**. - -Therefore, the first step to work with Cardea is defining a Machine Learning Task (or using one -of the already defined tasks). For example, **Missed Appointment** is a common task that aims -to predict whether the patient showed to the appointment or not, helping hospitals to optimize -their scheduling policies and resources efficiently. - -Outcome to predict ------------------- - -Following with the previous example, the **Missed Appointment** task is currently defined as -a binary classification task in the system, determining whether a patient showed to the appointment -or not from the point of appointment scheduling. - -Usually, the outcome is defined over the FHIR data schema, using the resource id values for -references between instances. - -Cutoff times and Labels ------------------------ - -As it was stated before, the success of the Problem Definition step and its outcome depends on -two main concepts: the **target variable** and the **cutoff times**. The target variable is -generated automatically by Cardea if it does not exist in the dataset and its objective is to -set the definition of the model output. In the other hand, the objective of cutoff times is to -split the data in such manner that any events before the cutoff time are used for training while -events after the cutoff time are used for testing. 
The following code shows the format for these -values in the **Missed Appointment** task: - -.. ipython:: python - - from cardea import Cardea - cardea = Cardea() - cardea.load_entityset(data='kaggle') - cardea.select_problem('MissedAppointment') - -Current Prediction Problems ---------------------------- - -Cardea encapsulates six different prediction problems for users to explore easily, -these are described as follows: - -1. Diagnosis Prediction: - a. Predicts whether a patient will be diagnosed with a specified diagnosis. -2. Length of Stay: - a. Predicts how many days the patient will be in the hospital. -3. Missed Appointment: - a. Predicts whether the patient showed to the appointment or not. -4. Mortality Prediction: - a. Predicts whether a patient will suffer from mortality. -5. Prolonged Length of Stay: - a. Predicts whether a patient stayed in the hospital more or less than a period of time (a week by default). -6. Readmission: - a. Predicts whether a patient will revisit the hospital within certain period of time (a month by default). - -You can see the list of problems using the ``list_problems(...)`` method, example: - -.. ipython:: python - - from cardea import Cardea - cardea = Cardea() - cardea.list_problems() diff --git a/docs/community/index.rst b/docs/community/index.rst deleted file mode 100644 index 3c34c3a3..00000000 --- a/docs/community/index.rst +++ /dev/null @@ -1,10 +0,0 @@ -Community -================ - -Cardea is a community driven effort, so it relies on contributions from the community. Therefore, every contribution is welcome, and they are greatly appreciated! Every little bit helps, and credit will always be given. - -.. 
toctree:: - :maxdepth: 2 - - welcome - contributing \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 8465a602..6de35f01 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -91,7 +91,7 @@ copyright = u"2018, MIT Data To AI Lab" author = u"MIT Data To AI Lab" description = 'Automated Machine Learning on Electronic Health Records' -user = 'DAI-Lab' +user = 'MLBazaar' # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout diff --git a/docs/community/contributing.rst b/docs/developer_guides/contributing.rst similarity index 97% rename from docs/community/contributing.rst rename to docs/developer_guides/contributing.rst index f73491fc..25722afa 100644 --- a/docs/community/contributing.rst +++ b/docs/developer_guides/contributing.rst @@ -5,8 +5,7 @@ Contributing Guidelines Ready to contribute with your own code? Great! -Before diving deeper into the contributing guidelines, please make sure to having read -the :ref:`concepts` section and to have gone through the development guide. +Before diving deeper into the contributing guidelines, please make sure to have read the :ref:`user_guides` section and to have gone through the :ref:`development` setup. Afterwards, please make sure to read the following contributing guidelines carefully, and later on head to the step-by-step guides for each possible type of contribution. diff --git a/docs/developer_guides/index.rst b/docs/developer_guides/index.rst new file mode 100644 index 00000000..e2f7567d --- /dev/null +++ b/docs/developer_guides/index.rst @@ -0,0 +1,13 @@ +.. _developer_guides: + +================ +Developer Guides +================ + +In the Developer Guides we discuss in depth the architecture of the Cardea project and the related libraries, while also providing clear instructions about how to extend its development to better adapt it to your needs or to contribute to the development of the libraries.
+ +.. toctree:: + :maxdepth: 2 + + welcome + contributing \ No newline at end of file diff --git a/docs/community/welcome.rst b/docs/developer_guides/welcome.rst similarity index 97% rename from docs/community/welcome.rst rename to docs/developer_guides/welcome.rst index e67ac144..12f58a02 100644 --- a/docs/community/welcome.rst +++ b/docs/developer_guides/welcome.rst @@ -19,7 +19,7 @@ Reporting Issues ~~~~~~~~~~~~~~~~ If there is something that you would like to see changed in the project, or that you just want -to ask, please create an issue at https://github.com/D3-AI/Cardea/issues +to ask, please create an issue at https://github.com/MLBazaar/Cardea/issues If you do so, please: diff --git a/docs/index.rst b/docs/index.rst index 98f68b93..e30fc4ca 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,7 +5,7 @@ An open source project from Data to AI Lab at MIT.

-|Development Status| |PyPi Shield| |Run Tests| |Downloads| |Binder| +|Development Status| |PyPi Shield| |Run Tests Shield| |Downloads| |Binder| Welcome to Cardea ================== @@ -34,28 +34,41 @@ Cardea is a machine learning library built on top of *schemas* that support elec Our goal is to provide an easy to use library to develop machine learning models from electronic health records. A typical usage of this library will involve interacting with our API to develop prediction models. +Machine Learning Process +~~~~~~~~~~~~~~~~~~~~~~~~ +Cardea is composed of a series of sequential processes that are applied to organize, structure, and build machine learning models on electronic health records datasets. These processes are visualized in the following diagram, where each block represents a process and the output of that process will be used by the succeeding block. + .. figure:: images/cardea-process.png :width: 600 px :alt: Cardea Process -A series of sequential processes are applied to build a machine learning model. These processes are triggered using our following APIs to perform the following: +Diving into this diagram more thoroughly: + +* we first load the desired data using the **data assembler** to generate an entityset representation of the data. The entityset datastructure contains the entities (tables) and the relationships that occur between these tables. Read more about the :ref:`data_assembler`. + +* next, you can investigate the given entityset and decide which prediction problem you wish to solve by using the **data labeler**. Based on the desired prediction problem, cardea creates ``label_times`` which is a data representation, specifically a ``pandas.DataFrame`` that contains three columns: + + * an *instance id* that is unique per row. + * a *time index* that indicates the timespan in which I can use the data in that timespan to generate the corresponding features for the associated instance. 
+ * a *label* that denotes what the framework is trying to predict given the selected problem. + +You can read more about the :ref:`data_labeler`. It is important to note that ``label_times`` is an essential input to the featurization process. -* loading data using the automatic **data assembler**, where we capture data from its raw format into an entityset representation. -* **data labeling** where we create label times that generates (1) the time index that indicates the timespan for which I create my features (2) the encoded labels of the prediction task. this is essential for our feature engineering phase. +* then we can automatically engineer features of our entityset using the **featurizer** by supplying ``label_times``. This will generate a ``feature_matrix`` that contains the instance, its extracted features, and its label. Visit :ref:`featurizer` for more information. -* **featurization** for which we automatically feature engineer our data to generate a feature matrix. +* lastly comes the **modeling** process. In this block, we use the generated ``feature_matrix`` to train our model, tune it, and then assess its performance. More on pipeline training and hyperparameter tuning is provided in the :ref:`modeler` section. -* lastly, we build, train, and tune our machine learning model using the **modeling component**. +This was a quick overview on how we designed the cardea framework. For further details on each process and the data structures in each block, please visit the page of the corresponding process. Explore Cardea -------------- * `Getting Started `_ -* `Basic Concepts `_ +* `User Guides `_ * `API Reference `_ -* `Community `_ +* `Developer Guides `_ * `Release Notes `_ -------------- @@ -64,7 +77,7 @@ Explore Cardea :target: https://pypi.org/search/?c=Development+Status+%3A%3A+2+-+Pre-Alpha .. |PyPi Shield| image:: https://img.shields.io/pypi/v/cardea.svg :target: https://pypi.python.org/pypi/cardea -.. 
|Run Tests| image:: https://github.com/MLBazaar/Cardea/workflows/Run%20Tests/badge.svg +.. |Run Tests Shield| image:: https://github.com/MLBazaar/Cardea/workflows/Run%20Tests/badge.svg :target: https://github.com/MLBazaar/Cardea/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster .. |Downloads| image:: https://pepy.tech/badge/cardea :target: https://pepy.tech/project/cardea @@ -78,9 +91,9 @@ Explore Cardea :titlesonly: getting_started/index - basic_concepts/index + user_guides/index api_reference/index - community/index + developer_guides/index Release Notes .. _FHIR: https://www.hl7.org/fhir/ diff --git a/docs/user_guides/auto_featurization.rst b/docs/user_guides/auto_featurization.rst new file mode 100644 index 00000000..a0fd10f5 --- /dev/null +++ b/docs/user_guides/auto_featurization.rst @@ -0,0 +1,31 @@ +.. _featurizer: + +========== +Featurizer +========== + +Cardea automatically generates features using the `Featuretools`_ package, specifically, the `Deep Feature Synthesis (DFS)`_ algorithm to generate a feature matrix from a given dataset. Aiming to fully automate this process, it determines the focus values of the automated feature engineering +task using the **target entity**, and **label times** of the prediction problem created by :ref:`data_labeler`. + +Once you featurize the data, you will obtain a feature matrix, where each row pertains to a specific ``instance_id`` defined in the ``label_times``, and a collection of calculated features. + +Featurizing Demo +---------------- + +We can continue on our example walkthrough and generate features on the Missed Appointment dataset. + +.. ipython:: python + :okwarning: + + from cardea import Cardea + cardea = Cardea() + cardea.load_entityset(data='kaggle') + label_times = cardea.select_problem('MissedAppointment') + feature_matrix = cardea.generate_features(label_times[:1000]) + feature_matrix.head() + +.. 
note:: + the last column in the feature matrix is the ``label`` column which denotes the value we want to predict based on the selected prediction task. + +.. _Featuretools: https://www.featuretools.com/ +.. _Deep Feature Synthesis (DFS): https://docs.featuretools.com/automated_feature_engineering/afe.html#deep-feature-synthesis diff --git a/docs/user_guides/auto_ml.rst b/docs/user_guides/auto_ml.rst new file mode 100644 index 00000000..b51a468f --- /dev/null +++ b/docs/user_guides/auto_ml.rst @@ -0,0 +1,60 @@ +.. _modeler: + +======= +Modeler +======= + +Cardea makes use of two packages to automate and simplify the modeling step in the Machine +Learning tasks: `MLPrimitives`_ and `MLBlocks`_. + +MLBlocks is a simple framework for seamlessly combining any possible set of Machine Learning tools developed in Python, whether they are custom developments or belong to third party libraries, and build Pipelines out of them that can be fitted and then used to make predictions. +This is achieved by providing a simple and intuitive annotation language that allows the user to specify how to integrate with each tool, called **primitives**, in order to provide a common uniform interface to each one of them. + +On the other hand, *MLPrimitives* is a repository containing primitive annotations to be used by the *MLBlocks* library. + +Thanks to the use of these two packages, the Machine Learning algorithm selection and the hyper-parameter tuning steps can be done easily. + +Modeling Demo +------------- + +Continuing from the previous example of *Missed Appointments*, let’s divide our ``feature_matrix`` into training and testing portions. + +.. 
ipython:: python + :okwarning: + + from cardea import Cardea + cardea = Cardea() + cardea.load_entityset(data='kaggle') + label_times = cardea.select_problem('MissedAppointment') + feature_matrix = cardea.generate_features(label_times[:1000]) + + # split the data + y = list(feature_matrix.pop('label')) + X = feature_matrix.values + X_train, X_test, y_train, y_test = cardea.train_test_split( + X, y, test_size=0.2, shuffle=True) + +Second, we specify the pipeline we want to use for our prediction. Cardea has a number of pre-created pipelines which you can find in `cardea pipelines `__. We can then use the modeler component to help us train, tune, and select the best version of the pipeline. + +.. ipython:: python + + cardea.select_pipeline('Random Forest') + cardea.fit(X_train, y_train) + y_pred = cardea.predict(X_test) + +.. note:: + you can set ``tune=True`` to optimize the hyperparameters of the pipeline during the ``fit`` process. + + +Additionally, you can use ``fit_predict`` to train the pipeline then make predictions directly on the same dataset. + +We can also evaluate the performance of the pipeline. You can use ``cardea.evaluate`` that will compare the predicted labels against the ground truth according to a list of given metrics. + +.. ipython:: python + + cardea.evaluate(X_test, y_test) + +Metrics used are developed by `sklearn `__. By default classification metrics include: accuracy, f1 score, precision, and recall. On the other hand, regression metrics are shown through: variance score, mean absolute error, mean squared error, mean squared log error, median absolute error, and r2 score. + +.. _MLPrimitives: https://MLBazaar.github.io/MLPrimitives/ +.. _MLBlocks: https://MLBazaar.github.io/MLBlocks/ diff --git a/docs/user_guides/data_assembler.rst b/docs/user_guides/data_assembler.rst new file mode 100644 index 00000000..adcb7158 --- /dev/null +++ b/docs/user_guides/data_assembler.rst @@ -0,0 +1,86 @@ +.. 
_data_assembler: + +============== +Data Assembler +============== + +Cardea makes use of a module to plug in the user’s data and automatically organize it into the framework. It is built on top of schemas that support electronic health records (EHR). One of the schemas we support is Fast Healthcare Interoperability Resources (FHIR) schema. You can read more about the supported :ref:`schemas`. + +Cardea expects the raw data to be a folder pointing to data in ``.csv`` format to ingest. Each table/resource should correspond to a single ``.csv`` file which is then directly fed into cardea using ``load_entityset``. + +Entityset +--------- + +Entityset represents the data structure produced by the data assembler module. The process organizes the data into its corresponding table/resource within the schema and produces an entityset. Generally, the entityset is a collection of entities and relationships: + +* **entities** are used to prepare the data (tables), into a structured input for later usage. It contains a ``pandas.DataFrame`` at its core, with meta information indicating the index column, time columns, and other information. +* **relationships** indicate the connection between two entities. It represents how for an entity *A* with primary key *1*, there is another entity *B* with foreign key *2* that references it. The relationship ``B.2 -> A.1`` ties them together. This parent-child relationship is embedded within the entityset. + +To read more about entitysets, visit `EntitySet`_. + + +Data Assembling Demo +-------------------- + +Let's start first by looking at some raw data. Here, in this example, we have the Kaggle dataset: `Medical Appointment No Shows`_ already preprocessed to be representative of the FHIR schema. You can download the dataset directly from `Amazon S3`_ or you can run the following command to download it and unzip it: + +.. 
code-block:: console + + curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip -d kaggle kaggle.zip + +then you would have the following directory + +.. code-block:: console + + kaggle + ├── Coding.csv + ├── Appointment_Participant.csv + ├── Address.csv + ├── CodeableConcept.csv + ├── Reference.csv + ├── Observation.csv + ├── Identifier.csv + └── Appointment.csv + +.. note:: + notice how the file names correspond to the resource name in FHIR. + +Then you can directly load the dataset into cardea by supplying the folder path to the ``load_entityset``. + + +.. ipython:: python + + from cardea import Cardea + cardea = Cardea() + cardea.load_entityset(data='kaggle') + +We can investigate what the entityset looks like by simply displaying the entityset through ``es``. + +.. ipython:: python + + cardea.es + +.. note:: + you can use ``cardea.es.plot()`` to visualize your entityset. + +Showing above are two sections: + +* **Entities** where we can see the resources loaded. For example, *Appointment* is a resource in FHIR that contains most of the records of this dataset and contains 5 columns. +* **Relationships** where we can see the parent-child relationship. For example, ``Appointment.participant`` is the column ``participant`` in table ``Appointment`` that references the primary key ``object_id`` in table ``Appointment_Participant``. + + +We will utilize this structure to develop our :ref:`data_labeler` and :ref:`featurizer`. + + +FAQ +--- + +1. **What schemas do we support right now?** +We currently support two :ref:`schemas`, Fast Healthcare Interoperability Resources (FHIR), and Medical Information Mart for Intensive Care III (MIMIC-III). +2. **What if I only have a subset of tables?** +Cardea seamlessly integrates the available data, dropping missing variables and links. Having only a subset of the data does not preclude you from solving a prediction problem if all the necessary information is still present. + + +.. 
_EntitySet: https://featuretools.alteryx.com/en/stable/api_reference.html#entityset-entity-relationship-variable-types +.. _Amazon S3: https://dai-cardea.s3.amazonaws.com/kaggle.zip +.. _Medical Appointment No Shows: https://www.kaggle.com/joniarroba/noshowappointments diff --git a/docs/user_guides/data_labeler.rst b/docs/user_guides/data_labeler.rst new file mode 100644 index 00000000..fe296290 --- /dev/null +++ b/docs/user_guides/data_labeler.rst @@ -0,0 +1,53 @@ +.. _data_labeler: + +============ +Data Labeler +============ + +The data labeler is considered a fundamental component that formulates the prediction task for Machine Learning models. It includes generating and identifying **label times**. In this page, we detail what label times are. + +Label Times +----------- + +After loading the data, you will need to define the prediction task you want to solve (or use one of the already defined tasks). For example, *Missed Appointment* is a common task that aims to predict whether the patient showed up to the appointment or not, helping hospitals to optimize their scheduling policies and resources efficiently. So how do we formulate a prediction task? + +First, you will need to articulate the **label** (outcome) you want to predict. Continuing with the previous example, the *Missed Appointment* task is currently defined as a binary classification task in the system, determining whether a patient showed up to the appointment or not. + +Next, you will need to determine the **time** over which you define your features. Continuing with the previous example of *Missed Appointments*, I would like to predict whether the patient will show up to the appointment or not from the point of scheduling the appointment. In other words, I would use all the data I could get up until the time when the appointment was scheduled as data for featurization. + +Lastly, you will determine the entity that contains this piece of information, the **target entity**. 
Combining this information together, we get ``label_times``. + + +Available Prediction Problems +----------------------------- +There are currently six readily available prediction problems for users to explore easily; these are described as follows: + +* **Diagnosis prediction**: predicts whether a patient will be diagnosed with a given ICD diagnosis code. +* **Length of Stay (LOS) prediction**: predicts how many days the patient will be in the hospital. +* **Prolonged Length of Stay (PLOS) prediction**: predicts whether a patient stayed in the hospital more or less than a period of time (a week by default). +* **Missed Appointment prediction**: predicts whether the patient will show up to the appointment or not. +* **Mortality prediction**: predicts the patient’s mortality. +* **Readmission prediction**: predicts whether a patient will revisit the hospital within a certain period of time (a month by default). + + +Data Labeling Demo +------------------ + +Continuing from :ref:`data_assembler`, we can now use ``select_problem`` with the desired prediction problem to generate ``label_times``. + +.. ipython:: python + + from cardea import Cardea + cardea = Cardea() + cardea.load_entityset(data='kaggle') + label_times = cardea.select_problem('MissedAppointment') + label_times.head() + +.. note:: + you can use ``cardea.list_problems()`` to view available prediction problems. + + +Creating New Prediction Problems +-------------------------------- + +Coming Soon. diff --git a/docs/user_guides/index.rst b/docs/user_guides/index.rst new file mode 100644 index 00000000..8e468d30 --- /dev/null +++ b/docs/user_guides/index.rst @@ -0,0 +1,16 @@ +.. _user_guides: + +=========== +User Guides +=========== + +In the user guide, we go through some of the main concepts needed to understand how the framework is built and what the underlying data structures used to make this framework possible are. + +.. 
toctree:: + :maxdepth: 3 + + data_assembler + data_labeler + auto_featurization + auto_ml + schemas diff --git a/docs/user_guides/schemas.rst b/docs/user_guides/schemas.rst new file mode 100644 index 00000000..7ef2046e --- /dev/null +++ b/docs/user_guides/schemas.rst @@ -0,0 +1,28 @@ +.. _schemas: + +======= +Schemas +======= + +Cardea is built on top of schemas that support electronic health records (EHR). In this page, we list the currently supported schemas. + + +Fast Healthcare Interoperability Resources (FHIR) +------------------------------------------------- + +`Fast Healthcare Interoperability Resources (FHIR) `__, is a standard for health care data exchange, published by HL7®. Among the advantages of FHIR over other standards are: + +* Fast and easy to implement +* Specification is free for use with no restrictions +* Strong foundation in Web standards: XML, JSON, HTTP, OAuth, etc. +* Support for RESTful architectures +* Concise and easily understood specifications +* A human-readable serialization format for ease of use by developers + +MIMIC-III +--------- + +`MIMIC-III `__, a freely accessible critical care database. + +Coming soon. 
+ From 35ab5c132994f4a3d184b2b09de1b041a735b85c Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 30 Mar 2021 01:10:26 -0400 Subject: [PATCH 04/13] change to compose (wip) --- cardea/core.py | 26 +- cardea/data_assembler/__init__.py | 4 - cardea/data_assembling/__init__.py | 5 + .../data_loader.py | 0 .../entityset_loader.py | 7 +- .../load_mimic.py | 0 .../schema.xml | 0 cardea/data_labeling/__init__.py | 23 +- cardea/data_labeling/definition.py | 10 +- cardea/data_labeling/length_of_stay.py | 52 ++- cardea/data_labeling/mortality_prediction.py | 174 +++------- cardea/data_labeling/predicting_diagnosis.py | 178 +++------- .../data_labeling/prolonged_length_of_stay.py | 169 ---------- cardea/data_labeling/readmission.py | 205 +++--------- .../data_labeling/show_noshow_appointment.py | 13 +- cardea/data_labeling/utils.py | 52 +++ cardea/data_loader/__init__.py | 11 - cardea/data_loader/entityset_loader.py | 127 ------- cardea/data_loader/load_mimic.py | 139 -------- cardea/featurization/__init__.py | 7 - cardea/featurizing/__init__.py | 7 + .../featurization.py | 0 cardea/problem_definition/definition.py | 247 -------------- cardea/problem_definition/length_of_stay.py | 150 --------- .../show_noshow_appointment.py | 76 ----- .../__init__.py | 0 .../test_data_loader.py | 2 +- .../test_entityset_loader.py | 2 +- .../test_load_mimic.py | 2 +- .../__init__.py | 0 tests/cardea/data_labeling/test_definition.py | 11 + .../data_labeling/test_length_of_stay.py | 10 + .../test_mortality_prediction.py | 9 + .../test_predicting_diagnosis.py | 9 + .../cardea/data_labeling/test_readmission.py | 9 + .../test_show_noshow_appointment.py | 9 + .../__init__.py | 0 .../test_featurization.py | 4 +- .../problem_definition/test_definition.py | 82 ----- .../problem_definition/test_length_of_stay.py | 306 ----------------- .../test_mortality_prediction.py | 268 --------------- .../test_predicting_diagnosis.py | 267 --------------- .../test_prolonged_length_of_stay.py | 313 
------------------ .../problem_definition/test_readmission.py | 235 ------------- .../test_show_noshow_appointment.py | 156 --------- 45 files changed, 316 insertions(+), 3060 deletions(-) delete mode 100644 cardea/data_assembler/__init__.py create mode 100644 cardea/data_assembling/__init__.py rename cardea/{data_assembler => data_assembling}/data_loader.py (100%) rename cardea/{data_assembler => data_assembling}/entityset_loader.py (96%) rename cardea/{data_assembler => data_assembling}/load_mimic.py (100%) rename cardea/{data_assembler => data_assembling}/schema.xml (100%) delete mode 100644 cardea/data_labeling/prolonged_length_of_stay.py delete mode 100644 cardea/data_loader/__init__.py delete mode 100644 cardea/data_loader/entityset_loader.py delete mode 100644 cardea/data_loader/load_mimic.py delete mode 100644 cardea/featurization/__init__.py create mode 100644 cardea/featurizing/__init__.py rename cardea/{featurization => featurizing}/featurization.py (100%) delete mode 100644 cardea/problem_definition/definition.py delete mode 100644 cardea/problem_definition/length_of_stay.py delete mode 100644 cardea/problem_definition/show_noshow_appointment.py rename tests/cardea/{data_loader => data_assembling}/__init__.py (100%) rename tests/cardea/{data_loader => data_assembling}/test_data_loader.py (99%) rename tests/cardea/{data_loader => data_assembling}/test_entityset_loader.py (97%) rename tests/cardea/{data_loader => data_assembling}/test_load_mimic.py (90%) rename tests/cardea/{featurization => data_labeling}/__init__.py (100%) create mode 100644 tests/cardea/data_labeling/test_definition.py create mode 100644 tests/cardea/data_labeling/test_length_of_stay.py create mode 100644 tests/cardea/data_labeling/test_mortality_prediction.py create mode 100644 tests/cardea/data_labeling/test_predicting_diagnosis.py create mode 100644 tests/cardea/data_labeling/test_readmission.py create mode 100644 tests/cardea/data_labeling/test_show_noshow_appointment.py rename 
tests/cardea/{problem_definition => featurizing}/__init__.py (100%) rename tests/cardea/{featurization => featurizing}/test_featurization.py (94%) delete mode 100644 tests/cardea/problem_definition/test_definition.py delete mode 100644 tests/cardea/problem_definition/test_length_of_stay.py delete mode 100644 tests/cardea/problem_definition/test_mortality_prediction.py delete mode 100644 tests/cardea/problem_definition/test_predicting_diagnosis.py delete mode 100644 tests/cardea/problem_definition/test_prolonged_length_of_stay.py delete mode 100644 tests/cardea/problem_definition/test_readmission.py delete mode 100644 tests/cardea/problem_definition/test_show_noshow_appointment.py diff --git a/cardea/core.py b/cardea/core.py index edca198e..493a16b8 100644 --- a/cardea/core.py +++ b/cardea/core.py @@ -8,6 +8,7 @@ import pickle from inspect import isclass from io import BytesIO +from functools import partial from urllib.request import urlopen from zipfile import ZipFile @@ -15,12 +16,12 @@ import pandas as pd import cardea -from cardea.data_loader import EntitySetLoader, load_mimic_data -from cardea.featurization import Featurization +from cardea.data_assembling import EntitySetLoader, load_mimic_data +from cardea.featurizing import Featurization from cardea.modeling import Modeler -from cardea.problem_definition import ( - DiagnosisPrediction, LengthOfStay, MissedAppointment, MortalityPrediction, - ProlongedLengthOfStay, Readmission) +from cardea.data_labeling import ( + diagnosis_prediction, length_of_stay, appointment_no_show, mortality, + readmission) LOGGER = logging.getLogger(__name__) @@ -145,19 +146,17 @@ def select_problem(self, selection, parameter=None): # problem selection if selection == 'LengthOfStay': - self.chosen_problem = LengthOfStay() + self.chosen_problem = length_of_stay elif selection == 'MortalityPrediction': - self.chosen_problem = MortalityPrediction() + self.chosen_problem = mortality elif selection == 'MissedAppointment': - 
self.chosen_problem = MissedAppointment() - - elif selection == 'ProlongedLengthOfStay' and parameter: - self.chosen_problem = ProlongedLengthOfStay(parameter) + self.chosen_problem = appointment_no_show elif selection == 'ProlongedLengthOfStay': - self.chosen_problem = ProlongedLengthOfStay() + plos = partial(length_of_stay, parameter) + self.chosen_problem = plos elif selection == 'Readmission' and parameter: self.chosen_problem = Readmission(parameter) @@ -166,7 +165,8 @@ def select_problem(self, selection, parameter=None): self.chosen_problem = Readmission() elif selection == 'DiagnosisPrediction' and parameter: - self.chosen_problem = DiagnosisPrediction(parameter) + diag = partial(diagnosis_prediction, parameter) + self.chosen_problem = diag elif selection == 'DiagnosisPrediction': raise ValueError('unspecified diagnosis code') diff --git a/cardea/data_assembler/__init__.py b/cardea/data_assembler/__init__.py deleted file mode 100644 index d1cda93f..00000000 --- a/cardea/data_assembler/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# import logging - -from cardea.data_loader.data_loader import DataLoader, Diamond -from cardea.data_loader.entityset_loader import EntitySetLoader diff --git a/cardea/data_assembling/__init__.py b/cardea/data_assembling/__init__.py new file mode 100644 index 00000000..3e88baef --- /dev/null +++ b/cardea/data_assembling/__init__.py @@ -0,0 +1,5 @@ +# import logging + +from cardea.data_assembling.data_loader import DataLoader, Diamond +from cardea.data_assembling.entityset_loader import EntitySetLoader +from cardea.data_assembling.load_mimic import load_mimic_data diff --git a/cardea/data_assembler/data_loader.py b/cardea/data_assembling/data_loader.py similarity index 100% rename from cardea/data_assembler/data_loader.py rename to cardea/data_assembling/data_loader.py diff --git a/cardea/data_assembler/entityset_loader.py b/cardea/data_assembling/entityset_loader.py similarity index 96% rename from 
cardea/data_assembler/entityset_loader.py rename to cardea/data_assembling/entityset_loader.py index 809d9883..3c44b3e6 100644 --- a/cardea/data_assembler/entityset_loader.py +++ b/cardea/data_assembling/entityset_loader.py @@ -3,7 +3,7 @@ import featuretools as ft import pandas as pd -from cardea.data_loader import DataLoader, Diamond +from cardea.data_assembling import DataLoader, Diamond class EntitySetLoader(DataLoader): @@ -23,14 +23,15 @@ def create_entity(self, fhir, identifiers, entity_set): id = identifiers[object_name] df = df.apply(pd.to_numeric, errors='ignore') + df.columns = map(str.lower, df.columns) if object_name == 'Period': - entity_set.entity_from_dataframe(entity_id=str(object_name), + entity_set.entity_from_dataframe(entity_id=str(object_name).lower(), dataframe=df, index=id, time_index="start") else: - entity_set.entity_from_dataframe(entity_id=str(object_name), + entity_set.entity_from_dataframe(entity_id=str(object_name).lower(), dataframe=df, index=id) diff --git a/cardea/data_assembler/load_mimic.py b/cardea/data_assembling/load_mimic.py similarity index 100% rename from cardea/data_assembler/load_mimic.py rename to cardea/data_assembling/load_mimic.py diff --git a/cardea/data_assembler/schema.xml b/cardea/data_assembling/schema.xml similarity index 100% rename from cardea/data_assembler/schema.xml rename to cardea/data_assembling/schema.xml diff --git a/cardea/data_labeling/__init__.py b/cardea/data_labeling/__init__.py index 9e39a925..095de682 100644 --- a/cardea/data_labeling/__init__.py +++ b/cardea/data_labeling/__init__.py @@ -1,18 +1,7 @@ # import logging -from cardea.problem_definition.definition import ProblemDefinition -from cardea.problem_definition.length_of_stay import LengthOfStay -from cardea.problem_definition.mortality_prediction import MortalityPrediction -from cardea.problem_definition.predicting_diagnosis import DiagnosisPrediction -from cardea.problem_definition.prolonged_length_of_stay import ProlongedLengthOfStay 
-from cardea.problem_definition.readmission import Readmission -from cardea.problem_definition.show_noshow_appointment import MissedAppointment - -__all__ = ( - "ProblemDefinition", - "LengthOfStay", - "MortalityPrediction", - "DiagnosisPrediction", - "ProlongedLengthOfStay", - "Readmission", - "MissedAppointment" -) +from cardea.data_labeling.definition import DataLabeler +from cardea.data_labeling.length_of_stay import length_of_stay +from cardea.data_labeling.mortality_prediction import mortality +from cardea.data_labeling.predicting_diagnosis import diagnosis_prediction +from cardea.data_labeling.readmission import readmission +from cardea.data_labeling.show_noshow_appointment import appointment_no_show diff --git a/cardea/data_labeling/definition.py b/cardea/data_labeling/definition.py index 9daa8bf3..a4d587a9 100644 --- a/cardea/data_labeling/definition.py +++ b/cardea/data_labeling/definition.py @@ -10,13 +10,13 @@ class DataLabeler: as specifying the target labels. Args: - clf (function): + function (method): function that defines the labeling function, it should return a tuple of labeling function, the dataframe, and the name of the target entity. """ - def __init__(self, clf): - self.clf = clf + def __init__(self, function): + self.function = function def generate_label_times(self, es, *args, **kwargs): """Searches the data to calculate label times. @@ -32,7 +32,7 @@ def generate_label_times(self, es, *args, **kwargs): composeml.LabelTimes: Calculated labels with cutoff times. 
""" - labeling_function, df, meta = self.clf(es) + labeling_function, df, meta = self.function(es) kwargs = {**meta, **kwargs} target_entity = kwargs.get('target_entity') time_index = kwargs.get('time_index') @@ -47,6 +47,6 @@ def generate_label_times(self, es, *args, **kwargs): *args, **kwargs) if thresh is not None: - label_times.threshold(thresh) + label_times = label_times.threshold(thresh) return label_times, kwargs.get('entity') \ No newline at end of file diff --git a/cardea/data_labeling/length_of_stay.py b/cardea/data_labeling/length_of_stay.py index 9bdd7202..7bbd4ed9 100644 --- a/cardea/data_labeling/length_of_stay.py +++ b/cardea/data_labeling/length_of_stay.py @@ -2,34 +2,52 @@ from cardea.data_labeling.utils import denormalize +MIMIC_META = { + 'entity': 'admissions', + 'target_entity': 'hadm_id', + 'time_index': 'admittime', +} -def length_of_stay(es): +FHIR_META = { + 'entity': 'encounter', + 'target_entity': 'identifier', + 'time_index': 'start', +} + +def length_of_stay(es, k=None): """Defines the labeling task of length of stay. Predict how many days the patient will be in the hospital. For - a classification version of the problem, refer to ProlongedLengthOfStay. + a classification version of the problem, specify k. 
""" def los(ds, **kwargs): - return (ds["los"].dt.days).sum() + return (ds['los'].dt.days).sum() + + if es.id == 'mimic': + meta = MIMIC_META + entities = ['admissions'] + start = 'admittime' + end = 'dischtime' + + elif es.id == 'fhir': + meta = FHIR_META + entities = ['encounter', 'period'] + start = 'start' + end = 'end' - label = "los" + meta['type'] = 'regression' + meta['num_examples_per_instance'] = 1 - meta = { - "entity": "admissions", - "target_entity": "hadm_id", - "time_index": "admittime", - "type": "regression", - "num_examples_per_instance": 1, - "thresh": 7 - } + if k: + meta['type'] = 'classification' + meta['thresh'] = k - df = denormalize(es, entities=['admissions', ]) + df = denormalize(es, entities=entities) # generate label - start = 'admittime' - end = 'dischtime' df[end] = pd.to_datetime(df[end]) df[start] = pd.to_datetime(df[start]) - df[label] = df[end] - df[start] + df['los'] = df[end] - df[start] - return los, df, meta \ No newline at end of file + return los, df, meta + \ No newline at end of file diff --git a/cardea/data_labeling/mortality_prediction.py b/cardea/data_labeling/mortality_prediction.py index 41b62514..a9c3840f 100644 --- a/cardea/data_labeling/mortality_prediction.py +++ b/cardea/data_labeling/mortality_prediction.py @@ -1,145 +1,47 @@ import pandas as pd -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class MortalityPrediction (ProblemDefinition): - """Defines the problem of diagnosis Prediction. - - Finding whether a patient will be diagnosed with a specifed diagnosis. - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. 
+from cardea.data_labeling.utils import denormalize + +MIMIC_META = { + 'entity': 'admissions', + 'target_entity': 'hadm_id', + 'time_index': 'admittime', +} + +FHIR_META = { + 'entity': 'encounter', + 'target_entity': 'identifier', + 'time_index': 'start', +} + +def mortality(es): + """Defines the labeling task of length of stay. + + Predict how many days the patient will be in the hospital. For + a classification version of the problem, specify k. """ - __name__ = 'mortality' - - updated_es = None - target_label_column_name = 'diagnosis' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - prediction_type = 'classification' - conn = 'period' - causes_of_death = ['X60', 'X84', 'Y87.0', 'X85', 'Y09', - 'Y87.1', 'V02', 'V04', 'V09.0', 'V09.2', 'V12', 'V14'] - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. 
- """ - - es = self.generate_target_label(es) - - if DataLoader().check_column_existence( - es, - self.cutoff_entity, - self.cutoff_time_label): # check the existance of the cutoff label - - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - - cutoff_times['label'] = list(es[self.target_entity].df[self.target_label_column_name]) - - for (idx, row) in cutoff_times.iterrows(): - new_val = row.loc['label'] in self.causes_of_death - cutoff_times.at[idx, 'label'] = new_val - - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. 
- """ - generate_from = 'Period' - if (self.check_target_label( - es, - self.target_entity, - self.target_label_column_name)): - - if not DataLoader().check_for_missing_values(es, - self.target_entity, - self.target_label_column_name): - entity_set_df = es[self.target_entity].df - - merging_coding = pd.merge(es['Coding'].df, es['CodeableConcept'].df, - left_on='object_id', right_on='coding', how='left') - merging_condtion = pd.merge(merging_coding, es['Condition'].df, - left_on='object_id_y', right_on='code', how='left') - merging_diagnosis = pd.merge( - merging_condtion, - es['Encounter_Diagnosis'].df, - left_on='identifier', - right_on='condition', how='left') - - merging_encouter = pd.merge(merging_diagnosis, es[self.target_entity].df, - left_on='subject', right_on='identifier', how='left') - merging_encouter['target'] = merging_encouter['code_x'] + def mortal(ds, **kwargs): + return ds['hospital_expire_flag'].sum() > 0 - set(es[self.target_entity].df.identifier) + if es.id == 'mimic': + meta = MIMIC_META + entities = ['admissions'] - entity_set_df[self.target_label_column_name] = list(merging_encouter['target']) + elif es.id == 'fhir': + meta = FHIR_META + entities = ['encounter', 'encounter_diagnosis', 'condition', + 'codeableconcept', 'coding', 'period'] - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=entity_set_df, - index='identifier') + meta['type'] = 'classification' + meta['num_examples_per_instance'] = 1 - return es + df = denormalize(es, entities=entities) + + # generate label + if es.id == 'fhir': + causes_of_death = ['X60', 'X84', 'Y87.0', 'X85', 'Y09', 'Y87.1', + 'V02', 'V04', 'V09.0', 'V09.2', 'V12', 'V14'] - else: - raise ValueError( - 'Can not generate target label {} in table {} beacuse end label in \ - table {} contains missing value.'.format( - self.target_label_column_name, self. 
target_entity, generate_from)) + df['hospital_expire_flag'] = int(df['code'].isin(causes_of_death)) - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) + return mortal, df, meta diff --git a/cardea/data_labeling/predicting_diagnosis.py b/cardea/data_labeling/predicting_diagnosis.py index f4afeb26..fbb2e83f 100644 --- a/cardea/data_labeling/predicting_diagnosis.py +++ b/cardea/data_labeling/predicting_diagnosis.py @@ -1,142 +1,42 @@ import pandas as pd -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class DiagnosisPrediction (ProblemDefinition): - """Defines the problem of diagnosis Prediction. - - Finding whether a patient will be diagnosed with a specifed diagnosis. - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. +from cardea.data_labeling.utils import denormalize + +MIMIC_META = { + 'entity': 'admissions', + 'target_entity': 'hadm_id', + 'time_index': 'admittime', +} + +FHIR_META = { + 'entity': 'encounter', + 'target_entity': 'identifier', + 'time_index': 'start', +} + +def diagnosis_prediction(es, diag): + """Defines the labeling task of length of stay. + + Predict how many days the patient will be in the hospital. For + a classification version of the problem, specify k. 
""" - __name__ = 'diagnosis' - - updated_es = None - target_label_column_name = 'diagnosis' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - prediction_type = 'classification' - conn = 'period' - - def __init__(self, d): - self.diagnosis = d - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - es = self.generate_target_label(es) - - if DataLoader().check_column_existence( - es, - self.cutoff_entity, - self.cutoff_time_label): # check the existance of the cutoff label - - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - cutoff_times['label'] = list(es[self.target_entity].df[self.target_label_column_name]) - cutoff_times['label'] = cutoff_times['label'] == self.diagnosis - - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in 
the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. - """ - generate_from = 'Period' - if (self.check_target_label( - es, - self.target_entity, - self.target_label_column_name)): - - if not DataLoader().check_for_missing_values(es, - self.target_entity, - self.target_label_column_name): - entity_set_df = es[self.target_entity].df - - merging_coding = pd.merge(es['Coding'].df, es['CodeableConcept'].df, - left_on='object_id', right_on='coding', how='left') - merging_condtion = pd.merge(merging_coding, es['Condition'].df, - left_on='object_id_y', right_on='code', how='left') - merging_diagnosis = pd.merge( - merging_condtion, - es['Encounter_Diagnosis'].df, - left_on='identifier', - right_on='condition', how='left') - - merging_encouter = pd.merge(merging_diagnosis, es[self.target_entity].df, - left_on='subject', right_on='identifier', how='left') - merging_encouter['target'] = merging_encouter['code_x'] - - set(es[self.target_entity].df.identifier) - - entity_set_df[self.target_label_column_name] = list(merging_encouter['target']) - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=entity_set_df, - index='identifier') - - return es - - else: - raise ValueError( - 'Can not generate target label {} in table {} beacuse end label in \ - table {} contains missing value.'.format( - self.target_label_column_name, self. 
target_entity, generate_from)) - - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) + def diagnosis(ds, **kwargs): + return True if diag in ds[column].values else False + + if es.id == 'mimic': + meta = MIMIC_META + entities = ['admissions'] + column = 'diagnosis' + + elif es.id == 'fhir': + meta = FHIR_META + entities = ['encounter', 'encounter_diagnosis', 'condition', + 'codeableconcept', 'coding', 'period'] + column = 'code' + + meta['type'] = 'classification' + meta['num_examples_per_instance'] = 1 + + df = denormalize(es, entities=entities) + + return diagnosis, df, meta diff --git a/cardea/data_labeling/prolonged_length_of_stay.py b/cardea/data_labeling/prolonged_length_of_stay.py deleted file mode 100644 index 0e88a374..00000000 --- a/cardea/data_labeling/prolonged_length_of_stay.py +++ /dev/null @@ -1,169 +0,0 @@ -import featuretools as ft -import pandas as pd - -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class ProlongedLengthOfStay (ProblemDefinition): - """Defines the problem of length of stay - - Predicting whether a patient stayed in the hospital more or less than - a week (by default). - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. 
- """ - __name__ = 'plos' - - updated_es = None - target_label_column_name = 'length' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - conn = 'period' - prediction_type = 'classification' - - def __init__(self, t=7): - self.threshold = t - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - if (self.check_target_label(es, - self.target_entity, - self.target_label_column_name) and not - self.check_for_missing_values_in_target_label(es, - self.target_entity, - self.target_label_column_name)): - if DataLoader().check_column_existence( - es, - self.cutoff_entity, - self.cutoff_time_label): - - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - update_es = es[self.target_entity].df - - # threshold - update_es['length'] = (update_es['length'] >= self.threshold) - update_es['length'] = update_es['length'].astype(int) - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=update_es, - index='identifier') - - cutoff_times['label'] = list( - 
es[self.target_entity].df[self.target_label_column_name]) - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - else: - updated_es = self.generate_target_label(es) - return self.generate_cutoff_times(updated_es) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. - """ - generate_from = 'Period' - start = self.cutoff_time_label - end = 'end' - label_name = self.target_label_column_name - if (DataLoader().check_column_existence( - es, - generate_from, - start) and DataLoader().check_column_existence(es, - generate_from, - end)): - - if (not DataLoader().check_for_missing_values( - es, - generate_from, - start) and not DataLoader().check_for_missing_values(es, - generate_from, - end)): - - es[generate_from].df[start] = pd.to_datetime( - es[generate_from] - .df[start]) - es[generate_from].df[end] = pd.to_datetime( - es[generate_from].df[end]) - duration = (es[generate_from].df[end] - - es[generate_from].df[start]).dt.days - duration = duration.tolist() - es[self.target_entity].df[label_name] = duration - updated_target_entity = es[self.target_entity].df - duration_df = pd.DataFrame({'object_id': duration}) - - es = es.entity_from_dataframe( - entity_id='Duration', - dataframe=duration_df, - index='object_id') - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=updated_target_entity, - index='identifier') - new_relationship = ft.Relationship(es['Duration']['object_id'], - es[self.target_entity][label_name]) - es = es.add_relationship(new_relationship) - - return es - - else: - raise ValueError( - 'Can not generate target label {} in table {} beacuse start or end labels in \ - 
table {} contain missing value.'.format( - label_name, self.target_entity, generate_from)) - - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - label_name, - self.target_entity)) diff --git a/cardea/data_labeling/readmission.py b/cardea/data_labeling/readmission.py index 7ce99400..fec1950e 100644 --- a/cardea/data_labeling/readmission.py +++ b/cardea/data_labeling/readmission.py @@ -1,169 +1,50 @@ import pandas as pd -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class Readmission (ProblemDefinition): - """Defines the problem of readmission. - - Predicting whether a patient will revisit the hospital within certain period of time. - - Note: - The patient visit is considered a readmission if he/she visits - the hospital again within 30 days. - - The readmission diagnosis does not have to be the same as the initial visit diagnosis, - (The patient could be diagnosed of something that is a complication - of the initial diagnosis). - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. +from cardea.data_labeling.utils import denormalize + +MIMIC_META = { + 'entity': 'admissions', + 'target_entity': 'patient_id', + 'time_index': 'dischtime' +} + +FHIR_META = { + 'entity': 'encounter', + 'target_entity': 'subject', + 'time_index': 'end' +} + +def readmission(es, k=30): + """Defines the labeling task of length of stay. + + Predict how many days the patient will be in the hospital. For + a classification version of the problem, specify k. 
""" - __name__ = 'readmission' - - updated_es = None - target_label_column_name = 'readmitted' - target_entity = 'Encounter' - cutoff_time_label = 'end' - cutoff_entity = 'Period' - prediction_type = 'classification' - conn = 'period' - - def __init__(self, t=30): - self.readmission_threshold = t - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - self.generate_target_label(es) - - if DataLoader().check_column_existence( - es, - self.cutoff_entity, - self.cutoff_time_label): # check the existance of the cutoff label - - generated_cts = self.unify_cutoff_time_discharge_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - cutoff_times['label'] = list(es[self.target_entity].df[self.target_label_column_name]) - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. 
- - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. - """ - generate_from = 'Period' + def readmit(ds, **kwargs): + initial_discharge = min(ds[end].values) + second_admission = sorted(ds[start].values)[1] + return (second_admission - initial_discharge).dt.days + + if es.id == 'mimic': + meta = MIMIC_META + entities = ['admissions'] + start = 'admittime' + end = 'dischtime' + + elif es.id == 'fhir': + meta = FHIR_META + entities = ['encounter', 'period'] + start = 'start' end = 'end' - if (DataLoader().check_column_existence( - es, - generate_from, - end)) and (DataLoader().check_column_existence(es, - self.target_entity, - 'period')): - - if not DataLoader().check_for_missing_values( - es, - generate_from, end): - - entity_set_df = es[self.target_entity].df - generated_df = es[generate_from].df - merged_df = pd.merge(entity_set_df, generated_df, how='left', - left_on='period', right_on='object_id') - - generated_target_label = [] - encounter_identifier = [] - - for patient in set(merged_df['subject']): - patient_visits = merged_df[merged_df['subject'] == patient] - inital_date = patient_visits[end].iloc[0] - - encounter_identifier.append(patient_visits['identifier'].iloc[0]) - generated_target_label.append(False) # first visit - - if len(patient_visits) != 1: - for visit_date, encounter_id in zip(patient_visits[end][1:], - patient_visits['identifier'][1:]): - - visit_range = visit_date - inital_date - inital_date = visit_date - - if visit_range.days <= self.readmission_threshold: - generated_target_label.append(True) - encounter_identifier.append(encounter_id) - - else: - generated_target_label.append(False) - encounter_identifier.append(encounter_id) - - generated_labels = pd.DataFrame( - {self.target_label_column_name: generated_target_label, - 'identifier': encounter_identifier}) - updated_target_entity = pd.merge(entity_set_df, - generated_labels, - on='identifier') - - es 
= es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=updated_target_entity, - index='identifier') - return es + meta['type'] = 'classification' + meta['thresh'] = k + meta['num_examples_per_instance'] = 2 - else: - raise ValueError( - 'Can not generate target label {} in table {} beacuse end label in \ - table {} contains missing value.'.format( - self.target_label_column_name, self. target_entity, generate_from)) + df = denormalize(es, entities=entities) + + # generate label + df[end] = pd.to_datetime(df[end]) + df[start] = pd.to_datetime(df[start]) - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) + return readmit, df, meta diff --git a/cardea/data_labeling/show_noshow_appointment.py b/cardea/data_labeling/show_noshow_appointment.py index 52adcdc5..5518f0a6 100644 --- a/cardea/data_labeling/show_noshow_appointment.py +++ b/cardea/data_labeling/show_noshow_appointment.py @@ -7,18 +7,19 @@ def appointment_no_show(es): """Defines the labeling task of appointment no show. """ def missed(ds, **kwargs): - return (ds["status"]).sum() + return True if 'noshow' in ds["status"].values else False + + if es.id == 'mimic': + raise ValueError("Problem not supported for MIMIC data.") meta = { "entity": "Appointment", - "target_entity": "identifier", + "target_entity": "identifier", # automatically, should this be the index of the table? 
"time_index": "created", "type": "classification", - "num_examples_per_instance": 1, - "thresh": 1 + "num_examples_per_instance": 1 } df = denormalize(es, entities=['Appointment']) - df['status'] = pd.Categorical(df['status']).codes - + return missed, df, meta diff --git a/cardea/data_labeling/utils.py b/cardea/data_labeling/utils.py index e69de29b..dd5b0c35 100644 --- a/cardea/data_labeling/utils.py +++ b/cardea/data_labeling/utils.py @@ -0,0 +1,52 @@ +import inspect + +def _search_relationship(es, left, right): + for r in es.relationships: + if r.parent_entity.id in left: + if right == r.child_entity.id: + left_on = r.parent_variable.id + right_on = r.child_variable.id + + elif r.child_entity.id in left: + if right == r.parent_entity.id: + left_on = r.child_variable.id + right_on = r.parent_variable.id + + return left_on, right_on + + +def denormalize(es, entities): + """Merge a set of entities into a single dataframe. + + Convert a set of entities from the entityset into a single + dataframe by repetitively merging the selected entities. The + merge process is applied sequentially. + + Args: + entities (list): + list of strings denoting which entities to merge. + + Returns: + pandas.DataFrame: + A single dataframe containing all the information from the + selected entities. 
+ """ + k = len(entities) + + # initial entity to start from (should be the target entity) + first = entities[0] + previous = [first] + df = es[first].df + + # merge the dataframes to create a single input + for i in range(1, k): + right = entities[i] + + left_on, right_on = _search_relationship(es, previous, right) + df = pd.merge(df, es[right].df, + left_on=left_on, right_on=right_on, + how='left', suffixes=('', '_y')).filter(regex='^(?!.*_y)') + + previous.append(right) + + return df diff --git a/cardea/data_loader/__init__.py b/cardea/data_loader/__init__.py deleted file mode 100644 index b8a48075..00000000 --- a/cardea/data_loader/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Data loader module.""" - -from cardea.data_loader.data_loader import DataLoader, Diamond -from cardea.data_loader.entityset_loader import EntitySetLoader -from cardea.data_loader.load_mimic import load_mimic_data - -__all__ = ( - "DataLoader", - "EntitySetLoader", - "load_mimic_data" -) diff --git a/cardea/data_loader/entityset_loader.py b/cardea/data_loader/entityset_loader.py deleted file mode 100644 index f0e51a05..00000000 --- a/cardea/data_loader/entityset_loader.py +++ /dev/null @@ -1,127 +0,0 @@ -from glob import glob - -import featuretools as ft -import pandas as pd - -from cardea.data_loader import DataLoader, Diamond - - -class EntitySetLoader(DataLoader): - """A class that loads fhir class objects to featuretools entityset.""" - - __name__ = 'EntitySetLoader' - - def create_entity(self, fhir, identifiers, entity_set): - """Creates an entity from fhir dataframes and add it to entityset. - - Args: - fhir (dict): - A dictionary of fhir class dataframes. - entity_set (featuretools.EntitySet): - The global entityset that the entity will be added to. 
- """ - - for object_name, df in fhir.items(): - - id = identifiers[object_name] - df = df.apply(pd.to_numeric, errors='ignore') - - if object_name == 'Period': - entity_set.entity_from_dataframe(entity_id=str(object_name), - dataframe=df, - index=id, - time_index="start") - else: - entity_set.entity_from_dataframe(entity_id=str(object_name), - dataframe=df, - index=id) - - def create_relationships(self, relationships, entity_set): - """Binds entities in the entityset. - - Args: - relationships: A dataframe of the relationships in fhir. - entity_set: The global entityset that the entity will be added to. - """ - - for i, relation in relationships.iterrows(): - # parent table: 0, field: 1 - # child table: 2, field: 3 - - new_relationship = ft.Relationship( - entity_set[relation['parent_entity']][relation['parent_variable']], - entity_set[relation['child_entity']][relation['child_variable']]) - - entity_set.add_relationship(new_relationship) - - def load_data_entityset(self, folder_path): - """Returns an entityset loaded with .csv files in folder_path. - - Loads .csv files into pandas dataframes then loads them into featuretools' entityset. - - Args: - folder_path (dict): - A directory of all .csv files that should be loaded. - - Returns: - featuretools.EntitySet: - An entityset with loaded data. - """ - - fhir = self.read_csv_files(folder_path=folder_path) - return self.load_df_entityset(fhir=fhir) - - def read_csv_files(self, folder_path): - """Returns a dictionary with loaded .csv files in folder_path. - - Loads .csv files into pandas dataframes. - - Args: - folder_path: A dictionary of fhir resources in pandas dataframe format. - - Returns: - A dictionary of fhir resources in pandas dataframe format. 
- """ - - fhir = {} - - csv_files = glob(folder_path + "/*.csv") - for file_path in csv_files: - df = pd.read_csv(file_path) - file_name = file_path.split("/")[-1].split(".")[0] - - fhir[file_name] = df - - return fhir - - def load_df_entityset(self, fhir): - """Returns an entityset loaded with received dataframes in fhir. - - Loads the received dictionary of fhir resources into featuretools' entityset, where - the key is the resource name and the value is a pandas dataframe. - - Args: - fhir: A dictionary of fhir resources in pandas dataframe format. - - Returns: - An entityset with loaded data. - """ - - all_objects = [] - entity_set = ft.EntitySet(id="fhir") - - for name, df in fhir.items(): - - object = self.create_object(df, name) - all_objects.append(object) - - diamond = Diamond(all_objects) - diamond.resolve_diamond() - fhir = diamond.get_fhir_dataframes() - relationships = diamond.get_fhir_relationships() - identifiers = diamond.get_object_ids(all_objects) - - self.create_entity(fhir, identifiers, entity_set=entity_set) - self.create_relationships(relationships, entity_set=entity_set) - - return entity_set diff --git a/cardea/data_loader/load_mimic.py b/cardea/data_loader/load_mimic.py deleted file mode 100644 index a9caf5b7..00000000 --- a/cardea/data_loader/load_mimic.py +++ /dev/null @@ -1,139 +0,0 @@ -import os -import xml.etree.ElementTree as ET -from glob import glob - -import featuretools as ft -import pandas as pd - -path = os.path.dirname(os.path.abspath(__file__)) -root = ET.parse(path + '/schema.xml').getroot() - - -def get_table_properties(name): - """Returns a tuple containing the datatype of each column, the primary key of the table, - and the time indices. - - Args: - name: The name of the table in the formal XML file. - - Returns: - A tuple with three components, a list with the datatypes of each column, the primary key - of the table, and a list of columns that consider the time indices of the table. 
- """ - - types = {} - arr_time = [] - prim_key = 'row_id' - - x = root.find('.//table[@name="' + name + '"]') - for t in x.findall('column'): - - column = t.get('name') - a_type = t.get('type') - d_type = get_type(a_type) - prim_key = column if 'Primary key' in t.get('remarks') else prim_key - - if a_type == 'timestamp': - arr_time.append(column) - - types[column.lower()] = d_type - - return types, prim_key, arr_time - - -def get_table_relationships(name): - """Returns a list of the relationships in the table. - - Args: - name: The name of the table in the formal XML file. - - Returns: - A list of the relationships in the table, formatted as a dictionary. - """ - - relations = [] - x = root.find('.//table[@name="' + name + '"]') - - for c in x.findall('column/child'): - target_table = c.get('table') - target_handle = c.get('column') - - handle = x.find('.//column/child/...').get('name') - - relations.append({'parent': name, 'primary_key': handle, - 'child': target_table, 'foreign_key': target_handle}) - - return relations - - -def get_type(x): - return { - 'int4': float, - 'int2': float, - 'varchar': str, - 'float8': float, - 'text': str - }.get(x, str) - - -def load_mimic_data(path=None, subset=None): - """Returns an entityset loaded with the dataframes in the received path. - - Args: - path (str): - The folder path that contains the data. - subset (str): - List of tables to include. - - Returns: - featuretools.EntitySet: - An entityset with loaded data. 
- """ - es = ft.EntitySet(id="mimic") - - relationships = [] - global_tables = [] - files = glob(path + '/*.csv') - - for tag in root.findall('tables/table'): - table = tag.get('name') - file = os.path.join(path, table.upper() + '.csv') - - if subset and table not in subset: - continue - - if file in files: - # table name - global_tables.append(table) - - # get table relationships - relationships = relationships + get_table_relationships(table) - - # get table properties - prop, key, arr_time = get_table_properties(table) - - # load table into a dataframe - df = pd.read_csv(file, dtype=prop, date_parser=pd.to_datetime) - - df.columns = [column.lower() for column in df.columns] - - # check if arr_time should be None (no time index) - arr_time = arr_time[0] if len(arr_time) > 0 else None - - if arr_time and df[arr_time].isnull().all(): - arr_time = None - - # load dataframe into the entityset - es.entity_from_dataframe(entity_id=table, - dataframe=df, - index=key, - time_index=arr_time) - - for r in relationships: - if (r['parent'] in global_tables and r['child'] in global_tables): - new_relationship = ft.Relationship(es[r['parent']][r['primary_key']], - es[r['child']][r['foreign_key']]) - - es = es.add_relationship(new_relationship) - - return es diff --git a/cardea/featurization/__init__.py b/cardea/featurization/__init__.py deleted file mode 100644 index 85a20358..00000000 --- a/cardea/featurization/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# -*- coding: utf-8 -*- - -from cardea.featurization.featurization import Featurization - -__all__ = ( - "Featurization" -) diff --git a/cardea/featurizing/__init__.py b/cardea/featurizing/__init__.py new file mode 100644 index 00000000..3a501a37 --- /dev/null +++ b/cardea/featurizing/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +from cardea.featurizing.featurization import Featurization + +__all__ = ( + "Featurization" +) diff --git a/cardea/featurization/featurization.py b/cardea/featurizing/featurization.py 
similarity index 100% rename from cardea/featurization/featurization.py rename to cardea/featurizing/featurization.py diff --git a/cardea/problem_definition/definition.py b/cardea/problem_definition/definition.py deleted file mode 100644 index df82b0e3..00000000 --- a/cardea/problem_definition/definition.py +++ /dev/null @@ -1,247 +0,0 @@ -import pandas as pd - -from cardea.data_loader import DataLoader - - -class ProblemDefinition: - """A class that defines the prediction problem - by specifying cutoff times and generating the target label if it does not exist. - """ - - def check_target_label(self, entity_set, target_entity, target_label): - """Checks if target label exists in the entity set. - - Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. - - Returns: - True if the target label exists. - """ - return DataLoader().check_column_existence(entity_set, target_entity, target_label) - - def check_for_missing_values_in_target_label( - self, entity_set, target_entity, target_label_column_name): - """Checks if there is a missing value in the target label. - - Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. - - Returns: - False is the target label does not contain a missing value. - """ - return DataLoader().check_for_missing_values(entity_set, - target_entity, - target_label_column_name) - - def generate_target_label(self, entity_set, target_entity, target_label): - """Generates target labels if the entityset is missing labels. - - Args: - entity_set: fhir entityset. - target_label: The target label of the prediction problem. - target_entity: The entity name which contains the target label. - - Returns: - Target entity with the generated label. 
- """ - - def generate_cutoff_times(self, entity_set): - """Generates cutoff times for the predection problem. - - Args: - entity_set: fhir entityset. - - Returns: - entity_set, target_entity, series of target_labels and a dataframe of cutoff_times. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - def unify_cutoff_times_hours_admission_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared time. - - Args: - df: cutoff_entity dataframe. - """ - df = df.sort_values(by=[cutoff_time_label]) - df = df.reset_index() - - for i in df.index: - - if i == 0: - - if df.at[i, 'checked'] is not True: - df.at[i, 'ct'] = df.at[i, cutoff_time_label] - df.at[i, 'checked'] = True - - elif df.at[i, 'checked'] is not True: - - ct_val1 = df.at[i - 1, 'ct'] - end_val1 = df.at[i - 1, 'end'] - start_val2 = df.at[i, cutoff_time_label] - df.at[i, 'end'] - - if ct_val1 < start_val2 < end_val1: - df.at[i - 1, 'ct'] = start_val2 - df.at[i, 'ct'] = start_val2 - df.at[i, 'checked'] = True - - else: - df.at[i, 'ct'] = df.at[i, cutoff_time_label] - df.at[i, 'checked'] = True - - if i + 1 == len(df): - break - return df - - def unify_cutoff_times_days_admission_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared days. - - Args: - df: cutoff_entity dataframe. 
- """ - - frames = [] - for d in set(df['date']): - sub_day = df[df['date'] == d] - - sub_duration_greater = sub_day[sub_day['duration'] > 0] - sub_duration_less = sub_day[sub_day['duration'] <= 0] - frames.append(sub_duration_less) - sub_duration_greater = sub_duration_greater.sort_values(by=[cutoff_time_label]) - if len(sub_duration_greater) != 0: - final_date = sub_duration_greater.iloc[-1][cutoff_time_label] - - for i in sub_duration_greater.index: - sub_duration_greater.at[i, 'ct'] = final_date - sub_duration_greater.at[i, 'checked'] = True - - frames.append(sub_duration_greater) - - for i in sub_duration_less.index: - sub_duration_less.at[i, 'ct'] = pd.NaT - sub_duration_less.at[i, 'checked'] = False - - frames.append(sub_duration_less) - - result = pd.concat(frames) - result = result.drop_duplicates() - result[cutoff_time_label] = pd.to_datetime(result.start) - result = result.sort_values(by=[cutoff_time_label]) - result = result.reset_index() - return result - - def unify_cutoff_time_admission_time(self, es, cutoff_entity, cutoff_time_label): - """Process records in the entity that contains cutoff times - based on shared days and time. - - Args: - es: fhir entityset. - - Returns: - processed entity - """ - - df = es[cutoff_entity].df - df[cutoff_time_label] = pd.to_datetime(df[cutoff_time_label]) - df['end'] = pd.to_datetime(df['end']) - duration = (df['end'] - df[cutoff_time_label]).dt.days - duration = duration.tolist() - df['duration'] = duration - df['date'] = df[cutoff_time_label].dt.date - df['ct'] = '' - df['checked'] = False - result1 = self.unify_cutoff_times_days_admission_time(df, cutoff_time_label) - result = self.unify_cutoff_times_hours_admission_time(result1, cutoff_time_label) - if 'level_0' in result.columns: - result = result.drop(columns=['level_0']) - return result - - def unify_cutoff_times_days_discharge_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared days. - - Args: - df: cutoff_entity dataframe. 
- """ - - frames = [] - for d in set(df['end_date']): - sub_day = df[df['end_date'] == d] - - sub_duration_greater = sub_day[sub_day['duration'] > 0] - sub_duration_less = sub_day[sub_day['duration'] <= 0] - frames.append(sub_duration_less) - sub_duration_greater = sub_duration_greater.sort_values(by=[cutoff_time_label]) - if len(sub_duration_greater) != 0: - first_date = sub_duration_greater.iloc[0][cutoff_time_label] - - for i in sub_duration_greater.index: - sub_duration_greater.at[i, 'ct'] = first_date - sub_duration_greater.at[i, 'checked'] = True - frames.append(sub_duration_greater) - - for i in sub_duration_less.index: - sub_duration_less.at[i, 'ct'] = pd.NaT - sub_duration_less.at[i, 'checked'] = False - frames.append(sub_duration_less) - - result = pd.concat(frames) - result = result.drop_duplicates() - result[cutoff_time_label] = pd.to_datetime(result.end) - result = result.reset_index() - return result - - def unify_cutoff_times_hours_discharge_time(self, df, cutoff_time_label): - """Unify records cutoff times based on shared time. - - Args: - df: cutoff_entity dataframe. - """ - - frames = [] - for d in set(df['end_date']): - sub_day = df[df['end_date'] == d] - for h in set(sub_day['hour']): - sub_hour = sub_day[sub_day['hour'] == h] - sub_hour = sub_hour.sort_values(by=[cutoff_time_label]) - if len(sub_hour) != 0: - first_date = sub_hour.iloc[0][cutoff_time_label] - for i in sub_hour.index: - sub_hour.at[i, 'ct'] = first_date - sub_hour.at[i, 'checked'] = True - - frames.append(sub_hour) - - result = pd.concat(frames) - result = result.drop_duplicates() - return result - - def unify_cutoff_time_discharge_time(self, es, cutoff_entity, cutoff_time_label): - """Process records in the entity that contains cutoff times - based on shared days and time. - - Args: - es: fhir entityset. 
- - Returns: - processed entity - """ - - df = es[cutoff_entity].df - df['end_date'] = df[cutoff_time_label].dt.date - df['hour'] = df.end.apply(lambda x: x.hour) - duration = (df[cutoff_time_label] - df['start']).dt.days - duration = duration.tolist() - df['duration'] = duration - df['ct'] = '' - df['checked'] = False - result1 = self.unify_cutoff_times_days_discharge_time(df, cutoff_time_label) - result = self.unify_cutoff_times_hours_discharge_time(result1, cutoff_time_label) - if 'level_0' in result.columns: - result = result.drop(columns=['level_0']) - return result diff --git a/cardea/problem_definition/length_of_stay.py b/cardea/problem_definition/length_of_stay.py deleted file mode 100644 index 834c461c..00000000 --- a/cardea/problem_definition/length_of_stay.py +++ /dev/null @@ -1,150 +0,0 @@ -import featuretools as ft -import pandas as pd - -from cardea.data_loader import DataLoader as DL -from cardea.problem_definition import ProblemDefinition - - -class LengthOfStay (ProblemDefinition): - """Defines the problem of length of stay, predicting how many days - the patient will be in the hospital. - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. - """ - - __name__ = 'los' - - updated_es = None - target_label_column_name = 'length' - target_entity = 'Encounter' - cutoff_time_label = 'start' - cutoff_entity = 'Period' - conn = 'period' - prediction_type = 'regression' - - def generate_cutoff_times(self, es): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. 
- - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. - """ - - if (self.check_target_label(es, - self.target_entity, - self.target_label_column_name) and not - self.check_for_missing_values_in_target_label(es, - self.target_entity, - self.target_label_column_name)): - if DL().check_column_existence(es, - self.cutoff_entity, - self.cutoff_time_label): - generated_cts = self.unify_cutoff_time_admission_time( - es, self.cutoff_entity, self.cutoff_time_label) - - es = es.entity_from_dataframe(entity_id=self.cutoff_entity, - dataframe=generated_cts, - index='object_id') - - cutoff_times = es[self.cutoff_entity].df['ct'].to_frame() - - label = es[self.target_entity].df[self.conn].values - instance_id = list(es[self.target_entity].df.index) - cutoff_times = cutoff_times.reindex(index=label) - cutoff_times = cutoff_times[cutoff_times.index.isin(label)] - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - - cutoff_times['label'] = list( - es[self.target_entity].df[self.target_label_column_name]) - return(es, self.target_entity, cutoff_times) - else: - raise ValueError('Cutoff time label {} in table {} does not exist'.format( - self.cutoff_time_label, self.target_entity)) - - else: - updated_es = self.generate_target_label(es) - return self.generate_cutoff_times(updated_es) - - def generate_target_label(self, es): - """Generates target labels in the case of having missing label in the entityset. - - Args: - es: fhir entityset. - - Returns: - Updated entityset with the generated label. - - Raises: - ValueError: An error occurs if the target label cannot be generated. 
- """ - - generate_from = 'Period' - start = self.cutoff_time_label - end = 'end' - label_name = self.target_label_column_name - - if (DL().check_column_existence(es, - generate_from, - start) and DL().check_column_existence(es, - generate_from, - end)): - if (not DL().check_for_missing_values(es, - generate_from, - start) and not - (DL().check_for_missing_values(es, - generate_from, - end))): - - es[generate_from].df[start] = pd.to_datetime( - es[generate_from].df[start]) - es[generate_from].df[end] = pd.to_datetime( - es[generate_from].df[end]) - duration = (es[generate_from].df[end] - es[generate_from].df[start]).dt.days - duration = duration.tolist() - es[self.target_entity].df[label_name] = duration - updated_target_entity = es[self.target_entity].df - duration_df = pd.DataFrame({'object_id': duration}) - - es = es.entity_from_dataframe(entity_id='Duration', - dataframe=duration_df, - index='object_id') - - es = es.entity_from_dataframe(entity_id=self.target_entity, - dataframe=updated_target_entity, index='identifier') - new_relationship = ft.Relationship(es['Duration']['object_id'], - es[self.target_entity][label_name]) - es = es.add_relationship(new_relationship) - - return es - - else: - raise ValueError('Can not generate target label {} in table {} \ - beacuse start or end labels in table {} contain \ - missing value.'.format(label_name, - self.target_entity, - generate_from)) - - else: - raise ValueError('Can not generate target label {} in \ - table {}.'.format(label_name, - self.target_entity)) diff --git a/cardea/problem_definition/show_noshow_appointment.py b/cardea/problem_definition/show_noshow_appointment.py deleted file mode 100644 index 6093802f..00000000 --- a/cardea/problem_definition/show_noshow_appointment.py +++ /dev/null @@ -1,76 +0,0 @@ - -from cardea.data_loader import DataLoader -from cardea.problem_definition import ProblemDefinition - - -class MissedAppointment(ProblemDefinition): - """Defines the problem of missed appointment - - 
Predict whether the patient will show to the appointment or not. - - Args: - target_label_column_name (str): - The target label of the prediction problem. - target_entity (str): - Name of the entity containing the target label. - cutoff_time_label (str): - The cutoff time label of the prediction problem. - cutoff_entity (str): - Name of the entity containing the cutoff time label. - prediction_type (str): - The type of the machine learning prediction. - """ - __name__ = 'mapp' - - target_label_column_name = 'status' - target_entity = 'Appointment' - prediction_type = 'classification' - cutoff_time_label = 'created' - cutoff_entity = target_entity - - def generate_cutoff_times(self, entity_set): - """Generates cutoff times for the prediction problem. - - Args: - es (featuretools.EntitySet): - An EntitySet with the loaded data. - - Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. - - Raises: - ValueError: An error occurs if the cutoff variable does not exist. 
- """ - - if (self.check_target_label( - entity_set, - self.target_entity, - self.target_label_column_name)) and\ - not (self.check_for_missing_values_in_target_label(entity_set, - self.target_entity, - self.target_label_column_name)): - - if DataLoader().check_column_existence(entity_set, - self.target_entity, - self.cutoff_time_label): - - instance_id = list(entity_set[self.target_entity].df.index) - cutoff_times = entity_set[self.cutoff_entity].df[self.cutoff_time_label].to_frame() - cutoff_times['instance_id'] = instance_id - cutoff_times.columns = ['time', 'instance_id'] - cutoff_times['label'] = list( - entity_set[self.target_entity].df[self.target_label_column_name]) - entity_set[self.target_entity].delete_variables([self.target_label_column_name]) - return (entity_set, self.target_entity, cutoff_times) - else: - raise ValueError( - 'Cutoff time label {} in table {} does not exist'.format( - 'created', self.target_entity)) - else: - raise ValueError( - 'Can not generate target label {} in table {}.'.format( - self.target_label_column_name, - self.target_entity)) diff --git a/tests/cardea/data_loader/__init__.py b/tests/cardea/data_assembling/__init__.py similarity index 100% rename from tests/cardea/data_loader/__init__.py rename to tests/cardea/data_assembling/__init__.py diff --git a/tests/cardea/data_loader/test_data_loader.py b/tests/cardea/data_assembling/test_data_loader.py similarity index 99% rename from tests/cardea/data_loader/test_data_loader.py rename to tests/cardea/data_assembling/test_data_loader.py index f83c4d66..370b4597 100644 --- a/tests/cardea/data_loader/test_data_loader.py +++ b/tests/cardea/data_assembling/test_data_loader.py @@ -4,7 +4,7 @@ import pandas as pd import pytest -from cardea.data_loader import DataLoader, Diamond +from cardea.data_assembling import DataLoader, Diamond @pytest.fixture() diff --git a/tests/cardea/data_loader/test_entityset_loader.py b/tests/cardea/data_assembling/test_entityset_loader.py similarity index 
97% rename from tests/cardea/data_loader/test_entityset_loader.py rename to tests/cardea/data_assembling/test_entityset_loader.py index 2f15a475..84d9c833 100644 --- a/tests/cardea/data_loader/test_entityset_loader.py +++ b/tests/cardea/data_assembling/test_entityset_loader.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from cardea.data_loader import EntitySetLoader +from cardea.data_assembling import EntitySetLoader @pytest.fixture() diff --git a/tests/cardea/data_loader/test_load_mimic.py b/tests/cardea/data_assembling/test_load_mimic.py similarity index 90% rename from tests/cardea/data_loader/test_load_mimic.py rename to tests/cardea/data_assembling/test_load_mimic.py index 4ccd4e69..5752372e 100644 --- a/tests/cardea/data_loader/test_load_mimic.py +++ b/tests/cardea/data_assembling/test_load_mimic.py @@ -3,7 +3,7 @@ import pytest -from cardea.data_loader.load_mimic import get_table_properties, get_table_relationships +from cardea.data_assembling.load_mimic import get_table_properties, get_table_relationships @pytest.fixture() diff --git a/tests/cardea/featurization/__init__.py b/tests/cardea/data_labeling/__init__.py similarity index 100% rename from tests/cardea/featurization/__init__.py rename to tests/cardea/data_labeling/__init__.py diff --git a/tests/cardea/data_labeling/test_definition.py b/tests/cardea/data_labeling/test_definition.py new file mode 100644 index 00000000..bb6251eb --- /dev/null +++ b/tests/cardea/data_labeling/test_definition.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import featuretools as ft +import pandas as pd +import pytest + +from cardea.data_labeling import DataLabeler + +def test_data_labeler(): + pass \ No newline at end of file diff --git a/tests/cardea/data_labeling/test_length_of_stay.py b/tests/cardea/data_labeling/test_length_of_stay.py new file mode 100644 index 00000000..9ce7b92f --- /dev/null +++ b/tests/cardea/data_labeling/test_length_of_stay.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python 
+# -*- coding: utf-8 -*- + +import featuretools as ft +import pandas as pd +import pytest +from numpy import nan + +from cardea.data_labeling import length_of_stay + diff --git a/tests/cardea/data_labeling/test_mortality_prediction.py b/tests/cardea/data_labeling/test_mortality_prediction.py new file mode 100644 index 00000000..aaa684b2 --- /dev/null +++ b/tests/cardea/data_labeling/test_mortality_prediction.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import featuretools as ft +import pandas as pd +import pytest +from numpy import nan + +from cardea.data_labeling import mortality diff --git a/tests/cardea/data_labeling/test_predicting_diagnosis.py b/tests/cardea/data_labeling/test_predicting_diagnosis.py new file mode 100644 index 00000000..61d7456d --- /dev/null +++ b/tests/cardea/data_labeling/test_predicting_diagnosis.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import featuretools as ft +import pandas as pd +import pytest +from numpy import nan + +from cardea.data_labeling import diagnosis_prediction diff --git a/tests/cardea/data_labeling/test_readmission.py b/tests/cardea/data_labeling/test_readmission.py new file mode 100644 index 00000000..62286967 --- /dev/null +++ b/tests/cardea/data_labeling/test_readmission.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import featuretools as ft +import pandas as pd +import pytest +from numpy import nan + +from cardea.data_labeling import readmission diff --git a/tests/cardea/data_labeling/test_show_noshow_appointment.py b/tests/cardea/data_labeling/test_show_noshow_appointment.py new file mode 100644 index 00000000..97f99aa2 --- /dev/null +++ b/tests/cardea/data_labeling/test_show_noshow_appointment.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import featuretools as ft +import pandas as pd +import pytest +from numpy import nan + +from cardea.data_labeling import appointment_no_show diff --git 
a/tests/cardea/problem_definition/__init__.py b/tests/cardea/featurizing/__init__.py similarity index 100% rename from tests/cardea/problem_definition/__init__.py rename to tests/cardea/featurizing/__init__.py diff --git a/tests/cardea/featurization/test_featurization.py b/tests/cardea/featurizing/test_featurization.py similarity index 94% rename from tests/cardea/featurization/test_featurization.py rename to tests/cardea/featurizing/test_featurization.py index 72c615f7..3ebc8e0e 100644 --- a/tests/cardea/featurization/test_featurization.py +++ b/tests/cardea/featurizing/test_featurization.py @@ -5,8 +5,8 @@ import pandas as pd import pytest -from cardea.data_loader import EntitySetLoader -from cardea.featurization import Featurization +from cardea.data_assembling import EntitySetLoader +from cardea.featurizing import Featurization @pytest.fixture() diff --git a/tests/cardea/problem_definition/test_definition.py b/tests/cardea/problem_definition/test_definition.py deleted file mode 100644 index 05914672..00000000 --- a/tests/cardea/problem_definition/test_definition.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition import ProblemDefinition - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def problem_definition(): - return ProblemDefinition() - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]},) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['1/1/2000 20:00', '2/1/2000 5:00', '3/1/2000 22:00'], - "end": ['1/2/2000 21:10', '2/2/2000 18:00', '3/3/2000 20:00']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - 
"active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - return [encounter, period, patient] - - -@pytest.fixture() -def entityset(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -def test_check_target_label_true(entityset, problem_definition): - assert problem_definition.check_target_label(entityset, 'Patient', 'gender') is True - - -def test_check_target_label_false(entityset, problem_definition): - - assert problem_definition.check_target_label( - entityset, 'Encounter', 'class') is False - - -def test_check_target_label_values_true(entityset, problem_definition): - - assert problem_definition.check_for_missing_values_in_target_label( - entityset, 'Patient', 'active') is True - - -def test_check_target_label_values_false(entityset, problem_definition): - assert problem_definition.check_for_missing_values_in_target_label( - entityset, 'Patient', 'gender') is False - - -def test_check_target_label_values_error(entityset, problem_definition): - assert problem_definition.check_for_missing_values_in_target_label( - entityset, 'Encounter', 'class') is False diff --git a/tests/cardea/problem_definition/test_length_of_stay.py b/tests/cardea/problem_definition/test_length_of_stay.py deleted file mode 100644 index bd23ed90..00000000 --- a/tests/cardea/problem_definition/test_length_of_stay.py +++ /dev/null @@ -1,306 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader 
import EntitySetLoader -from cardea.problem_definition import LengthOfStay - - -@pytest.fixture() -def length_of_stay(): - return LengthOfStay() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['9/19/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "label": [2, 1, 7] - }) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - duration_df = pd.DataFrame({"object_id": [2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = 
es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_label(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018', '9/19/2018', '9/20/2018']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, patient] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', nan]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - 
encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def objects_missing_cutoff_label(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - duration_df = pd.DataFrame({"object_id": [2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = 
es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_label(objects_missing_generation_label, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_label) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_label) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_label, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def 
entityset_error_missing_cutoff_label(objects_missing_cutoff_label, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_cutoff_label) - - fhir_dict = es_loader.get_dataframes(objects_missing_cutoff_label) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_cutoff_label, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, length_of_stay, cutoff_times): - _, _, generated_df = length_of_stay.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_target_label(entityset_fail, length_of_stay, cutoff_times): - _, _, generated_df = length_of_stay.generate_cutoff_times( - entityset_fail) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_generation_label( - entityset_fail_missing_generation_label, length_of_stay): - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times( - entityset_fail_missing_generation_label) - - -def test_generate_cutoff_times_with_missing_cutoff_label( - entityset_error_missing_cutoff_label, length_of_stay): - entityset_error_missing_cutoff_label['Period'].delete_variables(['start']) - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times( - entityset_error_missing_cutoff_label) - - -def test_generate_label_with_missing_values( - entityset_fail_missing_generation_value, length_of_stay): - with pytest.raises(ValueError): - 
length_of_stay.generate_cutoff_times(entityset_fail_missing_generation_value) diff --git a/tests/cardea/problem_definition/test_mortality_prediction.py b/tests/cardea/problem_definition/test_mortality_prediction.py deleted file mode 100644 index bb33eaeb..00000000 --- a/tests/cardea/problem_definition/test_mortality_prediction.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition.mortality_prediction import MortalityPrediction - - -@pytest.fixture() -def mortality_prediction(): - return MortalityPrediction() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "label": [True, False, True]}) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7], - "diagnosis": [1, 2, 3]}) - - encounter_diagnosis_df = pd.DataFrame({"object_id": [1, 2, 3], - "condition": [10, 11, 12]}) - - condition_df = pd.DataFrame({"identifier": [10, 11, 12], - "code": [1, 2, 3], - "subject": [10, 11, 12]}) - - cc_df = pd.DataFrame({"object_id": [1, 2, 3], - "coding": [100, 111, 112], - "subject": [10, 11, 12]}) - - coding_df = pd.DataFrame({"object_id": [100, 111, 112], - "code": ["X60", "C12", "V02"]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "end": ['9/22/2018 00:10', '9/21/2018 00:10', '10/4/2018 00:10']}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 
'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - encounter_diagnosis = es_loader.create_object(encounter_diagnosis_df, 'Encounter_Diagnosis') - condition = es_loader.create_object(condition_df, 'Condition') - cc = es_loader.create_object(cc_df, 'CodeableConcept') - coding = es_loader.create_object(coding_df, 'Coding') - - objects = [encounter, period, patient, duration, encounter_diagnosis, condition, cc, coding] - return objects - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "end": ['9/22/2018 00:10', '9/21/2018 00:10', '10/4/2018 00:10']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12, 13, 14, 15], - "subject": [0, 1, 2, 0, 0, 0], - "length": [2, 1, 7, 0, 0, 0]}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - 
encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, patient, duration] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7], - "diagnosis": [1, 2, 3]}) - - encounter_diagnosis_df = pd.DataFrame({"object_id": [1, 2, 3], - "condition": [10, 11, 12]}) - - condition_df = pd.DataFrame({"identifier": [10, 11, 12], - "code": [1, 2, 3], - "subject": [10, 11, 12]}) - - cc_df = pd.DataFrame({"object_id": [1, 2, 3], - "coding": [100, 111, 112], - "subject": [10, 11, 12]}) - - coding_df = pd.DataFrame({"object_id": [100, 111, 112], - "code": ["Z10", "C12", "A10"]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/20/2018 21:10', '9/20/2018 18:00', '9/27/2018 20:00'], - "end": ['9/22/2018 20:00', '9/21/2018 5:00', '10/4/2018 22:00'] - }) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - encounter_diagnosis = es_loader.create_object(encounter_diagnosis_df, 'Encounter_Diagnosis') - condition = es_loader.create_object(condition_df, 'Condition') - cc = es_loader.create_object(cc_df, 'CodeableConcept') - coding = es_loader.create_object(coding_df, 'Coding') - - objects = [encounter, period, patient, duration, encounter_diagnosis, condition, cc, coding] - return objects - - -@pytest.fixture() -def relationships(): 
- return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = 
es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, mortality_prediction, cutoff_times): - _, _, generated_df = mortality_prediction.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - generated_df['label'] = generated_df['label'].astype(bool) # same data type - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_generation_label(entityset_success, mortality_prediction): - entityset_success['Period'].delete_variables(['start']) - with pytest.raises(ValueError): - mortality_prediction.generate_cutoff_times( - entityset_success) - - -def test_generate_label_with_missing_label(entityset_success, mortality_prediction): - entityset_success['Encounter'].delete_variables(['diagnosis']) - with pytest.raises(ValueError): - mortality_prediction.generate_cutoff_times(entityset_success) - - -def test_generate_label_with_missing_values( - entityset_fail_missing_generation_value, mortality_prediction): - es_fail = entityset_fail_missing_generation_value - temp = es_fail['Encounter'].df - temp['diagnosis'] = [nan, nan, nan] - es = es_fail.entity_from_dataframe(entity_id='Encounter', - dataframe=temp, - index='identifier') - with pytest.raises(ValueError): - mortality_prediction.generate_cutoff_times(es) diff --git a/tests/cardea/problem_definition/test_predicting_diagnosis.py b/tests/cardea/problem_definition/test_predicting_diagnosis.py deleted file mode 100644 index f4fac436..00000000 --- a/tests/cardea/problem_definition/test_predicting_diagnosis.py +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import 
EntitySetLoader -from cardea.problem_definition.predicting_diagnosis import DiagnosisPrediction - - -@pytest.fixture() -def diagnosis_prediction(): - return DiagnosisPrediction("Z10") - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "label": [True, False, False]}) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7], - "diagnosis": [1, 2, 3]}) - - encounter_diagnosis_df = pd.DataFrame({"object_id": [1, 2, 3], - "condition": [10, 11, 12]}) - - condition_df = pd.DataFrame({"identifier": [10, 11, 12], - "code": [1, 2, 3], - "subject": [10, 11, 12]}) - - cc_df = pd.DataFrame({"object_id": [1, 2, 3], - "coding": [100, 111, 112], - "subject": [10, 11, 12]}) - - coding_df = pd.DataFrame({"object_id": [100, 111, 112], - "code": ["Z10", "C12", "A10"]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/22/2018 00:00', '9/21/2018 00:00', '10/4/2018 00:00'], - "end": ['9/22/2018 00:10', '9/21/2018 00:10', '10/4/2018 00:10']}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - encounter_diagnosis = es_loader.create_object(encounter_diagnosis_df, 'Encounter_Diagnosis') - condition = es_loader.create_object(condition_df, 'Condition') - cc = 
es_loader.create_object(cc_df, 'CodeableConcept') - coding = es_loader.create_object(coding_df, 'Coding') - - objects = [encounter, period, patient, duration, encounter_diagnosis, condition, cc, coding] - return objects - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:00'], - "end": ['9/20/2018 00:00', '9/20/2018 00:10', '9/27/2018 00:10']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12, 13, 14, 15], - "subject": [0, 1, 2, 0, 0, 0], - "length": [2, 1, 7, 0, 0, 0]}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, patient, duration] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7], - "diagnosis": [1, 2, 3]}) - - encounter_diagnosis_df = pd.DataFrame({"object_id": [1, 2, 3], 
- "condition": [10, 11, 12]}) - - condition_df = pd.DataFrame({"identifier": [10, 11, 12], - "code": [1, 2, 3], - "subject": [10, 11, 12]}) - - cc_df = pd.DataFrame({"object_id": [1, 2, 3], - "coding": [100, 111, 112], - "subject": [10, 11, 12]}) - - coding_df = pd.DataFrame({"object_id": [100, 111, 112], - "code": ["Z10", "C12", "A10"]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/20/2018 21:10', '9/20/2018 18:00', '9/27/2018 20:00'], - "end": ['9/22/2018 20:00', '9/21/2018 5:00', '10/4/2018 22:00'] - }) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - encounter_diagnosis = es_loader.create_object(encounter_diagnosis_df, 'Encounter_Diagnosis') - condition = es_loader.create_object(condition_df, 'Condition') - cc = es_loader.create_object(cc_df, 'CodeableConcept') - coding = es_loader.create_object(coding_df, 'Coding') - - objects = [encounter, period, patient, duration, encounter_diagnosis, condition, cc, coding] - return objects - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - 
es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, diagnosis_prediction, cutoff_times): - _, _, generated_df = diagnosis_prediction.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def 
test_generate_cutoff_times_missing_generation_label(entityset_success, diagnosis_prediction): - entityset_success['Period'].delete_variables(['start']) - with pytest.raises(ValueError): - diagnosis_prediction.generate_cutoff_times( - entityset_success) - - -def test_generate_label_with_missing_label(entityset_success, diagnosis_prediction): - entityset_success['Encounter'].delete_variables(['diagnosis']) - with pytest.raises(ValueError): - diagnosis_prediction.generate_cutoff_times(entityset_success) - - -def test_generate_label_with_missing_values( - entityset_fail_missing_generation_value, diagnosis_prediction): - es_fail = entityset_fail_missing_generation_value - temp = es_fail['Encounter'].df - temp['diagnosis'] = [nan, nan, nan] - es = es_fail.entity_from_dataframe(entity_id='Encounter', - dataframe=temp, - index='identifier') - with pytest.raises(ValueError): - diagnosis_prediction.generate_cutoff_times(es) diff --git a/tests/cardea/problem_definition/test_prolonged_length_of_stay.py b/tests/cardea/problem_definition/test_prolonged_length_of_stay.py deleted file mode 100644 index c2f418a2..00000000 --- a/tests/cardea/problem_definition/test_prolonged_length_of_stay.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition import ProlongedLengthOfStay - - -@pytest.fixture() -def length_of_stay(): - return ProlongedLengthOfStay() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['9/19/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "label": [0, 0, 1] - }) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 
2], - "period": [120, 121, 122], - "length": [2, 1, 7]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - duration_df = pd.DataFrame({"object_id": [2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 11:23']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_label(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018', '9/19/2018', '9/20/2018']}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": 
['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, patient] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', nan]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def objects_missing_cutoff_label(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122], - "length": [2, 1, 7]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018 00:00', '9/19/2018 00:00', '9/20/2018 11:23'], - "end": ['9/20/2018 00:12', '9/20/2018 00:20', '9/27/2018 
11:23']}) - - duration_df = pd.DataFrame({"object_id": [2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = 
es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_label(objects_missing_generation_label, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_label) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_label) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_label, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_error_missing_cutoff_label(objects_missing_cutoff_label, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_cutoff_label) - - fhir_dict = es_loader.get_dataframes(objects_missing_cutoff_label) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_cutoff_label, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, length_of_stay, cutoff_times): - _, _, generated_df = length_of_stay.generate_cutoff_times( - entityset_success) 
- generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_target_label(entityset_fail, length_of_stay, cutoff_times): - _, _, generated_df = length_of_stay.generate_cutoff_times( - entityset_fail) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_missing_generation_label( - entityset_fail_missing_generation_label, length_of_stay): - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times( - entityset_fail_missing_generation_label) - - -def test_generate_cutoff_times_with_missing_cutoff_label( - entityset_error_missing_cutoff_label, length_of_stay): - entityset_error_missing_cutoff_label['Period'].delete_variables(['start']) - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times( - entityset_error_missing_cutoff_label) - - -def test_generate_label_with_missing_values( - entityset_fail_missing_generation_value, length_of_stay): - with pytest.raises(ValueError): - length_of_stay.generate_cutoff_times(entityset_fail_missing_generation_value) - - -def test_generate_cutoff_times_with_threshold(entityset_success): - los = ProlongedLengthOfStay(t=2) - values_should_be = [1, 0, 1] - es, _, generated_df = los.generate_cutoff_times( - entityset_success) - generated_labels = list(generated_df['label']) - assert values_should_be == generated_labels diff --git a/tests/cardea/problem_definition/test_readmission.py b/tests/cardea/problem_definition/test_readmission.py deleted file mode 100644 index 032bf794..00000000 --- a/tests/cardea/problem_definition/test_readmission.py +++ /dev/null @@ -1,235 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as 
pd -import pytest -from numpy import nan - -from cardea.data_loader import EntitySetLoader -from cardea.problem_definition import Readmission - - -@pytest.fixture() -def readmission(): - return Readmission() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - temp = pd.DataFrame({"instance_id": [10, 11, 12, 13, 14, 15], - "time": ['9/22/2018', '9/21/2018', '10/4/2018', - '9/28/2018', '10/30/2018', '11/18/2018'], - "label": [False, False, False, True, False, True] - }) - temp['time'] = pd.to_datetime(temp['time']) - return temp - - -@pytest.fixture() -def objects(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12, 13, 14, 15], - "subject": [0, 1, 2, 0, 0, 0], - "period": [120, 121, 122, 125, 123, 124], - "length": [2, 1, 7, 0, 0, 0]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122, 125, 123, 124], - "start": ['9/20/2018', '9/20/2018', '9/27/2018', - '9/28/2018', '10/30/2018', '11/18/2018'], - "end": ['9/22/2018', '9/21/2018', '10/4/2018', - '9/28/2018', '10/30/2018', '11/18/2018'] - }) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, period, patient, duration] - - -@pytest.fixture() -def objects_fail(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018', '9/19/2018', '9/20/2018'], - "end": ['9/20/2018', '9/20/2018', '9/27/2018']}) - - patient_df = 
pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - period = es_loader.create_object(period_df, 'Period') - patient = es_loader.create_object(patient_df, 'Patient') - - return [encounter, period, patient] - - -@pytest.fixture() -def objects_missing_generation_table(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12, 13, 14, 15], - "subject": [0, 1, 2, 0, 0, 0], - "length": [2, 1, 7, 0, 0, 0]}) - - duration_df = pd.DataFrame({"object_id": [0, 2, 1, 7]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - duration = es_loader.create_object(duration_df, 'Duration') - - return [encounter, patient, duration] - - -@pytest.fixture() -def objects_missing_generation_value(es_loader): - - encounter_df = pd.DataFrame({"identifier": [10, 11, 12], - "subject": [0, 1, 2], - "period": [120, 121, 122]}) - - period_df = pd.DataFrame({"object_id": [120, 121, 122], - "start": ['9/18/2018', '9/19/2018', '9/20/2018'], - "end": ['9/18/2018', '9/19/2018', nan]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - encounter = es_loader.create_object(encounter_df, 'Encounter') - patient = es_loader.create_object(patient_df, 'Patient') - period = es_loader.create_object(period_df, 'Period') - - return [encounter, patient, period] - - -@pytest.fixture() -def relationships(): - return[('Encounter', 'period', 'Period', 'object_id'), - ('Encounter', 'subject', 'Patient', 
'identifier'), - ('Encounter', 'length', 'Duration', 'object_id')] - - -@pytest.fixture() -def entityset_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_value(objects_missing_generation_value, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_value) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_value) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_value, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_fail_missing_generation_table(objects_missing_generation_table, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_missing_generation_table) - - fhir_dict = es_loader.get_dataframes(objects_missing_generation_table) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships( - objects_missing_generation_table, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - return es - - -@pytest.fixture() -def entityset_fail(objects_fail, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects_fail) - - fhir_dict = es_loader.get_dataframes(objects_fail) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects_fail, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, 
entity_set=es) - return es - - -def test_generate_cutoff_times_success(entityset_success, readmission, cutoff_times): - _, _, generated_df = readmission.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_labels_success(entityset_success, readmission, cutoff_times): - es, _, generated_df = readmission.generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - - labels = list(generated_df['label']) - - assert labels == [False, False, False, True, False, True] - - -def test_generate_labels_success_threshold(entityset_success, cutoff_times): - - es, _, generated_df = Readmission(6).generate_cutoff_times( - entityset_success) - generated_df.index = cutoff_times.index # both should have the same index - - labels = list(generated_df['label']) - assert labels == [False, False, False, True, False, False] - - -def test_generate_cutoff_times_missing_generation_label(entityset_success, readmission): - entityset_success['Period'].delete_variables(['end']) - with pytest.raises(ValueError): - readmission.generate_cutoff_times( - entityset_success) - - -def test_generate_label_with_missing_values(entityset_fail_missing_generation_value, readmission): - with pytest.raises(ValueError): - readmission.generate_cutoff_times(entityset_fail_missing_generation_value) diff --git a/tests/cardea/problem_definition/test_show_noshow_appointment.py b/tests/cardea/problem_definition/test_show_noshow_appointment.py deleted file mode 100644 index 1a39df32..00000000 --- a/tests/cardea/problem_definition/test_show_noshow_appointment.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_loader import 
EntitySetLoader -from cardea.problem_definition import MissedAppointment - - -@pytest.fixture() -def missed_appointment(): - return MissedAppointment() - - -@pytest.fixture() -def es_loader(): - return EntitySetLoader() - - -@pytest.fixture() -def cutoff_times(): - return pd.DataFrame( - {"instance_id": [10, 11, 12], - "time": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018], - "label": ['noshow', 'noshow', 'fulfilled'] - }) - - -@pytest.fixture() -def objects(es_loader): - - appointment_df = pd.DataFrame({"identifier": [10, 11, 12], - "status": ['noshow', 'noshow', 'fulfilled'], - "start": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018], - "participant": [120, 121, 122], - "created": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018]}) - - participant_df = pd.DataFrame({"object_id": [120, 121, 122], - "actor": [0, 1, 2]}) - - patient_df = pd.DataFrame({"identifier": [0, 1, 2], - "gender": ['female', 'female', 'male'], - "birthDate": ['10/21/2000', '7/2/2000', '1/10/2000'], - "active": ['True', 'True', 'nan']}) - - appointment = es_loader.create_object(appointment_df, 'Appointment') - participant = es_loader.create_object(participant_df, 'Appointment_Participant') - patient = es_loader.create_object(patient_df, 'Patient') - - return [appointment, participant, patient] - - -@pytest.fixture() -def es_success(objects, es_loader): - es = ft.EntitySet(id="test") - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def object_error_missing_label(es_loader): - - appointment_df = pd.DataFrame({"identifier": [10, 11, 12], - "start": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018], - "participant": [120, 121, 122], - "created": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018]}) - - appointment = 
es_loader.create_object(appointment_df, 'Appointment') - - return appointment - - -@pytest.fixture() -def objects_error_missing_cutoff_label(es_loader): - - appointment_df = pd.DataFrame({"identifier": [10, 11, 12], - "start": [7 / 22 / 2018, 8 / 21 / 2018, 9 / 16 / 2018], - "status": ['noshow', 'noshow', 'fulfilled'], - "participant": [120, 121, 122]}) - - appointment = es_loader.create_object(appointment_df, 'Appointment') - return appointment - - -@pytest.fixture() -def entityset_error_missing_label(objects, object_error_missing_label, es_loader): - es = ft.EntitySet(id="test") - - objects.extend([object_error_missing_label]) - - identifiers = es_loader.get_object_ids(objects) - - fhir_dict = es_loader.get_dataframes(objects) - es_loader.create_entity(fhir_dict, identifiers, entity_set=es) - - relationships = es_loader.get_relationships(objects, list(fhir_dict.keys())) - es_loader.create_relationships(relationships, entity_set=es) - - return es - - -@pytest.fixture() -def entityset_error_missing_cutoff_label(objects, objects_error_missing_cutoff_label, es_loader): - es = ft.EntitySet(id="test") - - for object in objects: - es_loader.create_entity(object, entity_set=es) - - for object in objects: - es_loader.create_relationships(object, entity_set=es) - - es_loader.create_entity(objects_error_missing_cutoff_label, entity_set=es) - es_loader.create_relationships(objects_error_missing_cutoff_label, entity_set=es) - return es - - -def test_generate_cutoff_times_success( - es_success, missed_appointment, cutoff_times): - _, _, generated_df = missed_appointment.generate_cutoff_times(es_success) - generated_df.index = cutoff_times.index # both should have the same index - generated_df = generated_df[cutoff_times.columns] # same columns order - assert generated_df.equals(cutoff_times) - - -def test_generate_cutoff_times_error( - entityset_error_missing_label, missed_appointment): - with pytest.raises(ValueError): - missed_appointment.generate_cutoff_times( - 
entityset_error_missing_label) - - -def test_generate_cutoff_times_error_value(es_success, missed_appointment): - es_success['Appointment'].df.loc[len(es_success['Appointment'].df)] = [ - nan, nan, nan, nan, nan] - with pytest.raises(ValueError): - missed_appointment.generate_cutoff_times( - es_success) - - -def test_generate_cutoff_times_missing_cutoff_time( - es_success, missed_appointment): - es_success['Appointment'].delete_variables(['created']) - with pytest.raises(ValueError): - missed_appointment.generate_cutoff_times( - es_success) From 09588ae2ff71db14a3d185a28e51dbaaeaddd49d Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 31 Mar 2021 15:00:43 -0400 Subject: [PATCH 05/13] using compose (wip) --- .gitignore | 5 + Makefile | 2 +- README.md | 8 +- cardea/core.py | 93 +++++++------------ cardea/data_assembling/entityset_loader.py | 4 +- cardea/data_labeling/definition.py | 68 +++++++------- cardea/data_labeling/length_of_stay.py | 10 +- cardea/data_labeling/mortality_prediction.py | 14 +-- cardea/data_labeling/predicting_diagnosis.py | 12 +-- cardea/data_labeling/readmission.py | 9 +- .../data_labeling/show_noshow_appointment.py | 8 +- cardea/data_labeling/utils.py | 7 +- .../{data_loader.rst => data_assembling.rst} | 4 +- .../{featurization.rst => featurizing.rst} | 4 +- docs/api_reference/index.rst | 5 +- docs/api_reference/problem_definition.rst | 60 ------------ .../{data_loading.rst => data_assembling.rst} | 0 docs/basic_concepts/index.rst | 2 +- .../basic_concepts/machine_learning_tasks.rst | 12 +-- docs/getting_started/quickstart.rst | 6 +- setup.py | 6 +- tests/cardea/data_labeling/test_definition.py | 11 --- .../data_labeling/test_length_of_stay.py | 10 -- .../test_mortality_prediction.py | 9 -- .../test_predicting_diagnosis.py | 9 -- .../cardea/data_labeling/test_readmission.py | 9 -- .../test_show_noshow_appointment.py | 9 -- tests/cardea/modeling/__init__.py | 0 tests/{cardea => data_assembling}/__init__.py | 0 
.../data_assembling/test_data_loader.py | 0 .../data_assembling/test_entityset_loader.py | 0 .../data_assembling/test_load_mimic.py | 2 +- .../__init__.py | 0 .../test_definition.py} | 2 +- tests/data_labeling/test_length_of_stay.py | 2 + .../test_mortality_prediction.py | 2 + .../test_predicting_diagnosis.py | 2 + tests/data_labeling/test_readmission.py | 2 + .../test_show_noshow_appointment.py | 2 + .../data_labeling => featurizing}/__init__.py | 0 .../featurizing/test_featurization.py | 14 +-- tests/{cardea => }/fhir/test_fhirbase.py | 0 .../featurizing => modeling}/__init__.py | 0 tests/{cardea => }/modeling/test_modeler.py | 0 tox.ini | 6 ++ 45 files changed, 155 insertions(+), 275 deletions(-) rename docs/api_reference/{data_loader.rst => data_assembling.rst} (78%) rename docs/api_reference/{featurization.rst => featurizing.rst} (72%) delete mode 100644 docs/api_reference/problem_definition.rst rename docs/basic_concepts/{data_loading.rst => data_assembling.rst} (100%) delete mode 100644 tests/cardea/data_labeling/test_definition.py delete mode 100644 tests/cardea/data_labeling/test_length_of_stay.py delete mode 100644 tests/cardea/data_labeling/test_mortality_prediction.py delete mode 100644 tests/cardea/data_labeling/test_predicting_diagnosis.py delete mode 100644 tests/cardea/data_labeling/test_readmission.py delete mode 100644 tests/cardea/data_labeling/test_show_noshow_appointment.py delete mode 100644 tests/cardea/modeling/__init__.py rename tests/{cardea => data_assembling}/__init__.py (100%) rename tests/{cardea => }/data_assembling/test_data_loader.py (100%) rename tests/{cardea => }/data_assembling/test_entityset_loader.py (100%) rename tests/{cardea => }/data_assembling/test_load_mimic.py (93%) rename tests/{cardea/data_assembling => data_labeling}/__init__.py (100%) rename tests/{test_something.py => data_labeling/test_definition.py} (69%) create mode 100644 tests/data_labeling/test_length_of_stay.py create mode 100644 
tests/data_labeling/test_mortality_prediction.py create mode 100644 tests/data_labeling/test_predicting_diagnosis.py create mode 100644 tests/data_labeling/test_readmission.py create mode 100644 tests/data_labeling/test_show_noshow_appointment.py rename tests/{cardea/data_labeling => featurizing}/__init__.py (100%) rename tests/{cardea => }/featurizing/test_featurization.py (79%) rename tests/{cardea => }/fhir/test_fhirbase.py (100%) rename tests/{cardea/featurizing => modeling}/__init__.py (100%) rename tests/{cardea => }/modeling/test_modeler.py (100%) diff --git a/.gitignore b/.gitignore index 1d1447de..a7228ab3 100644 --- a/.gitignore +++ b/.gitignore @@ -68,6 +68,7 @@ docs/cardea.rst docs/cardea.*.rst docs/modules.rst docs/api +docs/api_reference/api # PyBuilder target/ @@ -113,3 +114,7 @@ ENV/ # IntelliJ Idea .idea/ + +# output +data/ +*.csv diff --git a/Makefile b/Makefile index e7f11bd4..482857ee 100644 --- a/Makefile +++ b/Makefile @@ -113,7 +113,7 @@ test: ## run tests quickly with the default Python .PHONY: test-all test-all: ## run tests on every Python version with tox - tox + tox -r .PHONY: test-readme test-readme: ## run the readme snippets diff --git a/README.md b/README.md index 3acba885..683957b0 100644 --- a/README.md +++ b/README.md @@ -104,13 +104,15 @@ The output shown represents the entityset data structure where ``cardea.es`` is From there, you can select the prediction problem you aim to solve by specifying the name of the class, which in return gives us the ``label_times`` of the problem. ```python3 -label_times = cardea.select_problem('MissedAppointment') +from cardea.data_labeling import appointment_no_show + +label_times = cardea.select_problem(appointment_no_show) ``` ``label_times`` summarizes for each instance in the dataset (1) what is its corresponding label of the instance and (2) what is the time index that indicates the timespan allowed for calculating features that pertain to each instance in the dataset. 
```bash - cutoff_time instance_id label + cutoff_time instance_id missed 0 2015-11-10 07:13:56 5030230 noshow 1 2015-12-03 08:17:28 5122866 fulfilled 2 2015-12-07 10:40:59 5134197 fulfilled @@ -132,7 +134,7 @@ feature_matrix = cardea.generate_features(label_times[:1000]) Once we have the features, we can now split the data into training and testing ```python3 -y = list(feature_matrix.pop('label')) +y = list(feature_matrix.pop('missed')) X = feature_matrix.values diff --git a/cardea/core.py b/cardea/core.py index 493a16b8..812bd51c 100644 --- a/cardea/core.py +++ b/cardea/core.py @@ -6,9 +6,9 @@ import logging import os import pickle -from inspect import isclass -from io import BytesIO from functools import partial +from inspect import ismethod +from io import BytesIO from urllib.request import urlopen from zipfile import ZipFile @@ -17,11 +17,9 @@ import cardea from cardea.data_assembling import EntitySetLoader, load_mimic_data +from cardea.data_labeling import DataLabeler from cardea.featurizing import Featurization from cardea.modeling import Modeler -from cardea.data_labeling import ( - diagnosis_prediction, length_of_stay, appointment_no_show, mortality, - readmission) LOGGER = logging.getLogger(__name__) @@ -73,8 +71,6 @@ def load_entityset(self, data, fhir=True): data (str): A directory of all .csv files that should be loaded. To load demo dataset, pass the name of the dataset "kaggle" or "mimic". - fhir (bool): - An indicator of whether to use FHIR or MIMIC schema. 
Returns: featuretools.EntitySet: @@ -82,12 +78,12 @@ def load_entityset(self, data, fhir=True): """ demo = ['kaggle', 'mimic'] if not os.path.exists(data) and data in demo: - data = self.download_demo(data) + path = self.download_demo(data) - if fhir: - self.es = self.es_loader.load_data_entityset(data) - else: - self.es = load_mimic_data(data) + if data == "kaggle": + self.es = self.es_loader.load_data_entityset(path) + elif data == "mimic": + self.es = load_mimic_data(path) @staticmethod def download_demo(name, data_path=DATA_PATH): @@ -100,9 +96,9 @@ def download_demo(name, data_path=DATA_PATH): LOGGER.info('Downloading dataset %s from %s', name, url) for file in compressed.namelist(): filename = os.path.join(data_path, file) - csv_file = compressed.open(file) + csv_file = compressed.open(file, 'r') - data = pd.read_csv(csv_file, dtype=str) + data = pd.read_csv(csv_file, dtype=str, encoding="utf-8") data.to_csv(filename, index=False) return data_path @@ -116,24 +112,25 @@ def list_problems(self): """ problems = set([]) - for attribute_string in dir(cardea.problem_definition): - attribute = getattr(cardea.problem_definition, attribute_string) - if isclass(attribute): - if attribute.__name__ and attribute.__name__ != 'ProblemDefinition': - problems.add(attribute.__name__) + for attribute_string in dir(cardea.data_labeling): + attribute = getattr(cardea.data_labeling, attribute_string) + if ismethod(attribute): + problems.add(attribute.__name__) return problems - def select_problem(self, selection, parameter=None): + def select_problem(self, function, parameter=None, **kwargs): """Select a prediction problem and extract information. Update the select_problem attribute and generate the cutoff times, the target entity and update the entityset. Args: - selection (str): - Name of the chosen prediction problem. 
- parameters (dict): + function (method): + function that defines the prediction task, it should return a + tuple of labeling function, the dataframe, and the name of the + target entity. + parameter (dict): Variables to change the default parameters, if any. Returns: @@ -142,50 +139,26 @@ def select_problem(self, selection, parameter=None): * A string indicating the selected target entity. * A dataframe of cutoff times and their target labels. """ - LOGGER.info("Selecting %s prediction problem", selection) - - # problem selection - if selection == 'LengthOfStay': - self.chosen_problem = length_of_stay - - elif selection == 'MortalityPrediction': - self.chosen_problem = mortality - - elif selection == 'MissedAppointment': - self.chosen_problem = appointment_no_show + LOGGER.info("Selecting %s prediction problem", str(function)) - elif selection == 'ProlongedLengthOfStay': - plos = partial(length_of_stay, parameter) - self.chosen_problem = plos + if parameter: + function = partial(function, parameter) - elif selection == 'Readmission' and parameter: - self.chosen_problem = Readmission(parameter) - - elif selection == 'Readmission': - self.chosen_problem = Readmission() - - elif selection == 'DiagnosisPrediction' and parameter: - diag = partial(diagnosis_prediction, parameter) - self.chosen_problem = diag - - elif selection == 'DiagnosisPrediction': - raise ValueError('unspecified diagnosis code') - - else: - raise ValueError('{} is not a defined problem'.format(selection)) + data_labeler = DataLabeler(function) # target label calculation - self.es, self.target_entity, cutoff = self.chosen_problem.generate_cutoff_times(self.es) + label_times, self.target_entity, self.prediction_type = data_labeler.generate_label_times( + self.es) # set default pipeline - if self.chosen_problem.prediction_type == "classification": + if self.prediction_type == "classification": pipeline = "Random Forest" else: pipeline = "Random Forest Regressor" - self.modeler = Modeler(pipeline, 
self.chosen_problem.prediction_type) + self.modeler = Modeler(pipeline, self.prediction_type) - return cutoff + return label_times def list_feature_primitives(self): """Returns built-in primitive in Featuretools. @@ -196,13 +169,13 @@ def list_feature_primitives(self): """ return ft.list_primitives() - def generate_features(self, cutoff): + def generate_features(self, label_times, verbose=False): """Returns a the calculated feature matrix. Args: es (featuretools.EntitySet): An entityset that holds data. - cutoff (pandas.DataFrame): + label_times (pandas.DataFrame): A dataframe that indicates cutoff time for each instance. Returns: @@ -212,7 +185,7 @@ def generate_features(self, cutoff): """ fm_encoded, _ = self.featurization.generate_feature_matrix( - self.es, self.target_entity, cutoff) + self.es, self.target_entity, label_times, verbose=verbose) fm_encoded = fm_encoded.reset_index(drop=True) return fm_encoded @@ -224,7 +197,7 @@ def select_pipeline(self, pipeline): A pipeline instance or the name/path of a pipeline. """ LOGGER.info("Selecting %s pipeline", pipeline) - self.modeler = Modeler(pipeline, self.chosen_problem.prediction_type) + self.modeler = Modeler(pipeline, self.prediction_type) def train_test_split(self, X, y, test_size, shuffle): """Split the training dataset and the testing dataset. 
diff --git a/cardea/data_assembling/entityset_loader.py b/cardea/data_assembling/entityset_loader.py index 3c44b3e6..1db4cc99 100644 --- a/cardea/data_assembling/entityset_loader.py +++ b/cardea/data_assembling/entityset_loader.py @@ -26,12 +26,12 @@ def create_entity(self, fhir, identifiers, entity_set): df.columns = map(str.lower, df.columns) if object_name == 'Period': - entity_set.entity_from_dataframe(entity_id=str(object_name).lower(), + entity_set.entity_from_dataframe(entity_id=str(object_name), dataframe=df, index=id, time_index="start") else: - entity_set.entity_from_dataframe(entity_id=str(object_name).lower(), + entity_set.entity_from_dataframe(entity_id=str(object_name), dataframe=df, index=id) diff --git a/cardea/data_labeling/definition.py b/cardea/data_labeling/definition.py index a4d587a9..c14eab7d 100644 --- a/cardea/data_labeling/definition.py +++ b/cardea/data_labeling/definition.py @@ -1,12 +1,11 @@ import composeml as cp -import pandas as pd class DataLabeler: """Class that defines the prediction problem. - This class supports the generation of `label_times` which - is fundamental to the feature generation phase as well + This class supports the generation of `label_times` which + is fundamental to the feature generation phase as well as specifying the target labels. Args: @@ -15,38 +14,39 @@ class DataLabeler: tuple of labeling function, the dataframe, and the name of the target entity. """ + def __init__(self, function): self.function = function def generate_label_times(self, es, *args, **kwargs): - """Searches the data to calculate label times. - - Args: - df (pandas.DataFrame): - Data frame to search and extract labels. - *args: - Positional arguments for label maker. - **kwargs: - Keyword arguments for label maker. - Returns: - composeml.LabelTimes: - Calculated labels with cutoff times. 
- """ - labeling_function, df, meta = self.function(es) - kwargs = {**meta, **kwargs} - target_entity = kwargs.get('target_entity') - time_index = kwargs.get('time_index') - window_size = kwargs.get('window_size') - thresh = kwargs.get('thresh') - label_maker = cp.LabelMaker(labeling_function=labeling_function, - target_entity=kwargs.get('target_entity'), - time_index=kwargs.get('time_index'), - window_size=kwargs.get('window_size')) - - label_times = label_maker.search(df.sort_values(time_index), - *args, - **kwargs) - if thresh is not None: - label_times = label_times.threshold(thresh) - - return label_times, kwargs.get('entity') \ No newline at end of file + """Searches the data to calculate label times. + + Args: + df (pandas.DataFrame): + Data frame to search and extract labels. + *args: + Positional arguments for label maker. + **kwargs: + Keyword arguments for label maker. + Returns: + composeml.LabelTimes: + Calculated labels with cutoff times. + """ + labeling_function, df, meta = self.function(es) + kwargs = {**meta, **kwargs} + kwargs.get('target_entity') + time_index = kwargs.get('time_index') + kwargs.get('window_size') + thresh = kwargs.get('thresh') + label_maker = cp.LabelMaker(labeling_function=labeling_function, + target_entity=kwargs.get('target_entity'), + time_index=kwargs.get('time_index'), + window_size=kwargs.get('window_size')) + + label_times = label_maker.search(df.sort_values(time_index), + *args, + **kwargs) + if thresh is not None: + label_times = label_times.threshold(thresh) + + return label_times, kwargs.get('entity'), kwargs.get('type') diff --git a/cardea/data_labeling/length_of_stay.py b/cardea/data_labeling/length_of_stay.py index 7bbd4ed9..a37751ee 100644 --- a/cardea/data_labeling/length_of_stay.py +++ b/cardea/data_labeling/length_of_stay.py @@ -14,10 +14,11 @@ 'time_index': 'start', } + def length_of_stay(es, k=None): - """Defines the labeling task of length of stay. 
- - Predict how many days the patient will be in the hospital. For + """Defines the labeling task of length of stay. + + Predict how many days the patient will be in the hospital. For a classification version of the problem, specify k. """ def los(ds, **kwargs): @@ -43,11 +44,10 @@ def los(ds, **kwargs): meta['thresh'] = k df = denormalize(es, entities=entities) - + # generate label df[end] = pd.to_datetime(df[end]) df[start] = pd.to_datetime(df[start]) df['los'] = df[end] - df[start] return los, df, meta - \ No newline at end of file diff --git a/cardea/data_labeling/mortality_prediction.py b/cardea/data_labeling/mortality_prediction.py index a9c3840f..aba9309b 100644 --- a/cardea/data_labeling/mortality_prediction.py +++ b/cardea/data_labeling/mortality_prediction.py @@ -1,4 +1,3 @@ -import pandas as pd from cardea.data_labeling.utils import denormalize @@ -14,10 +13,11 @@ 'time_index': 'start', } + def mortality(es): - """Defines the labeling task of length of stay. - - Predict how many days the patient will be in the hospital. For + """Defines the labeling task of length of stay. + + Predict how many days the patient will be in the hospital. For a classification version of the problem, specify k. 
""" def mortal(ds, **kwargs): @@ -29,17 +29,17 @@ def mortal(ds, **kwargs): elif es.id == 'fhir': meta = FHIR_META - entities = ['encounter', 'encounter_diagnosis', 'condition', + entities = ['encounter', 'encounter_diagnosis', 'condition', 'codeableconcept', 'coding', 'period'] meta['type'] = 'classification' meta['num_examples_per_instance'] = 1 df = denormalize(es, entities=entities) - + # generate label if es.id == 'fhir': - causes_of_death = ['X60', 'X84', 'Y87.0', 'X85', 'Y09', 'Y87.1', + causes_of_death = ['X60', 'X84', 'Y87.0', 'X85', 'Y09', 'Y87.1', 'V02', 'V04', 'V09.0', 'V09.2', 'V12', 'V14'] df['hospital_expire_flag'] = int(df['code'].isin(causes_of_death)) diff --git a/cardea/data_labeling/predicting_diagnosis.py b/cardea/data_labeling/predicting_diagnosis.py index fbb2e83f..2e566237 100644 --- a/cardea/data_labeling/predicting_diagnosis.py +++ b/cardea/data_labeling/predicting_diagnosis.py @@ -1,4 +1,3 @@ -import pandas as pd from cardea.data_labeling.utils import denormalize @@ -14,10 +13,11 @@ 'time_index': 'start', } + def diagnosis_prediction(es, diag): - """Defines the labeling task of length of stay. - - Predict how many days the patient will be in the hospital. For + """Defines the labeling task of length of stay. + + Predict how many days the patient will be in the hospital. For a classification version of the problem, specify k. 
""" def diagnosis(ds, **kwargs): @@ -30,7 +30,7 @@ def diagnosis(ds, **kwargs): elif es.id == 'fhir': meta = FHIR_META - entities = ['encounter', 'encounter_diagnosis', 'condition', + entities = ['encounter', 'encounter_diagnosis', 'condition', 'codeableconcept', 'coding', 'period'] column = 'code' @@ -38,5 +38,5 @@ def diagnosis(ds, **kwargs): meta['num_examples_per_instance'] = 1 df = denormalize(es, entities=entities) - + return diagnosis, df, meta diff --git a/cardea/data_labeling/readmission.py b/cardea/data_labeling/readmission.py index fec1950e..62fc5f90 100644 --- a/cardea/data_labeling/readmission.py +++ b/cardea/data_labeling/readmission.py @@ -14,10 +14,11 @@ 'time_index': 'end' } + def readmission(es, k=30): - """Defines the labeling task of length of stay. - - Predict how many days the patient will be in the hospital. For + """Defines the labeling task of length of stay. + + Predict how many days the patient will be in the hospital. For a classification version of the problem, specify k. """ def readmit(ds, **kwargs): @@ -42,7 +43,7 @@ def readmit(ds, **kwargs): meta['num_examples_per_instance'] = 2 df = denormalize(es, entities=entities) - + # generate label df[end] = pd.to_datetime(df[end]) df[start] = pd.to_datetime(df[start]) diff --git a/cardea/data_labeling/show_noshow_appointment.py b/cardea/data_labeling/show_noshow_appointment.py index 5518f0a6..b1c23b97 100644 --- a/cardea/data_labeling/show_noshow_appointment.py +++ b/cardea/data_labeling/show_noshow_appointment.py @@ -1,10 +1,10 @@ -import pandas as pd from cardea.data_labeling.utils import denormalize + def appointment_no_show(es): - """Defines the labeling task of appointment no show. + """Defines the labeling task of appointment no show. 
""" def missed(ds, **kwargs): return True if 'noshow' in ds["status"].values else False @@ -14,12 +14,12 @@ def missed(ds, **kwargs): meta = { "entity": "Appointment", - "target_entity": "identifier", # automatically, should this be the index of the table? + "target_entity": "identifier", # automatically, should this be the index of the table? "time_index": "created", "type": "classification", "num_examples_per_instance": 1 } df = denormalize(es, entities=['Appointment']) - + return missed, df, meta diff --git a/cardea/data_labeling/utils.py b/cardea/data_labeling/utils.py index dd5b0c35..70664428 100644 --- a/cardea/data_labeling/utils.py +++ b/cardea/data_labeling/utils.py @@ -1,4 +1,5 @@ -import inspect +import pandas as pd + def _search_relationship(es, left, right): for r in es.relationships: @@ -32,7 +33,7 @@ def denormalize(es, entities): selected entities. """ k = len(entities) - + # initial entity to start from (should be the target entity) first = entities[0] previous = [first] @@ -48,5 +49,5 @@ def denormalize(es, entities): how='left', suffixes=('', '_y')).filter(regex='^(?!.*_y)') previous.append(right) - + return df diff --git a/docs/api_reference/data_loader.rst b/docs/api_reference/data_assembling.rst similarity index 78% rename from docs/api_reference/data_loader.rst rename to docs/api_reference/data_assembling.rst index def53bfb..c02a55eb 100644 --- a/docs/api_reference/data_loader.rst +++ b/docs/api_reference/data_assembling.rst @@ -1,9 +1,9 @@ -.. _cardea.data_loader: +.. _cardea.data_assembling: cardea.data_loader ================== -.. currentmodule:: cardea.data_loader +.. 
currentmodule:: cardea.data_assembling EntitySet Loader ~~~~~~~~~~~~~~~~ diff --git a/docs/api_reference/featurization.rst b/docs/api_reference/featurizing.rst similarity index 72% rename from docs/api_reference/featurization.rst rename to docs/api_reference/featurizing.rst index ff3d98eb..3365cc24 100644 --- a/docs/api_reference/featurization.rst +++ b/docs/api_reference/featurizing.rst @@ -1,9 +1,9 @@ -.. _cardea.featurization: +.. _cardea.featurizing: cardea.featurization ==================== -.. currentmodule:: cardea.featurization +.. currentmodule:: cardea.featurizing Featurization ~~~~~~~~~~~~~~~~~ diff --git a/docs/api_reference/index.rst b/docs/api_reference/index.rst index 11ff1b1d..ce3f56dc 100644 --- a/docs/api_reference/index.rst +++ b/docs/api_reference/index.rst @@ -9,8 +9,7 @@ In this section you will find a detailed specification of all the public functio :maxdepth: 2 cardea - data_loader - problem_definition - featurization + data_assembling + featurizing modeling fhir \ No newline at end of file diff --git a/docs/api_reference/problem_definition.rst b/docs/api_reference/problem_definition.rst deleted file mode 100644 index 7a109238..00000000 --- a/docs/api_reference/problem_definition.rst +++ /dev/null @@ -1,60 +0,0 @@ -.. _cardea.problem_definition: - -cardea.problem_definition -========================= - -.. currentmodule:: cardea.problem_definition - -Prolonged Length of Stay -~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - ProlongedLengthOfStay - ProlongedLengthOfStay.generate_cutoff_times - -Length of Stay -~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - LengthOfStay - LengthOfStay.generate_cutoff_times - -Readmission -~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - Readmission - Readmission.generate_cutoff_times - -MortalityPrediction -~~~~~~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - MortalityPrediction - MortalityPrediction.generate_cutoff_times - -DiagnosisPrediction -~~~~~~~~~~~~~~~~~~~ - -.. 
autosummary:: - :toctree: api/ - - DiagnosisPrediction - DiagnosisPrediction.generate_cutoff_times - -MissedAppointmentProblemDefinition -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autosummary:: - :toctree: api/ - - MissedAppointment - MissedAppointment.generate_cutoff_times diff --git a/docs/basic_concepts/data_loading.rst b/docs/basic_concepts/data_assembling.rst similarity index 100% rename from docs/basic_concepts/data_loading.rst rename to docs/basic_concepts/data_assembling.rst diff --git a/docs/basic_concepts/index.rst b/docs/basic_concepts/index.rst index cf0ebc8d..2834298c 100644 --- a/docs/basic_concepts/index.rst +++ b/docs/basic_concepts/index.rst @@ -10,7 +10,7 @@ library to help you get started. .. toctree:: :maxdepth: 3 - data_loading + data_assembling machine_learning_tasks auto_featurization auto_ml diff --git a/docs/basic_concepts/machine_learning_tasks.rst b/docs/basic_concepts/machine_learning_tasks.rst index f3757aa5..034310d0 100644 --- a/docs/basic_concepts/machine_learning_tasks.rst +++ b/docs/basic_concepts/machine_learning_tasks.rst @@ -34,9 +34,11 @@ values in the **Missed Appointment** task: .. ipython:: python from cardea import Cardea + from cardea.data_labeling import appointment_no_show + cardea = Cardea() cardea.load_entityset(data='kaggle') - cardea.select_problem('MissedAppointment') + cardea.select_problem(appointment_no_show) Current Prediction Problems --------------------------- @@ -57,10 +59,4 @@ these are described as follows: 6. Readmission: a. Predicts whether a patient will revisit the hospital within certain period of time (a month by default). -You can see the list of problems using the ``list_problems(...)`` method, example: - -.. 
ipython:: python - - from cardea import Cardea - cardea = Cardea() - cardea.list_problems() +You can see the list of problems using the ``list_problems(...)`` method \ No newline at end of file diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst index db24350c..34b8f6b7 100644 --- a/docs/getting_started/quickstart.rst +++ b/docs/getting_started/quickstart.rst @@ -32,8 +32,10 @@ You can see the list of problem definitions and select one with the following co From there, you can select the prediction problem you aim to solve by specifying the name of the class, which in return gives us the ``label_times`` of the problem. .. ipython:: python + + from cardea.data_labeling import appointment_no_show - label_times = cardea.select_problem('MissedAppointment') + label_times = cardea.select_problem(appointment_no_show) label_times.head() Then, you can perform the AutoML steps and take advantage of Cardea. @@ -51,7 +53,7 @@ Once we have the features, we can now split the data into training and testing .. 
ipython:: python :okwarning: - y = list(feature_matrix.pop('label')) + y = list(feature_matrix.pop('missed')) X = feature_matrix.values X_train, X_test, y_train, y_test = cardea.train_test_split( diff --git a/setup.py b/setup.py index 87d1b912..7e759ffa 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,9 @@ 'baytune>=0.4.0,<0.5', 'pyCLI==2.0.3', 'scikit-learn>=0.21,<0.22', - 'featuretools>=0.20.0,<0.25' + 'featuretools>=0.20.0,<0.25', + 'composeml', + 'jedi==0.17.2' ] setup_requires = [ @@ -68,9 +70,9 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', ], description="Cardea", entry_points={ diff --git a/tests/cardea/data_labeling/test_definition.py b/tests/cardea/data_labeling/test_definition.py deleted file mode 100644 index bb6251eb..00000000 --- a/tests/cardea/data_labeling/test_definition.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest - -from cardea.data_labeling import DataLabeler - -def test_data_labeler(): - pass \ No newline at end of file diff --git a/tests/cardea/data_labeling/test_length_of_stay.py b/tests/cardea/data_labeling/test_length_of_stay.py deleted file mode 100644 index 9ce7b92f..00000000 --- a/tests/cardea/data_labeling/test_length_of_stay.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_labeling import length_of_stay - diff --git a/tests/cardea/data_labeling/test_mortality_prediction.py b/tests/cardea/data_labeling/test_mortality_prediction.py deleted file mode 100644 index aaa684b2..00000000 --- a/tests/cardea/data_labeling/test_mortality_prediction.py +++ /dev/null @@ -1,9 
+0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_labeling import mortality diff --git a/tests/cardea/data_labeling/test_predicting_diagnosis.py b/tests/cardea/data_labeling/test_predicting_diagnosis.py deleted file mode 100644 index 61d7456d..00000000 --- a/tests/cardea/data_labeling/test_predicting_diagnosis.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_labeling import diagnosis_prediction diff --git a/tests/cardea/data_labeling/test_readmission.py b/tests/cardea/data_labeling/test_readmission.py deleted file mode 100644 index 62286967..00000000 --- a/tests/cardea/data_labeling/test_readmission.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_labeling import readmission diff --git a/tests/cardea/data_labeling/test_show_noshow_appointment.py b/tests/cardea/data_labeling/test_show_noshow_appointment.py deleted file mode 100644 index 97f99aa2..00000000 --- a/tests/cardea/data_labeling/test_show_noshow_appointment.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import featuretools as ft -import pandas as pd -import pytest -from numpy import nan - -from cardea.data_labeling import appointment_no_show diff --git a/tests/cardea/modeling/__init__.py b/tests/cardea/modeling/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/cardea/__init__.py b/tests/data_assembling/__init__.py similarity index 100% rename from tests/cardea/__init__.py rename to tests/data_assembling/__init__.py diff --git a/tests/cardea/data_assembling/test_data_loader.py b/tests/data_assembling/test_data_loader.py similarity index 100% rename from 
tests/cardea/data_assembling/test_data_loader.py rename to tests/data_assembling/test_data_loader.py diff --git a/tests/cardea/data_assembling/test_entityset_loader.py b/tests/data_assembling/test_entityset_loader.py similarity index 100% rename from tests/cardea/data_assembling/test_entityset_loader.py rename to tests/data_assembling/test_entityset_loader.py diff --git a/tests/cardea/data_assembling/test_load_mimic.py b/tests/data_assembling/test_load_mimic.py similarity index 93% rename from tests/cardea/data_assembling/test_load_mimic.py rename to tests/data_assembling/test_load_mimic.py index 5752372e..912b1571 100644 --- a/tests/cardea/data_assembling/test_load_mimic.py +++ b/tests/data_assembling/test_load_mimic.py @@ -25,7 +25,7 @@ def relationships(admission): def test_get_table_properties_types(properties): types = properties[0] - assert len(types) == 19 and types['language'] == str + assert len(types) == 19 and types['LANGUAGE'] == str def test_get_table_properties_primkey(properties): diff --git a/tests/cardea/data_assembling/__init__.py b/tests/data_labeling/__init__.py similarity index 100% rename from tests/cardea/data_assembling/__init__.py rename to tests/data_labeling/__init__.py diff --git a/tests/test_something.py b/tests/data_labeling/test_definition.py similarity index 69% rename from tests/test_something.py rename to tests/data_labeling/test_definition.py index f5d0f9f0..654c0c6e 100644 --- a/tests/test_something.py +++ b/tests/data_labeling/test_definition.py @@ -2,5 +2,5 @@ # -*- coding: utf-8 -*- -def test_something(): +def test_data_labeler(): pass diff --git a/tests/data_labeling/test_length_of_stay.py b/tests/data_labeling/test_length_of_stay.py new file mode 100644 index 00000000..faa18be5 --- /dev/null +++ b/tests/data_labeling/test_length_of_stay.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/tests/data_labeling/test_mortality_prediction.py b/tests/data_labeling/test_mortality_prediction.py new file 
mode 100644 index 00000000..faa18be5 --- /dev/null +++ b/tests/data_labeling/test_mortality_prediction.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/tests/data_labeling/test_predicting_diagnosis.py b/tests/data_labeling/test_predicting_diagnosis.py new file mode 100644 index 00000000..faa18be5 --- /dev/null +++ b/tests/data_labeling/test_predicting_diagnosis.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/tests/data_labeling/test_readmission.py b/tests/data_labeling/test_readmission.py new file mode 100644 index 00000000..faa18be5 --- /dev/null +++ b/tests/data_labeling/test_readmission.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/tests/data_labeling/test_show_noshow_appointment.py b/tests/data_labeling/test_show_noshow_appointment.py new file mode 100644 index 00000000..faa18be5 --- /dev/null +++ b/tests/data_labeling/test_show_noshow_appointment.py @@ -0,0 +1,2 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- diff --git a/tests/cardea/data_labeling/__init__.py b/tests/featurizing/__init__.py similarity index 100% rename from tests/cardea/data_labeling/__init__.py rename to tests/featurizing/__init__.py diff --git a/tests/cardea/featurizing/test_featurization.py b/tests/featurizing/test_featurization.py similarity index 79% rename from tests/cardea/featurizing/test_featurization.py rename to tests/featurizing/test_featurization.py index 3ebc8e0e..c3b5d16d 100644 --- a/tests/cardea/featurizing/test_featurization.py +++ b/tests/featurizing/test_featurization.py @@ -46,12 +46,12 @@ def entityset(objects, es_loader): @pytest.fixture() -def cutoff(): - cutoff = pd.DataFrame({"instance_id": [10, 11, 12], - "time": ['1/1/2000 20:00', '2/1/2000 5:00', '3/1/2000 22:00']}) +def label_times(): + label_times = pd.DataFrame({"instance_id": [10, 11, 12], + "time": ['1/1/2000 20:00', '2/1/2000 5:00', '3/1/2000 22:00']}) - cutoff['time'] = pd.to_datetime(cutoff['time']) - return 
cutoff + label_times['time'] = pd.to_datetime(label_times['time']) + return label_times @pytest.fixture() @@ -59,7 +59,7 @@ def featurization(): return Featurization() -def test_generate_feature_matrix(featurization, entityset, cutoff): +def test_generate_feature_matrix(featurization, entityset, label_times): fm_encoded, features_encoded = featurization.generate_feature_matrix( - entityset, "Encounter", cutoff) + entityset, "Encounter", label_times) assert len(fm_encoded) == 3 and len(fm_encoded.columns) == 32 diff --git a/tests/cardea/fhir/test_fhirbase.py b/tests/fhir/test_fhirbase.py similarity index 100% rename from tests/cardea/fhir/test_fhirbase.py rename to tests/fhir/test_fhirbase.py diff --git a/tests/cardea/featurizing/__init__.py b/tests/modeling/__init__.py similarity index 100% rename from tests/cardea/featurizing/__init__.py rename to tests/modeling/__init__.py diff --git a/tests/cardea/modeling/test_modeler.py b/tests/modeling/test_modeler.py similarity index 100% rename from tests/cardea/modeling/test_modeler.py rename to tests/modeling/test_modeler.py diff --git a/tox.ini b/tox.ini index 6895ed00..5e7a7fdc 100644 --- a/tox.ini +++ b/tox.ini @@ -27,3 +27,9 @@ commands = skipsdist = true commands = /usr/bin/env make docs + + +[testenv:readme] +skipsdist = true +commands = + /usr/bin/env make test-readme \ No newline at end of file From 04e53197584241f1b54fe9350c3775f7318f6a93 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 31 Mar 2021 23:25:30 -0400 Subject: [PATCH 06/13] update Cardea API --- README.md | 42 +- cardea/core.py | 310 +++++----- cardea/data.py | 70 +++ cardea/featurizing/featurization.py | 11 +- notebooks/appointment_noshow_tutorial.ipynb | 609 ++++++++------------ 5 files changed, 503 insertions(+), 539 deletions(-) create mode 100644 cardea/data.py diff --git a/README.md b/README.md index 683957b0..223df068 100644 --- a/README.md +++ b/README.md @@ -69,16 +69,15 @@ To use this dataset download the data from here then unzip it in 
the root direct ```bash curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip -d kaggle kaggle.zip ``` -To load the data, supply the ``data`` to the loader using the following command: +To load the data, supply the ``data_path`` to the loader. By default, ``cardea`` loads the kaggle dataset ```python3 -cardea.load_entityset(data='kaggle') +cardea.load_entityset(data_path='cardea/data/kaggle', fhir=True) ``` -> :bulb: To load local data, pass the folder path to ``data``. To verify that the data has been loaded, you can find the loaded entityset by viewing ``cardea.es`` which should output the following: -```bash +``` Entityset: kaggle Entities: Address [Rows: 81, Columns: 2] @@ -104,20 +103,18 @@ The output shown represents the entityset data structure where ``cardea.es`` is From there, you can select the prediction problem you aim to solve by specifying the name of the class, which in return gives us the ``label_times`` of the problem. ```python3 -from cardea.data_labeling import appointment_no_show - -label_times = cardea.select_problem(appointment_no_show) +label_times = cardea.create_label_times() ``` ``label_times`` summarizes for each instance in the dataset (1) what is its corresponding label of the instance and (2) what is the time index that indicates the timespan allowed for calculating features that pertain to each instance in the dataset. -```bash - cutoff_time instance_id missed -0 2015-11-10 07:13:56 5030230 noshow -1 2015-12-03 08:17:28 5122866 fulfilled -2 2015-12-07 10:40:59 5134197 fulfilled -3 2015-12-07 10:42:42 5134220 noshow -4 2015-12-07 10:43:01 5134223 noshow +``` + identifier time missed +0 5030230 2015-11-10 07:13:56 True +1 5122866 2015-12-03 08:17:28 False +2 5134197 2015-12-07 10:40:59 False +3 5134220 2015-12-07 10:42:42 True +4 5134223 2015-12-07 10:43:01 True ``` You can read more about ``label_times`` [here](https://mlbazaar.github.io/Cardea/basic_concepts/machine_learning_tasks.html). 
@@ -134,8 +131,7 @@ feature_matrix = cardea.generate_features(label_times[:1000]) Once we have the features, we can now split the data into training and testing ```python3 -y = list(feature_matrix.pop('missed')) - +y = feature_matrix.pop('missed').values X = feature_matrix.values X_train, X_test, y_train, y_test = cardea.train_test_split( @@ -145,7 +141,7 @@ X_train, X_test, y_train, y_test = cardea.train_test_split( Now that we have our feature matrix properly divided, we can use to train our machine learning pipeline, Modeling, optimizing hyperparameters and finding the most optimal model ```python3 -cardea.select_pipeline('Random Forest') +cardea.set_pipeline('Random Forest') cardea.fit(X_train, y_train) y_pred = cardea.predict(X_test) ``` @@ -155,11 +151,11 @@ Finally, you can evaluate the performance of the model cardea.evaluate(X, y, test_size=0.2, shuffle=True) ``` which returns the scoring metric depending on the type of problem -```bash -{'Accuracy': 0.75, - 'F1 Macro': 0.5098039215686274, - 'Precision': 0.5183001719479243, - 'Recall': 0.5123528436411872} +``` +Accuracy 0.75 +F1 Macro 0.5098 +Precision 0.5183 +Recall 0.5123 ``` # Citation @@ -167,7 +163,7 @@ If you use Cardea for your research, please consider citing the following paper: Sarah Alnegheimish; Najat Alrashed; Faisal Aleissa; Shahad Althobaiti; Dongyu Liu; Mansour Alsaleh; Kalyan Veeramachaneni. [Cardea: An Open Automated Machine Learning Framework for Electronic Health Records](https://arxiv.org/abs/2010.00509). [IEEE DSAA 2020](https://ieeexplore.ieee.org/document/9260104). 
-```bash +``` @inproceedings{alnegheimish2020cardea, title={Cardea: An Open Automated Machine Learning Framework for Electronic Health Records}, author={Alnegheimish, Sarah and Alrashed, Najat and Aleissa, Faisal and Althobaiti, Shahad and Liu, Dongyu and Alsaleh, Mansour and Veeramachaneni, Kalyan}, diff --git a/cardea/core.py b/cardea/core.py index 812bd51c..8110ce19 100644 --- a/cardea/core.py +++ b/cardea/core.py @@ -3,209 +3,217 @@ This module defines the Cardea Class, which is responsible for the tying all components together, as well as the interact with them. """ +import json import logging import os import pickle from functools import partial from inspect import ismethod -from io import BytesIO -from urllib.request import urlopen -from zipfile import ZipFile +from types import FunctionType +from typing import List, Union -import featuretools as ft +import numpy as np import pandas as pd +from mlblocks import MLPipeline import cardea +from cardea.data import DEMO_DATA, download from cardea.data_assembling import EntitySetLoader, load_mimic_data -from cardea.data_labeling import DataLabeler +from cardea.data_labeling import DataLabeler, appointment_no_show from cardea.featurizing import Featurization from cardea.modeling import Modeler LOGGER = logging.getLogger(__name__) -DATA_PATH = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - 'data' -) -BUCKET = 'dai-cardea' -S3_URL = 'https://{}.s3.amazonaws.com/{}' +DEFAULT_DATA = 'kaggle' +DEFAULT_FHIR = True +DEFAULT_LABELER = appointment_no_show +DEFAULT_PIPELINE = 'XGB' +DEFAULT_METRICS = ["Accuracy", "F1 Macro", "Precision", "Recall"] +class Cardea: + """Cardea Class. -class Cardea(): - """An interface class that ties the end-to-end system together. + The Cardea Class provides the main functionalities + to load data, create prediction tasks, and build + pipelines. Args: - es_loader (EntitySetLoader): - An entityset loader. - featurization (Featurization): - A featurization class. 
- modeler (Modeler): - A modeling class. - problems (list): - A list of currently available prediction problems. - chosen_problem (str): - The selected prediction problem or regression. - es (featuretools.EntitySet): - The loaded entityset. - target_entity (str): - The target entity for featurization. + data (str): + Path or name of the dataset to load into an entityset. + fhir (bool): + Indicator of whether to use FHIR or MIMIC schema. + labeler (method): + Function to defined the data labeler for the wanted prediction problem. + pipeline (str, dict or MLPipeline): + Pipeline to use. It can be passed as: + * An ``str`` with a path to a JSON file. + * An ``str`` with the name of a registered pipeline. + * An ``MLPipeline`` instance. + * A ``dict`` with an ``MLPipeline`` specification. + hyperparameters (dict): + Additional hyperparameters to set to the pipeline. """ - def __init__(self): - - self.es_loader = EntitySetLoader() - self.featurization = Featurization() - - self.es = None - self.chosen_problem = None - self.target_entity = None - self.modeler = None - - def load_entityset(self, data, fhir=True): + def load_entityset(self, data_path: str, fhir: bool = False) -> None: """Returns an entityset loaded with .csv files in data. Load the given dataset into an entityset. The dataset must be in FHIR or MIMIC structure format. Args: - data (str): - A directory of all .csv files that should be loaded. To load demo dataset, - pass the name of the dataset "kaggle" or "mimic". + data_path (str): + A directory of all .csv files that should be loaded. + fhir (bool): + An indicator whether FHIR or MIMIC schema is used. This parameter is + ignored when loading demo data. Returns: featuretools.EntitySet: An entityset with loaded data. 
""" - demo = ['kaggle', 'mimic'] - if not os.path.exists(data) and data in demo: - path = self.download_demo(data) + LOGGER.info("Loading data %s", data_path) + + if fhir: + self.es = self._es_loader.load_data_entityset(data_path) + + else: + self.es = load_mimic_data(data_path) + + def _set_modeler(self): + pipeline = self._pipeline + if isinstance(pipeline, str) and os.path.isfile(pipeline): + with open(pipeline) as json_file: + pipeline = json.load(json_file) + + mlpipeline = MLPipeline(pipeline) + if self._hyperparameters: + mlpipeline.set_hyperparameters(self._hyperparameters) + + self._modeler = Modeler(mlpipeline, self._type) - if data == "kaggle": - self.es = self.es_loader.load_data_entityset(path) - elif data == "mimic": - self.es = load_mimic_data(path) + def _set_entityset(self): + data = self._data - @staticmethod - def download_demo(name, data_path=DATA_PATH): - data_path = os.path.join(data_path, name) - os.makedirs(data_path, exist_ok=True) + if data in DEMO_DATA: + fhir = False if data == "mimic" else True + data_path = download(data) - url = S3_URL.format(BUCKET, '{}.zip'.format(name)) - compressed = ZipFile(BytesIO(urlopen(url).read())) + self.load_entityset(data_path, fhir) - LOGGER.info('Downloading dataset %s from %s', name, url) - for file in compressed.namelist(): - filename = os.path.join(data_path, file) - csv_file = compressed.open(file, 'r') + def __init__(self, data: str = None, labeler: FunctionType = None, + pipeline: Union[str, dict, MLPipeline] = None, hyperparameters: dict = None): + self._data = data or DEFAULT_DATA + self._pipeline = pipeline or DEFAULT_PIPELINE + self._hyperparameters = hyperparameters - data = pd.read_csv(csv_file, dtype=str, encoding="utf-8") - data.to_csv(filename, index=False) + self._es_loader = EntitySetLoader() + self._featurization = Featurization() + self._modeler = None - return data_path + self._target = None + self._set_entityset() - def list_problems(self): - """Returns a list of the currently 
available problems. + def list_labelers(self) -> set: + """Returns a list of the currently available data labelers. Returns: list: - A list of the available problems. + A list of the available data labelers. """ - problems = set([]) - for attribute_string in dir(cardea.data_labeling): - attribute = getattr(cardea.data_labeling, attribute_string) - if ismethod(attribute): - problems.add(attribute.__name__) + labelers = set() + for labeler_string in dir(cardea.data_labeling): + labeler = getattr(cardea.data_labeling, labeler_string) + if isinstance(labeler, FunctionType): + labelers.add(labeler.__name__) - return problems + return labelers - def select_problem(self, function, parameter=None, **kwargs): - """Select a prediction problem and extract information. + def create_label_times(self, labeler: FunctionType = None, + parameter: dict = None) -> pd.DataFrame: + """Create label times using the data labeler. - Update the select_problem attribute and generate the cutoff times, - the target entity and update the entityset. + Update the labeling function and generate the label times, Args: - function (method): - function that defines the prediction task, it should return a + labeler (function): + Function that defines the prediction task, it should return a tuple of labeling function, the dataframe, and the name of the target entity. parameter (dict): Variables to change the default parameters, if any. Returns: - featuretools.EntitySet, str, pandas.DataFrame: - * An updated EntitySet if a new column is generated. - * A string indicating the selected target entity. - * A dataframe of cutoff times and their target labels. + pandas.DataFrame: + A dataframe of cutoff times and their target labels. 
""" - LOGGER.info("Selecting %s prediction problem", str(function)) + labeler = labeler or DEFAULT_LABELER if parameter: - function = partial(function, parameter) + labeler = partial(labeler, **parameter) - data_labeler = DataLabeler(function) + LOGGER.info("Using labeler %s", str(labeler.__name__)) + data_labeler = DataLabeler(labeler) # target label calculation - label_times, self.target_entity, self.prediction_type = data_labeler.generate_label_times( + label_times, self._target, self._type = data_labeler.generate_label_times( self.es) # set default pipeline - if self.prediction_type == "classification": - pipeline = "Random Forest" - else: - pipeline = "Random Forest Regressor" - - self.modeler = Modeler(pipeline, self.prediction_type) + self._set_modeler() return label_times - def list_feature_primitives(self): - """Returns built-in primitive in Featuretools. - - Returns: - pandas.DataFrame: - A dataframe that lists and describes each built-in primitives. - """ - return ft.list_primitives() - - def generate_features(self, label_times, verbose=False): + def generate_features(self, label_times: pd.DataFrame, + verbose: bool = False) -> pd.DataFrame: """Returns a the calculated feature matrix. Args: - es (featuretools.EntitySet): - An entityset that holds data. label_times (pandas.DataFrame): A dataframe that indicates cutoff time for each instance. + verbose (bool): + Indicate verbosity of the featurization. Returns: - pandas.DataFrame, list: - * The generated feature matrix. - * List of feature definitions in the feature matrix. + pandas.DataFrame: + Generated feature matrix. 
""" - fm_encoded, _ = self.featurization.generate_feature_matrix( - self.es, self.target_entity, label_times, verbose=verbose) - fm_encoded = fm_encoded.reset_index(drop=True) - return fm_encoded + fm, _ = self._featurization.generate_feature_matrix( + self.es, self._target, label_times, verbose=verbose) - def select_pipeline(self, pipeline): + return fm + + def set_pipeline(self, pipeline: Union[str, dict, MLPipeline], + hyperparameters: dict = None) -> None: """Select a pipeline. Args: - pipeline (MLPipeline or str): - A pipeline instance or the name/path of a pipeline. + pipeline (str, dict or MLPipeline): + Pipeline to use. It can be passed as: + * An ``str`` with a path to a JSON file. + * An ``str`` with the name of a registered pipeline. + * An ``MLPipeline`` instance. + * A ``dict`` with an ``MLPipeline`` specification. + hyperparameters (dict): + Additional hyperparameters to set to the pipeline. """ - LOGGER.info("Selecting %s pipeline", pipeline) - self.modeler = Modeler(pipeline, self.prediction_type) + LOGGER.info("Setting %s pipeline", pipeline) + + self._pipeline = pipeline + self._hyperparameters = hyperparameters + self._set_modeler() - def train_test_split(self, X, y, test_size, shuffle): + def train_test_split(self, X: Union[np.ndarray, pd.DataFrame], + y: Union[np.ndarray, pd.Series, list], test_size: float = 0.2, + shuffle: bool = True) -> List[Union[pd.DataFrame, np.ndarray]]: """Split the training dataset and the testing dataset. Args: - X (pandas.DataFrame or ndarray): + X (pandas.DataFrame or numpy.ndarray): Inputs to the pipeline. - y (pandas.Series or ndarray): + y (pandas.Series, numpy.ndarray or list): Target values. test_size (float): The proportion of the dataset to include in the test dataset. @@ -216,15 +224,17 @@ def train_test_split(self, X, y, test_size, shuffle): list: List containing the train-test split of the inputs and targets. 
""" - return self.modeler.train_test_split(X, y, test_size, shuffle) + return self._modeler.train_test_split(X, y, test_size, shuffle) - def fit(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False): + def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series, list], + tune: bool = False, max_evals: int = 10, scoring: str = None, + verbose: bool = False) -> None: """Train the cardea pipeline. Args: - X (pandas.DataFrame or ndarray): + X (pandas.DataFrame or numpy.ndarray): Inputs to the pipeline. - y (pandas.Series ndarray): + y (pandas.Series, numpy.ndarray or list): Target values. tune (bool): Whether to optimize hyper-parameters of the pipelines. @@ -235,9 +245,9 @@ def fit(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False): verbose (bool): Whether to log information during processing. """ - self.modeler.fit(X, y, tune, max_evals, scoring, verbose) + self._modeler.fit(X, y, tune, max_evals, scoring, verbose) - def predict(self, X): + def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> Union[np.ndarray, list]: """Get predictions from the cardea pipeline. Args: @@ -245,18 +255,21 @@ def predict(self, X): Inputs to the pipeline. Returns: - ndarray: + numpy.ndarray or list: Predictions to the input data. """ - return self.modeler.predict(X) + return self._modeler.predict(X) - def fit_predict(self, X, y, tune=False, max_evals=10, scoring=None, verbose=False): + def fit_predict(self, X: Union[np.ndarray, pd.DataFrame], + y: Union[np.ndarray, pd.Series, list], tune: bool = False, + max_evals: int = 10, scoring: str = None, + verbose: bool = False) -> Union[np.ndarray, list]: """Train a cardea pipeline then make predictions. Args: - X (pandas.DataFrame or ndarray): + X (pandas.DataFrame or numpy.ndarray): Inputs to the pipeline. - y (pandas.Series or ndarray): + y (pandas.Series, numpy.ndarray or list): Target values. tune (bool): Whether to optimize hyper-parameters of the pipelines. 
@@ -271,21 +284,26 @@ def fit_predict(self, X, y, tune=False, max_evals=10, scoring=None, verbose=Fals ndarray: Predictions to the input data. """ - return self.modeler.fit_predict(X, y, tune, max_evals, scoring, verbose) + return self._modeler.fit_predict(X, y, tune, max_evals, scoring, verbose) - def evaluate(self, X, y, test_size=0.2, shuffle=True, tune=False, max_evals=10, scoring=None, - metrics=None, verbose=False): + def evaluate(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series, list], + test_size: float = 0.2, shuffle: bool = True, fit: bool = False, + tune: bool = False, max_evals: int = 10, scoring: str = None, + metrics: List[str] = DEFAULT_METRICS, verbose: bool = False) -> pd.Series: """Evaluate the cardea pipeline. Args: - X (pandas.DataFrame or ndarray): + X (pandas.DataFrame or numpy.ndarray): Inputs to the pipeline. - y (pandas.Series or ndarray): + y (pandas.Series, numpy.ndarray or list): Target values. test_size (float): The proportion of the dataset to include in the test dataset. shuffle (bool): Whether or not to shuffle the data before splitting. + fit (bool): + Whether to fit the pipeline before evaluating it. + Defaults to ``False``. tune (bool): Whether to optimize hyper-parameters of the pipelines. max_evals (int): @@ -297,11 +315,31 @@ def evaluate(self, X, y, test_size=0.2, shuffle=True, tune=False, max_evals=10, with the problem type. verbose (bool): Whether to log information during processing. + + Returns: + Series: + ``pandas.Series`` containing one element for each + metric applied, with the metric name as index. 
""" - return self.modeler.evaluate( - X, y, test_size, shuffle, tune, max_evals, scoring, metrics, verbose) + if fit: + X_train, X_test, y_train, y_test = self.train_test_split( + X, y, test_size=test_size, shuffle=shuffle) + + self._modeler.fit( + X_train, y_train, tune=tune, max_evals=max_evals, scoring=scoring, verbose=verbose) + + else: + X_test = X + y_test = y + + scores = { + metric: self._modeler.test(X_test, y_test, scoring=metric) + for metric in metrics + } + + return pd.Series(scores) - def save(self, path): + def save(self, path: str): """Save this object using pickle. Args: diff --git a/cardea/data.py b/cardea/data.py new file mode 100644 index 00000000..eb04677f --- /dev/null +++ b/cardea/data.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- + +""" +Data Management module. +This module contains functions that allow downloading demo data from Amazon S3 +The demo data is a modified version of the missed appointment data found here: +https://www.kaggle.com/joniarroba/noshowappointments +Another demo data is also available for mimic dataset: +https://physionet.org/files/mimiciii-demo/1.4/ +""" + +import logging +import os +from io import BytesIO +from urllib.request import urlopen +from zipfile import ZipFile + +import pandas as pd + +LOGGER = logging.getLogger(__name__) + +DATA_PATH = os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'data' +) +BUCKET = 'dai-cardea' +S3_URL = 'https://{}.s3.amazonaws.com/{}' + +DEMO_DATA = ("kaggle", "mimic") + + +def download(name, data_path=DATA_PATH): + """Download demo data with the given name from S3. + + If the data has never been loaded before, it will be downloaded + from the [dai-cardea bucket](https://dai-cardea.s3.amazonaws.com) or + the S3 bucket specified following the `s3://{bucket}/path/to/the.csv` format, + and then cached inside the `data` folder, within the `cardea` package + directory, and then returned. 
+ + Otherwise, if it has been downloaded and cached before, it will be directly + loaded from the `cardea/data` folder without contacting S3. + + Args: + name (str): Name of demo data + + Returns: + str: + path to the downloaded data + """ + if name not in DEMO_DATA: + raise KeyError("unknown demo data {}".format(name)) + + data_path = os.path.join(data_path, name) + + if not os.path.exists(data_path): + os.makedirs(data_path, exist_ok=True) + url = S3_URL.format(BUCKET, '{}.zip'.format(name)) + compressed = ZipFile(BytesIO(urlopen(url).read())) + + LOGGER.info('Downloading dataset %s from %s', name, url) + + for file in compressed.namelist(): + filename = os.path.join(data_path, file) + csv_file = compressed.open(file, 'r') + + data = pd.read_csv(csv_file, dtype=str, encoding="utf-8") + data.to_csv(filename, index=False) + + return data_path diff --git a/cardea/featurizing/featurization.py b/cardea/featurizing/featurization.py index 88eb854a..dc65e224 100644 --- a/cardea/featurizing/featurization.py +++ b/cardea/featurizing/featurization.py @@ -22,7 +22,7 @@ def n_jobs(): def max_depth(): return 2 - def generate_feature_matrix(self, es, target, cutoff, verbose=True): + def generate_feature_matrix(self, es, target, cutoff, verbose=True, encode=False): """Calculates a feature matrix and features given in Featurization object. Args: @@ -35,6 +35,8 @@ def generate_feature_matrix(self, es, target, cutoff, verbose=True): Specified times at which to calculate the features for each instance. verbose (bool): An indicator of verbose option. 
+ encode (bool): + Whether or not to encode categorical features Returns: pandas.DataFrame, list: @@ -51,8 +53,7 @@ def generate_feature_matrix(self, es, target, cutoff, verbose=True): max_depth=self.max_depth(), verbose=verbose) - # encode categorical values - fm_encoded, features_encoded = ft.encode_features(feature_matrix, - features_defs) + if encode: + return ft.encode_features(feature_matrix, features_defs) - return fm_encoded, features_encoded + return feature_matrix, features_defs diff --git a/notebooks/appointment_noshow_tutorial.ipynb b/notebooks/appointment_noshow_tutorial.ipynb index e5ddac68..8b20c92b 100644 --- a/notebooks/appointment_noshow_tutorial.ipynb +++ b/notebooks/appointment_noshow_tutorial.ipynb @@ -20,41 +20,20 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bF4XlPH8UPPO", - "outputId": "aca5874e-a449-4ec5-f111-78a2fc7a29c2" - }, - "outputs": [], - "source": [ - "# if you are running from Google Colab, uncomment the following commands to \n", - "# install cardea.\n", - "\n", - "# ! pip install cardea\n", - "# ! 
pip install 'urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1'" - ] - }, - { - "cell_type": "code", - "execution_count": 2, "metadata": { "id": "uqYRyFYLVfBK" }, "outputs": [], "source": [ - "# imports \n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.metrics import accuracy_score\n", + "%load_ext autoreload\n", + "%autoreload 2\n", "\n", "from cardea import Cardea" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "id": "rA1hkWm3VkpI" }, @@ -76,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "id": "bN9h70jQVm6V" }, @@ -97,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -112,14 +91,13 @@ "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", - "100 2983k 100 2983k 0 0 4618k 0 --:--:-- --:--:-- --:--:-- 4611k\n", + "100 2988k 100 2988k 0 0 6345k 0 --:--:-- --:--:-- --:--:-- 6332k\n", "Archive: kaggle.zip\n", - " creating: kaggle/\n", " inflating: kaggle/Patient.csv \n", " inflating: kaggle/Coding.csv \n", " inflating: kaggle/Appointment_Participant.csv \n", " inflating: kaggle/Address.csv \n", - " inflating: kaggle/CodeableConcept.csv \n", + " extracting: kaggle/CodeableConcept.csv \n", " inflating: kaggle/Reference.csv \n", " inflating: kaggle/Observation.csv \n", " inflating: kaggle/Identifier.csv \n", @@ -128,12 +106,12 @@ } ], "source": [ - "! curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip kaggle.zip" + "! 
curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip -d kaggle kaggle.zip" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -165,13 +143,13 @@ " Appointment.participant -> Appointment_Participant.object_id" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cd.load_data_entityset(folder_path='kaggle')\n", + "cd.load_entityset(data_path='kaggle', fhir=True)\n", "\n", "# to view the loaded entityset\n", "cd.es" @@ -198,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -210,21 +188,20 @@ { "data": { "text/plain": [ - "{'DiagnosisPrediction',\n", - " 'LengthOfStay',\n", - " 'MissedAppointmentProblemDefinition',\n", - " 'MortalityPrediction',\n", - " 'ProlongedLengthOfStay',\n", - " 'Readmission'}" + "{'appointment_no_show',\n", + " 'diagnosis_prediction',\n", + " 'length_of_stay',\n", + " 'mortality',\n", + " 'readmission'}" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cd.list_problems()" + "cd.list_labelers()" ] }, { @@ -249,6 +226,13 @@ "outputId": "0281f75b-9e89-4c90-84ae-774415e10d11" }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elapsed: 01:50 | Remaining: 00:00 | Progress: 100%|██████████| identifier: 110527/110527 \n" + ] + }, { "data": { "text/html": [ @@ -270,53 +254,53 @@ " \n", " \n", " \n", - " cutoff_time\n", - " instance_id\n", - " label\n", + " identifier\n", + " time\n", + " missed\n", " \n", " \n", " \n", " \n", - " 5030230\n", - " 2015-11-10 07:13:56\n", + " 0\n", " 5030230\n", - " noshow\n", + " 2015-11-10 07:13:56\n", + " True\n", " \n", " \n", - " 5122866\n", - " 2015-12-03 08:17:28\n", + " 1\n", " 5122866\n", - " fulfilled\n", + " 2015-12-03 08:17:28\n", + " 
False\n", " \n", " \n", - " 5134197\n", - " 2015-12-07 10:40:59\n", + " 2\n", " 5134197\n", - " fulfilled\n", + " 2015-12-07 10:40:59\n", + " False\n", " \n", " \n", - " 5134220\n", - " 2015-12-07 10:42:42\n", + " 3\n", " 5134220\n", - " noshow\n", + " 2015-12-07 10:42:42\n", + " True\n", " \n", " \n", - " 5134223\n", - " 2015-12-07 10:43:01\n", + " 4\n", " 5134223\n", - " noshow\n", + " 2015-12-07 10:43:01\n", + " True\n", " \n", " \n", "\n", "" ], "text/plain": [ - " cutoff_time instance_id label\n", - "5030230 2015-11-10 07:13:56 5030230 noshow\n", - "5122866 2015-12-03 08:17:28 5122866 fulfilled\n", - "5134197 2015-12-07 10:40:59 5134197 fulfilled\n", - "5134220 2015-12-07 10:42:42 5134220 noshow\n", - "5134223 2015-12-07 10:43:01 5134223 noshow" + " identifier time missed\n", + "0 5030230 2015-11-10 07:13:56 True\n", + "1 5122866 2015-12-03 08:17:28 False\n", + "2 5134197 2015-12-07 10:40:59 False\n", + "3 5134220 2015-12-07 10:42:42 True\n", + "4 5134223 2015-12-07 10:43:01 True" ] }, "execution_count": 8, @@ -326,7 +310,7 @@ ], "source": [ "# select problem\n", - "label_times = cd.select_problem('MissedAppointmentProblemDefinition')\n", + "label_times = cd.create_label_times()\n", "label_times.head(5)" ] }, @@ -347,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -361,8 +345,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Built 13 features\n", - "Elapsed: 00:52 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks\n" + "Built 14 features\n", + "Elapsed: 00:26 | Progress: 100%|██████████\n" ] }, { @@ -386,392 +370,267 @@ " \n", " \n", " \n", - " participant = 4159901403\n", - " participant = 3856467788\n", - " participant = 3807024061\n", - " participant = 3215247433\n", - " participant = 2872476717\n", - " participant = 2417505282\n", - " participant = 4162690658\n", - " participant = 3562155678\n", - " participant = 
3488625302\n", - " participant = 3418939447\n", - " ...\n", - " Appointment_Participant.actor = 27200000000000\n", - " Appointment_Participant.actor = 9740000000000\n", - " Appointment_Participant.actor = 8460000000000\n", - " Appointment_Participant.actor = 923000000000000\n", - " Appointment_Participant.actor = 795000000000000\n", - " Appointment_Participant.actor = 724000000000000\n", - " Appointment_Participant.actor = 659000000000000\n", - " Appointment_Participant.actor is unknown\n", + " status\n", + " participant\n", + " DAY(created)\n", + " DAY(start)\n", + " IS_WEEKEND(created)\n", + " IS_WEEKEND(start)\n", + " MONTH(created)\n", + " MONTH(start)\n", + " WEEKDAY(created)\n", + " WEEKDAY(start)\n", + " YEAR(created)\n", + " YEAR(start)\n", + " Appointment_Participant.actor\n", " Appointment_Participant.COUNT(Appointment)\n", - " label\n", + " missed\n", + " \n", + " \n", + " identifier\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", + " 5030230\n", + " noshow\n", + " 3353377007\n", + " 10\n", + " 4\n", + " False\n", + " False\n", + " 11\n", + " 5\n", " 1\n", + " 2\n", + " 2015\n", + " 2016\n", + " 832000000000000\n", " 56\n", - " noshow\n", + " True\n", " \n", " \n", - " 1\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 55\n", + " 5122866\n", " fulfilled\n", + " 486500845\n", + " 3\n", + " 2\n", + " False\n", + " False\n", + " 12\n", + " 5\n", + " 3\n", + " 0\n", + " 2015\n", + " 2016\n", + " 91600000000000\n", + " 55\n", + " False\n", " \n", " \n", - " 2\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 
0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 33\n", + " 5134197\n", " fulfilled\n", + " 64062658\n", + " 7\n", + " 3\n", + " False\n", + " False\n", + " 12\n", + " 6\n", + " 0\n", + " 4\n", + " 2015\n", + " 2016\n", + " 1220000000000\n", + " 33\n", + " False\n", " \n", " \n", - " 3\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 48\n", + " 5134220\n", " noshow\n", + " 207195819\n", + " 7\n", + " 3\n", + " False\n", + " False\n", + " 12\n", + " 6\n", + " 0\n", + " 4\n", + " 2015\n", + " 2016\n", + " 31900000000000\n", + " 48\n", + " True\n", " \n", " \n", - " 4\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " ...\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 0\n", - " 1\n", - " 38\n", + " 5134223\n", " noshow\n", + " 1089855247\n", + " 7\n", + " 3\n", + " False\n", + " False\n", + " 12\n", + " 6\n", + " 0\n", + " 4\n", + " 2015\n", + " 2016\n", + " 9580000000000\n", + " 38\n", + " True\n", " \n", " \n", "\n", - "

5 rows × 75 columns

\n", "" ], "text/plain": [ - " participant = 4159901403 participant = 3856467788 \\\n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "\n", - " participant = 3807024061 participant = 3215247433 \\\n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "\n", - " participant = 2872476717 participant = 2417505282 \\\n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "\n", - " participant = 4162690658 participant = 3562155678 \\\n", - "0 0 0 \n", - "1 0 0 \n", - "2 0 0 \n", - "3 0 0 \n", - "4 0 0 \n", - "\n", - " participant = 3488625302 participant = 3418939447 ... \\\n", - "0 0 0 ... \n", - "1 0 0 ... \n", - "2 0 0 ... \n", - "3 0 0 ... \n", - "4 0 0 ... \n", - "\n", - " Appointment_Participant.actor = 27200000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 9740000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 8460000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 923000000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 795000000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", - "\n", - " Appointment_Participant.actor = 724000000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", + " status participant DAY(created) DAY(start) \\\n", + "identifier \n", + "5030230 noshow 3353377007 10 4 \n", + "5122866 fulfilled 486500845 3 2 \n", + "5134197 fulfilled 64062658 7 3 \n", + "5134220 noshow 207195819 7 3 \n", + "5134223 noshow 1089855247 7 3 \n", "\n", - " Appointment_Participant.actor = 659000000000000 \\\n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 \n", + " IS_WEEKEND(created) IS_WEEKEND(start) MONTH(created) \\\n", + "identifier \n", + "5030230 False False 11 \n", 
+ "5122866 False False 12 \n", + "5134197 False False 12 \n", + "5134220 False False 12 \n", + "5134223 False False 12 \n", "\n", - " Appointment_Participant.actor is unknown \\\n", - "0 1 \n", - "1 1 \n", - "2 1 \n", - "3 1 \n", - "4 1 \n", + " MONTH(start) WEEKDAY(created) WEEKDAY(start) YEAR(created) \\\n", + "identifier \n", + "5030230 5 1 2 2015 \n", + "5122866 5 3 0 2015 \n", + "5134197 6 0 4 2015 \n", + "5134220 6 0 4 2015 \n", + "5134223 6 0 4 2015 \n", "\n", - " Appointment_Participant.COUNT(Appointment) label \n", - "0 56 noshow \n", - "1 55 fulfilled \n", - "2 33 fulfilled \n", - "3 48 noshow \n", - "4 38 noshow \n", + " YEAR(start) Appointment_Participant.actor \\\n", + "identifier \n", + "5030230 2016 832000000000000 \n", + "5122866 2016 91600000000000 \n", + "5134197 2016 1220000000000 \n", + "5134220 2016 31900000000000 \n", + "5134223 2016 9580000000000 \n", "\n", - "[5 rows x 75 columns]" + " Appointment_Participant.COUNT(Appointment) missed \n", + "identifier \n", + "5030230 56 True \n", + "5122866 55 False \n", + "5134197 33 False \n", + "5134220 48 True \n", + "5134223 38 True " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# feature engineering\n", - "feature_matrix = cd.generate_features(label_times[:1000]) # takes a while for the full dataset\n", + "feature_matrix = cd.generate_features(label_times[:1000], verbose=True) # takes a while for the full dataset\n", "feature_matrix.head(5)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have the features, we can now split the data into training and testing" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": { "id": "xGmr2hXEWw9T" }, "outputs": [], "source": [ - "# shuffle the dataframe\n", - "feature_matrix = feature_matrix.sample(frac=1)\n", - "\n", "# pop the target labels\n", - "y = list(feature_matrix.pop('label'))\n", - "X = feature_matrix.values" 
+ "y = feature_matrix.pop('missed').values\n", + "X = feature_matrix.values\n", + "\n", + "X_train, X_test, y_train, y_test = cd.train_test_split(\n", + " X, y, test_size=0.2, shuffle=True)" ] }, { "cell_type": "markdown", - "metadata": { - "id": "Kr5EsvQUW4Yo" - }, + "metadata": {}, "source": [ - "The pipeline variable represents the order in which machine learning algorithms are executed. It can be used to compare models together by specifying multiple algorithms in different lists. Such as:\n", - "\n", - "```\n", - "pipeline = [['sklearn.ensemble.RandomForestClassifier'], \n", - " ['sklearn.neighbors.KNeighborsClassifier']]\n", - "```\n", - "\n", - "Here we execute two different models, the first one being Random Forest and the second is K-Nearest Neighbor (KNN). In addition, you can use the pipeline to create your own encoding and modeling pipeline where the data crosses several algorithms to create the prediction model. For example, I can use a sequence of primitives that allow me to (1) normalize my data (2) use Random Forest. This can be modeled as:\n", - "```\n", - "pipeline = [['sklearn.preprocessing.StandardScaler', 'sklearn.ensemble.RandomForestClassifier'], \n", - " ['sklearn.neighbors.KNeighborsClassifier']]\n", - "```\n", - "Here there are two different models, the first one composes of two primitives (preprocessing through normalization then applying Random Forest) and the second is basic KNN. 
More on machine learning algorithms and MLPrimitives can be found here: https://HDI-Project.github.io/MLPrimitives" + "Now that we have our feature matrix properly divided, we can use to train our machine learning pipeline, Modeling, optimizing hyperparameters and finding the most optimal model" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JNvYIt-dXb7G", - "outputId": "b3081349-c2ac-4334-a02a-61f942f2bfd6" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], + "execution_count": 15, + "metadata": {}, + "outputs": [], "source": [ - "# modeling\n", - "pipeline = [['sklearn.ensemble.RandomForestClassifier'], \n", - " ['sklearn.naive_bayes.MultinomialNB'], \n", - " ['sklearn.neighbors.KNeighborsClassifier']]\n", - "\n", - "exe = cd.execute_model(feature_matrix=X,\n", - " target=y, \n", - " primitives=pipeline)" + "cd.set_pipeline('Random Forest')\n", + "cd.fit(X_train, y_train)\n", + "y_pred = cd.predict(X_test)" ] }, { "cell_type": "markdown", "metadata": { - "id": "Or-EtJ3_XguM" + "id": "Kr5EsvQUW4Yo" }, "source": [ - "## Visualize Results\n", - "\n", - "After executing the pipelines, the method returns a list composing of each pipeline with each fold representing three main results:\n", - "\n", - "* The list of primitives used.\n", - "* The actual label vector.\n", - "* The predicted label vector.\n", - "* The tuned hyperparameters (if given).\n", - "\n", - "In order to perceive the results and look at the performance of each pipeline we can view it's training process by merely plotting the confusion matrix." 
+ "Finally, you can evaluate the performance of the model" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 281 + "base_uri": "https://localhost:8080/" }, - "id": "WXNtz9VvXxcm", - "outputId": "0711adcf-ca8d-4269-8803-f16497367798" + "id": "JNvYIt-dXb7G", + "outputId": "b3081349-c2ac-4334-a02a-61f942f2bfd6" }, "outputs": [ { "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAARUAAAEICAYAAABxpmCnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAYMElEQVR4nO3debCcVZ3G8e8jgUQRyYaAQAIosokEyICyBkGWqIQSFBAhWFBxAa0RNxgsYAI4gVlQawSMGAkgi6JolCCGJW4YNCohspiEoEAIEJIQQCAQ+M0f51znTdN915Pu29fnU9V1u99z3rd/7+3bT79b36OIwMyslNe1ugAzG1gcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUekDS6yX9RNIqSd/vw3KOl/TzkrW1gqSbJU3s5bybSHpA0utL12WNSRqcf++brKvnGJChIukjkuZKek7S0vzHv2+BRR8NbAqMiIgP9XYhEfHdiDikQD1rkTROUki6sWb6rnn67G4u51xJV3fVLyIOj4jpvSz3DOCKiHihl/MPOJI+K+lxSc9ImiZpcCd93yDpEklP5Q+5X1baDpR0R57+1+p8EbEamEb6/a8TAy5UJJ0OfBX4CikARgGXABMKLH40sCAi1hRY1rqyDHi3pBGVaROBBaWeQEmv/3bym2Ui0GVwNYuk9Vr8/IeS3ugHkf7OtgX+vZNZpgLDgR3zz89W2v5OCo4vNJj3GmBiZ6HVJxExYG7AxsBzwIc66TOYFDqP5dtXgcG5bRzwKPA54ElgKfCx3PbvwEvAy/k5TgbOBa6uLHtrIIBB+fFJwGLgWeAh4PjK9F9X5tsb+D2wKv/cu9I2GzgP+E1ezs+BkQ3WraP+y4BT87T1gCXA2cDsSt+vAY8AzwB/APbL0w+rWc95lTouyHW8ALwtTzslt18K/KCy/AuB2wDVqXN/YFHNtI8B9+d1XAx8vKZ9AnB3rvdB4LA8fTjwnfxargR+VO93nKcF8LZ8/4pc80zSm/Bg4H3An/JzPAKcWzP/vsCdwNO5/STgX4AngPUq/T7Y8Xvrwd/uNcBXKo8PAh5v0HeHXOObuljmwcBfG7QtBA5YJ+/DZr3hm3HLb4g15Dd1gz6TgTnAm4FN8h/JebltXJ5/MrA+MB54HhiW289l7RCpfbx1/sMdBGyYX/jtc9vmwM61f/D5TbESOCHPd1x+PCK3z85vorcDr8+PpzRYt3GkUNkbuCtPGw/cApzC2qHyUWBEfs7PAY8DQ+qtV6WOh4Gd8zzrs3aovIG0NXQSsB/wFLBlgzpPBW6qmfY+4K2AgAPy73333LYnKXDfS9q63gLYIbfdBFwPDMs1HVD7O648R22orAL2ycsckn9/u+TH7ySFxZG5/2hS4B2Xn2cEMCa33QccXnmeG4HP5fsfIYVQo9uo3G8ecExlGSNzvSPq/P5OBOYDF+ff83zgqDr9OguVGcBn1sX7cKDt/owAnorOd0+OByZHxJMRsYy0BXJCpf3l3P5yRMwkfVpv38t6XgXeIen1EbE0Iu6t0+d9
wMKIuCoi1kTEtcADwAcqfb4TEQsiHX/4HjCmsyeNiDuB4ZK2J/0BXlmnz9URsTw/53+TtuC6Ws8rIuLePM/LNct7nvR7/B/Sbs2nI+LRBssZSnqDVue/KSIejOQXpC2y/XLzycC0iJgVEa9GxJKIeEDS5sDhwCciYmV+zX7RxTpU/TgifpOX+WJEzI6I+fnxPcC1pICDFA63RsS1+XmWR8TduW06KaSRNBw4lLTlQURcExFDO7k9nJfxRlLIdei4v1GdurcE3pH7vAU4DZguaccerPuzpNehuIEWKsuBkZIGddLnLcDfKo//lqf9Yxk1ofQ86QXvkYj4O3AM8AlgqaSbJO3QjXo6atqi8vjxXtRzFemP7UDSJ+daJH1e0v35YN7TpF3HkV0s85HOGiPiLtKui0jh18hKat4skg6XNEfSilzP+Eo9W5G21mptBayIiJVd1N3IWusjaa98gHOZpFWk166rGiCF6AckbQh8GPhVRCztYS3PAW+qPO64/2ydvi+QPvzOj4iXcpDeAfTk4P9GpC2l4gZaqPwWWA0c2Umfx0ibsh1G5Wm98XfSZn+HzaqNEXFLRLyXtOvzAPCtbtTTUdOSXtbU4SrgU8DMvBXxD5L2A75IegMMi4ihpE89dZTeYJmdfqVd0qmkLZ7H8vIbuYe0O9cx32DgB8B/AZvmemZW6nmEtGtU6xHSFtnQOm1rvTaSNqvTp3Z9riHtFmwVERuTjk11VQMRsYT0t/dB0tbaVZXnPT6fhWx0G5W73gvsWlnsrsATEbG8zlPe04116cqOpF2u4gZUqETEKtIByW9IOjKfdls/fwpelLtdC3w5XycxMvfv7VmIu4H9JY2StDFwZkeDpE0lTcifXqtJn0Sv1lnGTODt+TT4IEnHADsBP+1lTQBExEOkTfez6jRvRDp2tAwYJOls1v6UfALYuidneCS9HTiftBtwAvBFSWMadP8dMFRSx9bYBqQwWgaskXQ4a3/qfhv4mKSDJL1O0haSdshbAzcDl0gall/r/fM884CdJY2RNIR0nKgrG5G2fF6UtCdpl6fDd4GDJX04v04jatbvSlKQ7gL8sGNipMsH3tjJ7eHK/CdL2imH5JdJx33q+SXp+NaZuZZ9SFuktwDk39EQ0rEfSRoiaYOOmfPvfTjp2GJxAypUAPLxgdNJL8oy0ifMacCPcpfzgbmktJ8P/DFP681zzSIdJLyHdAalGgSvy3U8BqwgvcE/WWcZy4H3kw6WLif9Yb4/Ip7qTU01y/51RNTbCrsF+BnpwOrfgBdZe1eg48K+5ZL+2NXz5N3Nq4ELI2JeRCwE/g24qt5py4h4ifSG+Wh+/CzwGdIu00rSm3lGpf/vSGeHLiZtUf2C/9+6O4G0K/AA6Yzdv+Z5FpAOuN9KOtPx667Wg7RlN1nSs6QPm3/swuU3/3jS67SC9IFS3bK4Mdd0Y+2WYXdExM+Ai0i7MQ+TXpdzOtol3Svp+Nz3ZdLZsPGk38e3gBMj4oHcfX/SLtJM0lbvC6RjVB0+AkyPdM1KccpHgs2aSumKzl8Bu8UAuQBO0oOkU+G3trqWRnLIzwP2j4gn18lzOFTM+k7SUaRrc94eEfV2c/9p9Gn3R9JwSbMkLcw/hzXo94qku/NtRmX6NpLukrRI0vXV/T6zdqH09YdLSRcc/lMHCvRxSyUf/FwREVMknUE6k/ClOv2ei4jXnAaV9D3ghxFxnaTLSFchXtrrgsys5foaKn8BxkXE0nwh0uyIeM0FVPVCRZJIB1I3i4g1kt5Nuiz60F4XZGYt19lFYt2xaeUin8dJX+CrZ4ikuaTTmFMi4kekq1+frlxo9ihrX/C1FkmTgEkAG2644R7bb1/vOjLrr5Y+u05ONNg68vQTS3h+1Qp13fO1ugwVSbdSc1FXttb1DxERkhpt9oyOiCWStgVulzSftS9J7lJETCV9M5M99hgbv7lrbk9mtxa78PaFrS7BeuCbp32w1/N2GSoRcXCjNklPSNq8
svtT9xRVvuKQiFicD2rtRrqCcqikQXlrZUv6fhWpmbVYXy9+m0H6vxjknz+u7ZCvdByc748kfSv0vkgHc+4g/eOjhvObWXvpa6hMAd4raSHpa9ZTACSNlXR57rMjMFfSPFKITImI+3Lbl4DTJS0iHWP5dh/rMbMW69OB2nyJ+UF1ps8l/f+Ojq/h79Jg/sWk/5VhZgPEgPvuj5m1lkPFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFeVQMbOiHCpmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMilrnw55KGiPpt3nU+nskHVNpu0LSQ5UhUcf0pR4za72+bqmcAdwWEdsBt+XHtZ4HToyInYHDgK9KGlpp/0JEjMm3u/tYj5m1WF9DZQIwPd+fDhxZ2yEiFkTEwnz/MdLYQJv08XnNrJ/qa6h0d9hTACTtCWwAPFiZfEHeLbq4Y3wgM2tfzRr2lDyC4VXAxIh4NU8+kxRGG5CGNP0SMLnB/P8YS3mrUaO6KtvMWqQpw55KehNwE3BWRMypLLtjK2e1pO8An++kjrXGUu6qbjNrjWYMe7oBcCNwZUTcUNO2ef4p0vGYP/exHjNrsWYMe/phYH/gpDqnjr8raT4wHxgJnN/HesysxZox7OnVwNUN5n9PX57fzPofX1FrZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFVUkVCQdJukvkhZJes3Qp5IGS7o+t98laetK25l5+l8kHVqiHjNrnT6HiqT1gG8AhwM7AcdJ2qmm28nAyoh4G3AxcGGedyfgWKBjnOVL8vLMrE2V2FLZE1gUEYsj4iXgOtIYy1XVMZdvAA7KY/1MAK6LiNUR8RCwKC/PzNpUiVDZAnik8vjRPK1un4hYA6wCRnRzXiANeypprqS5y55aVqBsM1sX2uZAbURMjYixETF2k5GbtLocM2ugRKgsAbaqPN4yT6vbR9IgYGNgeTfnNbM2UiJUfg9sJ2mbPG7ysaQxlquqYy4fDdweEZGnH5vPDm0DbAf8rkBNZtYifRr2FNIxEkmnAbcA6wHTIuJeSZOBuRExA/g2cJWkRcAKUvCQ+30PuA9YA5waEa/0tSYza50+hwpARMwEZtZMO7ty/0XgQw3mvQC4oEQdZtZ6bXOg1szag0PFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFeVQMbOiHCpmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMimrWsKenS7pP0j2SbpM0utL2iqS78632H2abWZvp8/+orQx7+l7SYGC/lzQjIu6rdPsTMDYinpf0SeAi4Jjc9kJEjOlrHWbWPzRl2NOIuCMins8P55DG9zGzAahZw55WnQzcXHk8JA9nOkfSkY1m8rCnZu2hyBAd3SXpo8BY4IDK5NERsUTStsDtkuZHxIO180bEVGAqwB57jI2mFGxmPdasYU+RdDBwFnBERKzumB4RS/LPxcBsYLcCNZlZizRl2FNJuwHfJAXKk5XpwyQNzvdHAvuQRis0szbVrGFP/xN4I/B9SQAPR8QRwI7ANyW9Sgq4KTVnjcyszTRr2NODG8x3J7BLiRrMrH/wFbVmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWlEPFzIpq1rCnJ0laVhne9JRK20RJC/NtYol6zKx1mjXsKcD1EXFazbzDgXNIYwEF8Ic878q+1mVmrdGUYU87cSgwKyJW5CCZBRxWoCYza5ES/02/3rCne9Xpd5Sk/YEFwGcj4pEG89YdMlXSJGASwBZbbsWyZ1bX62b91JQvfa3VJVgPrH70iV7P
26wDtT8Bto6Id5K2Rqb3dAERMTUixkbE2OEjNileoJmV0ZRhTyNieWWo08uBPbo7r5m1l2YNe7p55eERwP35/i3AIXn402HAIXmambWpZg17+hlJRwBrgBXASXneFZLOIwUTwOSIWNHXmsysdZo17OmZwJkN5p0GTCtRh5m1nq+oNbOiHCpmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWVLOGPb24MuTpAklPV9peqbTNqJ3XzNpLU4Y9jYjPVvp/GtitsogXImJMX+sws/6hFcOeHgdcW+B5zawfKhEqPRm6dDSwDXB7ZfIQSXMlzZF0ZKMnkTQp95u7YvmyAmWb2brQ7AO1xwI3RMQrlWmjI2Is8BHgq5LeWm9GD3tq1h6aMuxpxbHU7PpExJL8czEwm7WPt5hZm2nKsKcAknYAhgG/rUwbJmlwvj8S2Ae4r3ZeM2sfzRr2FFLYXBcRUZl9R+Cbkl4lBdyU6lkjM2s/TRn2ND8+t858dwK7lKjBzPoHX1FrZkU5VMysKIeKmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUzKyoUsOeTpP0pKQ/N2iXpK/nYVHvkbR7pW2ipIX5NrFEPWbWOqW2VK4ADuuk/XBgu3ybBFwKIGk4cA6wF2mkw3MkDStUk5m1QJFQiYhfAis66TIBuDKSOcBQSZsDhwKzImJFRKwEZtF5OJlZP9esYyqNhkbtyZCpHvbUrA20zYFaD3tq1h6aFSqNhkbtyZCpZtYGmhUqM4AT81mgdwGrImIpaVTDQ/Lwp8OAQ/I0M2tTRUYolHQtMA4YKelR0hmd9QEi4jLS6IXjgUXA88DHctsKSeeRxmMGmBwRnR3wNbN+rtSwp8d10R7AqQ3apgHTStRhZq3XNgdqzaw9OFTMrCiHipkV5VAxs6IcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFeVQMbOiHCpmVpRDxcyKcqiYWVEOFTMryqFiZkU5VMysqGYNe3p8Hu50vqQ7Je1aaftrnn63pLkl6jGz1mnWsKcPAQdExC7AecDUmvYDI2JMRIwtVI+ZtUipf3z9S0lbd9J+Z+XhHNL4PmY2ALXimMrJwM2VxwH8XNIfJE1qQT1mVlCRLZXuknQgKVT2rUzeNyKWSHozMEvSA3nA99p5JwGTALbYcqvaZjPrJ5q2pSLpncDlwISIWN4xPSKW5J9PAjcCe9ab32Mpm7WHpoSKpFHAD4ETImJBZfqGkjbquE8a9rTuGSQzaw/NGvb0bGAEcIkkgDX5TM+mwI152iDgmoj4WYmazKw1mjXs6SnAKXWmLwZ2fe0cZtaufEWtmRXlUDGzohwqZlaUQ8XMinKomFlRDhUzK8qhYmZFOVTMrCiHipkV5VAxs6IcKmZWlEPFzIpyqJhZUQ4VMyvKoWJmRTlUzKwoh4qZFeVQMbOiHCpmVlSzxlIeJ2lVHi/5bklnV9oOk/QXSYsknVGiHjNrnWaNpQzwqzxe8piImAwgaT3gG8DhwE7AcZJ2KlSTmbVAkVDJIwqu6MWsewKLImJxRLwEXAdMKFGTmbVGM4c9fbekecBjwOcj4l5gC+CRSp9Hgb3qzVwd9hRYPWrEkIE46NhI4KlWF7GODNR1G6jrtX1vZ2xWqPwRGB0Rz0kaD/wI2K4nC4iIqcBUAElz82BkA8pAXS8YuOs2kNert/M25exPRDwTEc/l+zOB9SWNBJYA1dHWt8zTzKxNNWss5c2UxzaVtGd+3uXA74HtJG0jaQPgWGBGM2oys3WjWWMpHw18UtIa4AXg2IgIYI2k04BbgPWAaflYS1emlqi7Hxqo6wUDd928XjWU3ttmZmX4ilozK8qhYmZFtUWoSBouaZak
hfnnsAb9Xql8FaDfHvDt6qsJkgZLuj633yVp6xaU2WPdWK+TJC2rvEantKLOnurG11Ak6et5ve+RtHuza+yNvny9plMR0e9vwEXAGfn+GcCFDfo91+pau7Eu6wEPAtsCGwDzgJ1q+nwKuCzfPxa4vtV1F1qvk4D/bXWtvVi3/YHdgT83aB8P3AwIeBdwV6trLrRe44Cf9nS5bbGlQrp0f3q+Px04snWl9Fl3vppQXd8bgIM6Tsn3YwP2KxfR9ddQJgBXRjIHGCpp8+ZU13vdWK9eaZdQ2TQilub7jwObNug3RNJcSXMkHdmc0nqs3lcTtmjUJyLWAKuAEU2prve6s14AR+VdhBskbVWnvR11d93b0bslzZN0s6SduzNDM7/70ylJtwKb1Wk6q/ogIkJSo/PgoyNiiaRtgdslzY+IB0vXar32E+DaiFgt6eOkrbH3tLgma6xXX6/pN6ESEQc3apP0hKTNI2Jp3qx8ssEyluSfiyXNBnYj7ef3J935akJHn0clDQI2Jl2B3J91uV4RUV2Hy0nHygaCAfl1k4h4pnJ/pqRLJI2MiE6/QNkuuz8zgIn5/kTgx7UdJA2TNDjfHwnsA9zXtAq7rztfTaiu79HA7ZGPnPVjXa5XzXGGI4D7m1jfujQDODGfBXoXsKqyu962Ovl6TedafQS6m0epRwC3AQuBW4HhefpY4PJ8f29gPumsw3zg5FbX3cn6jAcWkLaizsrTJgNH5PtDgO8Di4DfAdu2uuZC6/UfwL35NboD2KHVNXdzva4FlgIvk46XnAx8AvhEbhfpn409mP/2xra65kLrdVrl9ZoD7N2d5foyfTMrql12f8ysTThUzKwoh4qZFeVQMbOiHCpmVpRDxcyKcqiYWVH/BxC2WzKmvdkuAAAAAElFTkSuQmCC\n", "text/plain": [ - "
" + "Accuracy 1.0\n", + "F1 Macro 1.0\n", + "Precision 1.0\n", + "Recall 1.0\n", + "dtype: float64" ] }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "y_test = []\n", - "y_pred = []\n", - "for i in range(0, 10):\n", - " y_test.extend(exe['pipeline0']['folds'][str(i)]['Actual'])\n", - " y_pred.extend(exe['pipeline0']['folds'][str(i)]['predicted'])\n", - "\n", - "y_test = pd.Categorical(pd.Series(y_test)).codes\n", - "y_pred = pd.Categorical(pd.Series(y_pred)).codes\n", - "\n", - "plt.title(\"Confusion Matrix (accuracy=%.2f)\" % accuracy_score(y_test, y_pred))\n", - "plt.imshow(pd.crosstab(y_test, y_pred), cmap=\"Blues\")\n", - "\n", - "plt.show()" + "cd.evaluate(X, y, fit=True, test_size=0.2, shuffle=True)" ] } ], @@ -797,7 +656,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.7.10" } }, "nbformat": 4, From 66d8bea6b06d61bbe8c23b2dbc662ddc77ef9b08 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 2 Apr 2021 14:33:48 -0400 Subject: [PATCH 07/13] fix lint --- cardea/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cardea/core.py b/cardea/core.py index 8110ce19..99e61e78 100644 --- a/cardea/core.py +++ b/cardea/core.py @@ -8,7 +8,6 @@ import os import pickle from functools import partial -from inspect import ismethod from types import FunctionType from typing import List, Union @@ -31,6 +30,7 @@ DEFAULT_PIPELINE = 'XGB' DEFAULT_METRICS = ["Accuracy", "F1 Macro", "Precision", "Recall"] + class Cardea: """Cardea Class. 
From 0da9e8f4561c4d4c9a5de48050869e4eba087711 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 7 Apr 2021 18:11:29 -0400 Subject: [PATCH 08/13] wip --- cardea/core.py | 46 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/cardea/core.py b/cardea/core.py index 99e61e78..c03a9d08 100644 --- a/cardea/core.py +++ b/cardea/core.py @@ -55,7 +55,7 @@ class Cardea: Additional hyperparameters to set to the pipeline. """ - def load_entityset(self, data_path: str, fhir: bool = False) -> None: + def load_entityset(self, data_path: str, fhir: bool = True) -> None: """Returns an entityset loaded with .csv files in data. Load the given dataset into an entityset. The dataset @@ -66,19 +66,22 @@ def load_entityset(self, data_path: str, fhir: bool = False) -> None: A directory of all .csv files that should be loaded. fhir (bool): An indicator whether FHIR or MIMIC schema is used. This parameter is - ignored when loading demo data. + ignored when loading demo data. Default is ``True``. Returns: featuretools.EntitySet: An entityset with loaded data. """ LOGGER.info("Loading data %s", data_path) + self._fhir = fhir if fhir: - self.es = self._es_loader.load_data_entityset(data_path) + es = self._es_loader.load_data_entityset(data_path) else: - self.es = load_mimic_data(data_path) + es = load_mimic_data(data_path) + + return es def _set_modeler(self): pipeline = self._pipeline @@ -111,8 +114,9 @@ def __init__(self, data: str = None, labeler: FunctionType = None, self._featurization = Featurization() self._modeler = None + self._fhir = True # default self._target = None - self._set_entityset() + self.es = self._set_entityset() def list_labelers(self) -> set: """Returns a list of the currently available data labelers. 
@@ -165,13 +169,24 @@ def create_label_times(self, labeler: FunctionType = None, return label_times - def generate_features(self, label_times: pd.DataFrame, - verbose: bool = False) -> pd.DataFrame: + def generate_feature_matrix(self, label_times: pd.DataFrame, + seed_features: Union[bool, list] = None, max_depth: int = 1, + max_features: int = -1, n_jobs: int = 1, + verbose: bool = False) -> pd.DataFrame: """Returns a the calculated feature matrix. Args: label_times (pandas.DataFrame): A dataframe that indicates cutoff time for each instance. + max_depth (int): + Maximum allowed depth of features. + max_features (int): + Cap the number of generated features to this number. If -1, no limit. + n_jobs (int): + Number of parallel processes to use when calculating feature matrix. + seed_features (bool or list): + List of manually defined features to use. If boolean, then use previously + created features as seed. verbose (bool): Indicate verbosity of the featurization. @@ -179,9 +194,12 @@ def generate_features(self, label_times: pd.DataFrame, pandas.DataFrame: Generated feature matrix. """ + if isinstance(seed_features, bool): + seed_features = self._fm_defs - fm, _ = self._featurization.generate_feature_matrix( - self.es, self._target, label_times, verbose=verbose) + fm, self._fm_defs = self._featurization.generate_feature_matrix( + self.es, self._target, label_times, seed_features=seed_features, max_depth=max_depth, + max_features=max_features, n_jobs=n_jobs, verbose=verbose) return fm @@ -247,17 +265,21 @@ def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series """ self._modeler.fit(X, y, tune, max_evals, scoring, verbose) - def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> Union[np.ndarray, list]: + def predict(self, X: Union[str, np.ndarray, pd.DataFrame]) -> Union[np.ndarray, list]: """Get predictions from the cardea pipeline. Args: - X (pandas.DataFrame or ndarray): - Inputs to the pipeline. 
+ X (str, pandas.DataFrame or ndarray): + Inputs to the pipeline. If string, it points to the data path. Returns: numpy.ndarray or list: Predictions to the input data. """ + if isinstance(X, str) and os.path.exits(X): + es = load_entityset(X, self._fhir) + + return self._modeler.predict(X) def fit_predict(self, X: Union[np.ndarray, pd.DataFrame], From 3ad41bc73c97883e681e35e7718725ecc6ea8486 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 9 Apr 2021 03:25:28 -0400 Subject: [PATCH 09/13] api (wip) --- README.md | 19 ++- cardea/core.py | 93 +++++------ cardea/data.py | 2 +- cardea/data_labeling/__init__.py | 2 +- .../{definition.py => data_labeler.py} | 22 +-- .../data_labeling/show_noshow_appointment.py | 9 +- cardea/functional.py | 150 ++++++++++++++++++ 7 files changed, 219 insertions(+), 78 deletions(-) rename cardea/data_labeling/{definition.py => data_labeler.py} (68%) create mode 100644 cardea/functional.py diff --git a/README.md b/README.md index 9d1ee7b0..72207207 100644 --- a/README.md +++ b/README.md @@ -67,9 +67,9 @@ In this short tutorial we will guide you through a series of steps that will hel First, load the core class to work with: ```python3 -from cardea import Cardea +from cardea.data import download -cardea = Cardea() +data_path = download('kaggle') ``` We then seamlessly plug in our data. Here in this example, we are loading a pre-processed version of the [Kaggle dataset: Medical Appointment No Shows](https://www.kaggle.com/joniarroba/noshowappointments). @@ -81,10 +81,13 @@ curl -O https://dai-cardea.s3.amazonaws.com/kaggle.zip && unzip -d kaggle kaggle To load the data, supply the ``data_path`` to the loader. 
By default, ``cardea`` loads the kaggle dataset ```python3 -cardea.load_entityset(data_path='cardea/data/kaggle', fhir=True) +from cardea import Cardea + +cardea = Cardea(data_path=data_path, + fhir=True) ``` -To verify that the data has been loaded, you can find the loaded entityset by viewing ``cardea.es`` which should output the following: +To verify that the data has been loaded, you can find the loaded entityset by viewing ``cardea.entityset`` which should output the following: ``` Entityset: kaggle @@ -112,7 +115,9 @@ The output shown represents the entityset data structure where ``cardea.es`` is From there, you can select the prediction problem you aim to solve by specifying the name of the class, which in return gives us the ``label_times`` of the problem. ```python3 -label_times = cardea.create_label_times() +from cardea.data_labeling import appointment_no_show + +label_times = cardea.label(appointment_no_show) ``` ``label_times`` summarizes for each instance in the dataset (1) what is its corresponding label of the instance and (2) what is the time index that indicates the timespan allowed for calculating features that pertain to each instance in the dataset. @@ -133,14 +138,14 @@ Then, you can perform the AutoML steps and take advantage of Cardea. Cardea extracts features through automated feature engineering by supplying the ``label_times`` pertaining to the problem you aim to solve ```python3 -feature_matrix = cardea.generate_features(label_times[:1000]) +feature_matrix = cardea.featurize(label_times[:1000]) ``` > :warning: Featurizing the data might take a while depending on the size of the data. For demonstration, we only featurize the first 1000 records. 
Once we have the features, we can now split the data into training and testing ```python3 -y = feature_matrix.pop('missed').values +y = feature_matrix.pop('label').values X = feature_matrix.values X_train, X_test, y_train, y_test = cardea.train_test_split( diff --git a/cardea/core.py b/cardea/core.py index c03a9d08..5bc49bf7 100644 --- a/cardea/core.py +++ b/cardea/core.py @@ -8,6 +8,7 @@ import os import pickle from functools import partial +from inspect import getfullargspec from types import FunctionType from typing import List, Union @@ -16,17 +17,13 @@ from mlblocks import MLPipeline import cardea -from cardea.data import DEMO_DATA, download from cardea.data_assembling import EntitySetLoader, load_mimic_data -from cardea.data_labeling import DataLabeler, appointment_no_show +from cardea.data_labeling import DataLabeler from cardea.featurizing import Featurization from cardea.modeling import Modeler LOGGER = logging.getLogger(__name__) -DEFAULT_DATA = 'kaggle' -DEFAULT_FHIR = True -DEFAULT_LABELER = appointment_no_show DEFAULT_PIPELINE = 'XGB' DEFAULT_METRICS = ["Accuracy", "F1 Macro", "Precision", "Recall"] @@ -39,12 +36,10 @@ class Cardea: pipelines. Args: - data (str): + data_path (str): Path or name of the dataset to load into an entityset. fhir (bool): Indicator of whether to use FHIR or MIMIC schema. - labeler (method): - Function to defined the data labeler for the wanted prediction problem. pipeline (str, dict or MLPipeline): Pipeline to use. It can be passed as: * An ``str`` with a path to a JSON file. @@ -55,15 +50,16 @@ class Cardea: Additional hyperparameters to set to the pipeline. """ - def load_entityset(self, data_path: str, fhir: bool = True) -> None: + def _load_entityset(self, data_path, fhir): """Returns an entityset loaded with .csv files in data. Load the given dataset into an entityset. The dataset must be in FHIR or MIMIC structure format. Args: - data_path (str): - A directory of all .csv files that should be loaded. 
+ data (str): + Path or name of the dataset to load into an entityset. Or + a preloaded entityset fhir (bool): An indicator whether FHIR or MIMIC schema is used. This parameter is ignored when loading demo data. Default is ``True``. @@ -72,16 +68,13 @@ def load_entityset(self, data_path: str, fhir: bool = True) -> None: featuretools.EntitySet: An entityset with loaded data. """ - LOGGER.info("Loading data %s", data_path) - self._fhir = fhir - + function = load_mimic_data if fhir: - es = self._es_loader.load_data_entityset(data_path) + function = self._es_loader.load_data_entityset - else: - es = load_mimic_data(data_path) + LOGGER.info("Loading data %s", data_path) - return es + return function(data_path) def _set_modeler(self): pipeline = self._pipeline @@ -95,18 +88,8 @@ def _set_modeler(self): self._modeler = Modeler(mlpipeline, self._type) - def _set_entityset(self): - data = self._data - - if data in DEMO_DATA: - fhir = False if data == "mimic" else True - data_path = download(data) - - self.load_entityset(data_path, fhir) - - def __init__(self, data: str = None, labeler: FunctionType = None, + def __init__(self, data_path: str = None, labeler: FunctionType = None, fhir: bool = True, pipeline: Union[str, dict, MLPipeline] = None, hyperparameters: dict = None): - self._data = data or DEFAULT_DATA self._pipeline = pipeline or DEFAULT_PIPELINE self._hyperparameters = hyperparameters @@ -114,9 +97,11 @@ def __init__(self, data: str = None, labeler: FunctionType = None, self._featurization = Featurization() self._modeler = None - self._fhir = True # default + self._fhir = fhir self._target = None - self.es = self._set_entityset() + + # load dataset + self.entityset = self._load_entityset(data_path, fhir) def list_labelers(self) -> set: """Returns a list of the currently available data labelers. 
@@ -134,8 +119,9 @@ def list_labelers(self) -> set: return labelers - def create_label_times(self, labeler: FunctionType = None, - parameter: dict = None) -> pd.DataFrame: + def label(self, labeler: FunctionType, + parameters: dict = None, + verbose: bool = False) -> pd.DataFrame: """Create label times using the data labeler. Update the labeling function and generate the label times, @@ -152,27 +138,26 @@ def create_label_times(self, labeler: FunctionType = None, pandas.DataFrame: A dataframe of cutoff times and their target labels. """ - labeler = labeler or DEFAULT_LABELER - - if parameter: - labeler = partial(labeler, **parameter) + if parameters: + labeler = partial(labeler, **parameters) LOGGER.info("Using labeler %s", str(labeler.__name__)) data_labeler = DataLabeler(labeler) # target label calculation - label_times, self._target, self._type = data_labeler.generate_label_times( - self.es) + label_times, self._type, self._meta = data_labeler.generate_label_times( + self.entityset, verbose=verbose) - # set default pipeline - self._set_modeler() + # set modeler if pipeline defined + if self._pipeline: + self._set_modeler() return label_times - def generate_feature_matrix(self, label_times: pd.DataFrame, - seed_features: Union[bool, list] = None, max_depth: int = 1, - max_features: int = -1, n_jobs: int = 1, - verbose: bool = False) -> pd.DataFrame: + def featurize(self, label_times: pd.DataFrame, + seed_features: Union[bool, list] = None, max_depth: int = 1, + max_features: int = -1, n_jobs: int = 1, + verbose: bool = False) -> pd.DataFrame: """Returns a the calculated feature matrix. 
Args: @@ -197,9 +182,14 @@ def generate_feature_matrix(self, label_times: pd.DataFrame, if isinstance(seed_features, bool): seed_features = self._fm_defs - fm, self._fm_defs = self._featurization.generate_feature_matrix( - self.es, self._target, label_times, seed_features=seed_features, max_depth=max_depth, - max_features=max_features, n_jobs=n_jobs, verbose=verbose) + method = self._featurization.generate_feature_matrix + target = self._meta.get('entity') + arguments = set(getfullargspec(method)[0]) - set(getfullargspec(self.featurize)[0]) + kwargs = {k: self._meta.get(k) for k in arguments if self._meta.get(k) is not None} + fm, self._fm_defs = method( + self.entityset, target, label_times, + seed_features=seed_features, max_depth=max_depth, + max_features=max_features, n_jobs=n_jobs, verbose=verbose, **kwargs) return fm @@ -276,10 +266,6 @@ def predict(self, X: Union[str, np.ndarray, pd.DataFrame]) -> Union[np.ndarray, numpy.ndarray or list: Predictions to the input data. """ - if isinstance(X, str) and os.path.exits(X): - es = load_entityset(X, self._fhir) - - return self._modeler.predict(X) def fit_predict(self, X: Union[np.ndarray, pd.DataFrame], @@ -351,8 +337,7 @@ def evaluate(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.S X_train, y_train, tune=tune, max_evals=max_evals, scoring=scoring, verbose=verbose) else: - X_test = X - y_test = y + X_test, y_test = X, y scores = { metric: self._modeler.test(X_test, y_test, scoring=metric) diff --git a/cardea/data.py b/cardea/data.py index eb04677f..45452112 100644 --- a/cardea/data.py +++ b/cardea/data.py @@ -26,7 +26,7 @@ BUCKET = 'dai-cardea' S3_URL = 'https://{}.s3.amazonaws.com/{}' -DEMO_DATA = ("kaggle", "mimic") +DEMO_DATA = ("kaggle", "mimic", "dummy") def download(name, data_path=DATA_PATH): diff --git a/cardea/data_labeling/__init__.py b/cardea/data_labeling/__init__.py index 095de682..e7e98f7e 100644 --- a/cardea/data_labeling/__init__.py +++ b/cardea/data_labeling/__init__.py @@ -1,5 
+1,5 @@ # import logging -from cardea.data_labeling.definition import DataLabeler +from cardea.data_labeling.data_labeler import DataLabeler from cardea.data_labeling.length_of_stay import length_of_stay from cardea.data_labeling.mortality_prediction import mortality from cardea.data_labeling.predicting_diagnosis import diagnosis_prediction diff --git a/cardea/data_labeling/definition.py b/cardea/data_labeling/data_labeler.py similarity index 68% rename from cardea/data_labeling/definition.py rename to cardea/data_labeling/data_labeler.py index c14eab7d..c8fece87 100644 --- a/cardea/data_labeling/definition.py +++ b/cardea/data_labeling/data_labeler.py @@ -18,7 +18,7 @@ class DataLabeler: def __init__(self, function): self.function = function - def generate_label_times(self, es, *args, **kwargs): + def generate_label_times(self, es, verbose, *args, **kwargs): """Searches the data to calculate label times. Args: @@ -34,19 +34,19 @@ def generate_label_times(self, es, *args, **kwargs): """ labeling_function, df, meta = self.function(es) kwargs = {**meta, **kwargs} - kwargs.get('target_entity') - time_index = kwargs.get('time_index') - kwargs.get('window_size') - thresh = kwargs.get('thresh') + target_entity = meta.get('target_entity') + time_index = meta.get('time_index') + window_size = meta.get('window_size') + thresh = meta.get('thresh') + pred_type = meta.get('type') label_maker = cp.LabelMaker(labeling_function=labeling_function, - target_entity=kwargs.get('target_entity'), - time_index=kwargs.get('time_index'), - window_size=kwargs.get('window_size')) + target_entity=target_entity, + time_index=time_index, + window_size=window_size) label_times = label_maker.search(df.sort_values(time_index), - *args, - **kwargs) + verbose=verbose, *args, **kwargs) if thresh is not None: label_times = label_times.threshold(thresh) - return label_times, kwargs.get('entity'), kwargs.get('type') + return label_times, pred_type, meta diff --git 
a/cardea/data_labeling/show_noshow_appointment.py b/cardea/data_labeling/show_noshow_appointment.py index b1c23b97..ba44c140 100644 --- a/cardea/data_labeling/show_noshow_appointment.py +++ b/cardea/data_labeling/show_noshow_appointment.py @@ -6,7 +6,7 @@ def appointment_no_show(es): """Defines the labeling task of appointment no show. """ - def missed(ds, **kwargs): + def label(ds, **kwargs): return True if 'noshow' in ds["status"].values else False if es.id == 'mimic': @@ -14,12 +14,13 @@ def missed(ds, **kwargs): meta = { "entity": "Appointment", - "target_entity": "identifier", # automatically, should this be the index of the table? + "target_entity": "identifier", "time_index": "created", "type": "classification", - "num_examples_per_instance": 1 + "num_examples_per_instance": 1, + "ignore_variables": {'Appointment': ['status']} } df = denormalize(es, entities=['Appointment']) - return missed, df, meta + return label, df, meta diff --git a/cardea/functional.py b/cardea/functional.py new file mode 100644 index 00000000..94ec9153 --- /dev/null +++ b/cardea/functional.py @@ -0,0 +1,150 @@ +"""Cardea Functional API. + +This module provides a collection of simple python functions that +allow using Cardea performing as little steps as possible. The +API is oriented around various prediction problems. 
+""" +import logging +from typing import List, Union + +import pandas as pd +from mlblocks import MLPipeline + +from cardea import Cardea +from cardea.core import DEFAULT_METRICS, DEFAULT_PIPELINE +from cardea.data_labeling import appointment_no_show + +LOGGER = logging.getLogger(__name__) + + +def _run(cls, labeler, max_depth, max_features, n_jobs, test_size, shuffle, tune, max_evals, + scoring, evaluate, metrics, return_lt, return_fm, return_pred, verbose): + output = dict() + # labeling + label_times = cls.label(labeler, verbose=verbose) + if return_lt: + output['label_times'] = label_times + + # featurizing + fm = cls.featurize(label_times, max_depth=max_depth, max_features=max_features, + n_jobs=n_jobs, verbose=verbose) + if return_fm: + output['feature_matrix'] = fm + + # modeling + y = fm.pop('label').values + X = fm.values + X_train, X_test, y_train, y_test = cls.train_test_split( + X, y, test_size=test_size, shuffle=shuffle) + + if test_size == 0.: + LOGGER.info("Setting test data equal to train data") + X_test, y_test = X_train, y_train + + cls.fit(X_train, y_train, tune=tune, max_evals=max_evals, scoring=scoring, verbose=verbose) + + if return_pred: + y_pred = cls.predict(X_test) + output['prediction'] = y_pred + + if evaluate: + result = cls.evaluate(X=X_test, y=y_test, fit=False, metrics=metrics) + output['evaluate'] = result + + if len(output) > 0: + return output + + return None + + +def model_appnoshow(data_path: str, fhir: bool = False, + pipeline: Union[str, dict, MLPipeline] = DEFAULT_PIPELINE, + hyperparameters: Union[str, pd.DataFrame] = None, max_depth: int = 1, + max_features: int = -1, n_jobs: int = 1, test_size: float = 0.2, + shuffle: bool = True, tune: bool = False, max_evals: int = 10, + scoring: str = None, evaluate: bool = False, + metrics: List[str] = DEFAULT_METRICS, return_lt: bool = False, + return_fm: bool = False, return_pred: bool = False, verbose: bool = False, + save_path: str = None) -> Cardea: + """Create and train an 
appointment no show cardea instance. + + Return a cardea class object that has been trained on the given + dataset. The function loads the data, extracts label times, generates + features, then trains the pipeline all in one command. + + Args: + data_path (str): + A directory of all .csv files that should be loaded. + fhir (bool): + An indicator whether FHIR or MIMIC schema is used. + pipeline (str or MLPipeline or dict): + Pipeline to use. It can be passed as: + * An ``str`` with a path to a JSON file. + * An ``str`` with the name of a registered pipeline. + * An ``str`` with the path to a pickle file. + * An ``MLPipeline`` instance. + * A ``dict`` with an ``MLPipeline`` specification. + hyperparameters (str or dict): + Hyperparameters to set to the pipeline. It can be passed as + a hyperparameters ``dict`` in the ``mlblocks`` format or as + a path to the corresponding JSON file. Defaults to ``None``. + max_depth (int): + Maximum allowed depth of features. + max_features (int): + Cap to the number of generated features. If -1, no limit. + n_jobs (int): + Number of parallel processes to use when calculating the + feature matrix. + test_size (float): + The proportion of the dataset to include in the test dataset. + shuffle (bool): + Whether or not to shuffle the data before splitting. + tune (bool): + Whether to optimize hyper-parameters of the pipelines. + max_evals (int): + Maximum number of hyper-parameter optimization iterations. + scoring (str): + The name of the scoring function used in the hyper-parameter + optimization. + evaluate (bool): + Whether to evaluate the performance of the pipeline. If True, + we evaluate the performance on the test data, if not given, + evaluate on train data. + metrics (list): + A list of scoring function names. The scoring functions should + be consistent with the problem type. + return_lt (bool): + Whether to return ``label_times``. + return_fm (bool): + Whether to return the calculated feature matrix. 
+ return_pred (bool): + Whether to return the predictions of the pipeline. + verbose (bool): + Whether to log information during processing. + save_path (str): + Path to the file where the fitted pipeline will be stored + using ``pickle``. + + Returns: + Cardea: + A fitted Cardea instance. + """ + + cardea = Cardea(data_path=data_path, + fhir=fhir, + pipeline=pipeline, + hyperparameters=hyperparameters) + + # define labeler + labeler = appointment_no_show + output = _run(cardea, labeler, max_depth, max_features, n_jobs, test_size, shuffle, tune, + max_evals, scoring, evaluate, metrics, return_lt, return_fm, return_pred, + verbose) + + if save_path: + cardea.save(save_path) + + if len(output) > 0: + return cardea, output + + return cardea From 14cbc2371543ab7bc3e900435bec70058985ebaa Mon Sep 17 00:00:00 2001 From: sarahmish Date: Wed, 14 Apr 2021 02:19:22 -0400 Subject: [PATCH 10/13] added unittests --- cardea/core.py | 2 +- cardea/data_labeling/data_labeler.py | 17 +++-- cardea/data_labeling/utils.py | 5 ++ tests/featurizing/test_featurization.py | 8 +-- tests/test_core.py | 94 +++++++++++++++++++++++++ 5 files changed, 112 insertions(+), 14 deletions(-) create mode 100644 tests/test_core.py diff --git a/cardea/core.py b/cardea/core.py index 5bc49bf7..c5da7544 100644 --- a/cardea/core.py +++ b/cardea/core.py @@ -360,7 +360,7 @@ def save(self, path: str): @classmethod def load(cls, path: str): - """Load an Orion instance from a pickle file. + """Load a Cardea instance from a pickle file. Args: path (str): diff --git a/cardea/data_labeling/data_labeler.py b/cardea/data_labeling/data_labeler.py index c8fece87..2cf569b8 100644 --- a/cardea/data_labeling/data_labeler.py +++ b/cardea/data_labeling/data_labeler.py @@ -1,5 +1,6 @@ import composeml as cp +from cardea.data_labeling.utils import _get_arguments class DataLabeler: """Class that defines the prediction problem. 
@@ -18,22 +19,18 @@ class DataLabeler: def __init__(self, function): self.function = function - def generate_label_times(self, es, verbose, *args, **kwargs): + def generate_label_times(self, es, verbose, **kwargs): """Searches the data to calculate label times. Args: - df (pandas.DataFrame): - Data frame to search and extract labels. - *args: - Positional arguments for label maker. - **kwargs: - Keyword arguments for label maker. + es (featuretools.EntitySet): + Entityset to extract `label_times` from. + Returns: composeml.LabelTimes: Calculated labels with cutoff times. """ labeling_function, df, meta = self.function(es) - kwargs = {**meta, **kwargs} target_entity = meta.get('target_entity') time_index = meta.get('time_index') window_size = meta.get('window_size') @@ -44,8 +41,10 @@ def generate_label_times(self, es, verbose, *args, **kwargs): time_index=time_index, window_size=window_size) + kwargs = {**meta, **kwargs} + kwargs = _get_arguments(kwargs, label_maker.search) label_times = label_maker.search(df.sort_values(time_index), - verbose=verbose, *args, **kwargs) + verbose=verbose, **kwargs) if thresh is not None: label_times = label_times.threshold(thresh) diff --git a/cardea/data_labeling/utils.py b/cardea/data_labeling/utils.py index 70664428..ebc7b020 100644 --- a/cardea/data_labeling/utils.py +++ b/cardea/data_labeling/utils.py @@ -1,5 +1,10 @@ import pandas as pd +from inspect import getfullargspec + +def _get_arguments(arguments, function): + function_arguments = set(getfullargspec(function)[0]) + return {k: arguments.get(k) for k in function_arguments if arguments.get(k) is not None} def _search_relationship(es, left, right): for r in es.relationships: diff --git a/tests/featurizing/test_featurization.py b/tests/featurizing/test_featurization.py index ad0313c0..3056c8bd 100644 --- a/tests/featurizing/test_featurization.py +++ b/tests/featurizing/test_featurization.py @@ -59,13 +59,13 @@ def featurization(): return Featurization() -def 
test_generate_feature_matrix(featurization, entityset, cutoff): +def test_generate_feature_matrix(featurization, entityset, label_times): feature_matrix, features_defs = featurization.generate_feature_matrix( - entityset, "Encounter", cutoff, encode=False) + entityset, "Encounter", label_times, encode=False) assert len(feature_matrix) == 3 and len(feature_matrix.columns) == 12 -def test_generate_feature_matrix_encoded(featurization, entityset, cutoff): +def test_generate_feature_matrix_encoded(featurization, entityset, label_times): fm_encoded, features_encoded = featurization.generate_feature_matrix( - entityset, "Encounter", cutoff, encode=True) + entityset, "Encounter", label_times, encode=True) assert len(fm_encoded) == 3 and len(fm_encoded.columns) == 32 diff --git a/tests/test_core.py b/tests/test_core.py new file mode 100644 index 00000000..2d357371 --- /dev/null +++ b/tests/test_core.py @@ -0,0 +1,94 @@ +import os + +import numpy as np +import pandas as pd +from mlblocks.discovery import load_pipeline +from sklearn.datasets import load_iris + +from cardea.data import download +from cardea.core import Cardea + +def prediction_problem_function(es): + def label(ds): + return False + + meta = { + "entity": "Appointment", + "target_entity": "identifier", + "time_index": "created", + "type": "classification", + "num_examples_per_instance": 1 + } + + df = es['Appointment'].df.iloc[:100] + + return label, df, meta + +class TestCardea: + + @classmethod + def setup_class(cls): + cls.X, cls.y = load_iris(return_X_y=True) + + def setup(self): + data_path = download('kaggle') + self.cardea = Cardea(data_path, True) + self.label_times = self.cardea.label(prediction_problem_function) + self.cardea.fit(self.X, self.y) + + def test__load_entityset(self): + es = self.cardea.entityset + assert len(es.entities) == 9 + assert len(es.relationships) == 6 + + def test_list_labelers(self): + labelers = self.cardea.list_labelers() + assert isinstance(labelers, set) + + def 
test_label(self): + assert len(self.label_times) == 100 + + def test_featurize(self): + label_times = self.label_times.iloc[:10] + feature_matrix = self.cardea.featurize(label_times) + assert len(feature_matrix) == 10 + + def test_set_pipeline(self): + pipeline = "Random Forest" + self.cardea.set_pipeline(pipeline) + + def test_fit(self): + self.cardea.fit(self.X, self.y) + + def test_predict(self): + y = self.cardea.predict(self.X) + assert self.y.shape == y.shape + + def test_fit_predict(self): + y = self.cardea.fit_predict(self.X, self.y) + assert self.y.shape == y.shape + + def test_train_test_split(self): + test_size = 0.2 + X_train, X_test, y_train, y_test = self.cardea.train_test_split(self.X, self.y) + assert X_train.shape[1] == X_test.shape[1] + assert len(X_train) == len(y_train) + assert len(X_test) == len(y_test) + + def test_evaluate(self): + results = self.cardea.evaluate(self.X, self.y) + assert isinstance(results, pd.Series) + assert len(results) == 4 + + def test_evaluate_fit(self): + results = self.cardea.evaluate(self.X, self.y, fit=True) + assert isinstance(results, pd.Series) + assert len(results) == 4 + + def test_save_load(self, tmpdir): + path = os.path.join(tmpdir, 'some/path.pkl') + self.cardea.save(path) + + new_cardea = Cardea.load(path) + assert new_cardea.entityset == self.cardea.entityset + From 7f6078e30d42d59cfd3147736f8f8cf20bb9d8db Mon Sep 17 00:00:00 2001 From: sarahmish Date: Fri, 16 Apr 2021 16:36:12 -0400 Subject: [PATCH 11/13] update unittests --- cardea/data_labeling/__init__.py | 6 +- ..._appointment.py => appointment_no_show.py} | 4 +- .../{predicting_diagnosis.py => diagnosis.py} | 4 +- cardea/data_labeling/length_of_stay.py | 10 +-- .../{mortality_prediction.py => mortality.py} | 6 +- cardea/data_labeling/readmission.py | 4 +- .../data_labeling/test_appointment_no_show.py | 27 +++++++ tests/data_labeling/test_data_labeler.py | 22 ++++++ tests/data_labeling/test_definition.py | 6 -- ...tality_prediction.py => 
test_diagnosis.py} | 0 tests/data_labeling/test_length_of_stay.py | 79 +++++++++++++++++++ ...dicting_diagnosis.py => test_mortality.py} | 0 .../test_show_noshow_appointment.py | 2 - 13 files changed, 145 insertions(+), 25 deletions(-) rename cardea/data_labeling/{show_noshow_appointment.py => appointment_no_show.py} (84%) rename cardea/data_labeling/{predicting_diagnosis.py => diagnosis.py} (93%) rename cardea/data_labeling/{mortality_prediction.py => mortality.py} (93%) create mode 100644 tests/data_labeling/test_appointment_no_show.py create mode 100644 tests/data_labeling/test_data_labeler.py delete mode 100644 tests/data_labeling/test_definition.py rename tests/data_labeling/{test_mortality_prediction.py => test_diagnosis.py} (100%) rename tests/data_labeling/{test_predicting_diagnosis.py => test_mortality.py} (100%) delete mode 100644 tests/data_labeling/test_show_noshow_appointment.py diff --git a/cardea/data_labeling/__init__.py b/cardea/data_labeling/__init__.py index e7e98f7e..e5a4f8b3 100644 --- a/cardea/data_labeling/__init__.py +++ b/cardea/data_labeling/__init__.py @@ -1,7 +1,7 @@ # import logging +from cardea.data_labeling.appointment_no_show import appointment_no_show from cardea.data_labeling.data_labeler import DataLabeler +from cardea.data_labeling.diagnosis import diagnosis_prediction from cardea.data_labeling.length_of_stay import length_of_stay -from cardea.data_labeling.mortality_prediction import mortality -from cardea.data_labeling.predicting_diagnosis import diagnosis_prediction +from cardea.data_labeling.mortality import mortality_prediction from cardea.data_labeling.readmission import readmission -from cardea.data_labeling.show_noshow_appointment import appointment_no_show diff --git a/cardea/data_labeling/show_noshow_appointment.py b/cardea/data_labeling/appointment_no_show.py similarity index 84% rename from cardea/data_labeling/show_noshow_appointment.py rename to cardea/data_labeling/appointment_no_show.py index ba44c140..45e0439e 
100644 --- a/cardea/data_labeling/show_noshow_appointment.py +++ b/cardea/data_labeling/appointment_no_show.py @@ -1,6 +1,6 @@ -from cardea.data_labeling.utils import denormalize +from cardea.data_labeling import utils def appointment_no_show(es): @@ -21,6 +21,6 @@ def label(ds, **kwargs): "ignore_variables": {'Appointment': ['status']} } - df = denormalize(es, entities=['Appointment']) + df = utils.denormalize(es, entities=['Appointment']) return label, df, meta diff --git a/cardea/data_labeling/predicting_diagnosis.py b/cardea/data_labeling/diagnosis.py similarity index 93% rename from cardea/data_labeling/predicting_diagnosis.py rename to cardea/data_labeling/diagnosis.py index 2e566237..81ee1817 100644 --- a/cardea/data_labeling/predicting_diagnosis.py +++ b/cardea/data_labeling/diagnosis.py @@ -20,7 +20,7 @@ def diagnosis_prediction(es, diag): Predict how many days the patient will be in the hospital. For a classification version of the problem, specify k. """ - def diagnosis(ds, **kwargs): + def label(ds, **kwargs): return True if diag in ds[column].values else False if es.id == 'mimic': @@ -39,4 +39,4 @@ def diagnosis(ds, **kwargs): df = denormalize(es, entities=entities) - return diagnosis, df, meta + return label, df, meta diff --git a/cardea/data_labeling/length_of_stay.py b/cardea/data_labeling/length_of_stay.py index a37751ee..a0153ab4 100644 --- a/cardea/data_labeling/length_of_stay.py +++ b/cardea/data_labeling/length_of_stay.py @@ -1,6 +1,6 @@ import pandas as pd -from cardea.data_labeling.utils import denormalize +from cardea.data_labeling import utils MIMIC_META = { 'entity': 'admissions', @@ -21,7 +21,7 @@ def length_of_stay(es, k=None): Predict how many days the patient will be in the hospital. For a classification version of the problem, specify k. 
""" - def los(ds, **kwargs): + def label(ds, **kwargs): return (ds['los'].dt.days).sum() if es.id == 'mimic': @@ -32,7 +32,7 @@ def los(ds, **kwargs): elif es.id == 'fhir': meta = FHIR_META - entities = ['encounter', 'period'] + entities = ['Encounter', 'Period'] start = 'start' end = 'end' @@ -43,11 +43,11 @@ def los(ds, **kwargs): meta['type'] = 'classification' meta['thresh'] = k - df = denormalize(es, entities=entities) + df = utils.denormalize(es, entities=entities) # generate label df[end] = pd.to_datetime(df[end]) df[start] = pd.to_datetime(df[start]) df['los'] = df[end] - df[start] - return los, df, meta + return label, df, meta diff --git a/cardea/data_labeling/mortality_prediction.py b/cardea/data_labeling/mortality.py similarity index 93% rename from cardea/data_labeling/mortality_prediction.py rename to cardea/data_labeling/mortality.py index aba9309b..a1069302 100644 --- a/cardea/data_labeling/mortality_prediction.py +++ b/cardea/data_labeling/mortality.py @@ -14,13 +14,13 @@ } -def mortality(es): +def mortality_prediction(es): """Defines the labeling task of length of stay. Predict how many days the patient will be in the hospital. For a classification version of the problem, specify k. """ - def mortal(ds, **kwargs): + def label(ds, **kwargs): return ds['hospital_expire_flag'].sum() > 0 if es.id == 'mimic': @@ -44,4 +44,4 @@ def mortal(ds, **kwargs): df['hospital_expire_flag'] = int(df['code'].isin(causes_of_death)) - return mortal, df, meta + return label, df, meta diff --git a/cardea/data_labeling/readmission.py b/cardea/data_labeling/readmission.py index 62fc5f90..12a622e7 100644 --- a/cardea/data_labeling/readmission.py +++ b/cardea/data_labeling/readmission.py @@ -21,7 +21,7 @@ def readmission(es, k=30): Predict how many days the patient will be in the hospital. For a classification version of the problem, specify k. 
""" - def readmit(ds, **kwargs): + def label(ds, **kwargs): initial_discharge = min(ds[end].values) second_admission = sorted(ds[start].values)[1] return (second_admission - initial_discharge).dt.days @@ -48,4 +48,4 @@ def readmit(ds, **kwargs): df[end] = pd.to_datetime(df[end]) df[start] = pd.to_datetime(df[start]) - return readmit, df, meta + return label, df, meta diff --git a/tests/data_labeling/test_appointment_no_show.py b/tests/data_labeling/test_appointment_no_show.py new file mode 100644 index 00000000..3942b1bc --- /dev/null +++ b/tests/data_labeling/test_appointment_no_show.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from types import FunctionType +from unittest.mock import Mock, patch + +import featuretools as ft +import pandas as pd + +from cardea.data_labeling import appointment_no_show + + +@patch('cardea.data_labeling.utils.denormalize') +def test_appointment_no_show(denormalize_mock): + es = Mock(autospec=ft.EntitySet) + df = Mock(autospec=pd.DataFrame) + denormalize_mock.return_value = df + + returned = appointment_no_show(es) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(meta, dict) diff --git a/tests/data_labeling/test_data_labeler.py b/tests/data_labeling/test_data_labeler.py new file mode 100644 index 00000000..21aa44a2 --- /dev/null +++ b/tests/data_labeling/test_data_labeler.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from unittest.mock import Mock + +import featuretools as ft + +from cardea.data_labeling import DataLabeler + + +class TestDataLabeler: + + @classmethod + def setup_class(cls): + cls.function = lambda x: x + cls.es = Mock(autospec=ft.EntitySet) + cls.subset = None + cls.verbose = False + + def test_data_labeler(self): + def function(x): return x + DataLabeler(function) diff --git a/tests/data_labeling/test_definition.py 
b/tests/data_labeling/test_definition.py deleted file mode 100644 index 654c0c6e..00000000 --- a/tests/data_labeling/test_definition.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - - -def test_data_labeler(): - pass diff --git a/tests/data_labeling/test_mortality_prediction.py b/tests/data_labeling/test_diagnosis.py similarity index 100% rename from tests/data_labeling/test_mortality_prediction.py rename to tests/data_labeling/test_diagnosis.py diff --git a/tests/data_labeling/test_length_of_stay.py b/tests/data_labeling/test_length_of_stay.py index faa18be5..db2d882c 100644 --- a/tests/data_labeling/test_length_of_stay.py +++ b/tests/data_labeling/test_length_of_stay.py @@ -1,2 +1,81 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +from types import FunctionType +from unittest.mock import Mock, patch + +import featuretools as ft +import pandas as pd + +from cardea.data_labeling import length_of_stay + + +@patch('cardea.data_labeling.utils.denormalize') +def test_length_of_stay_fhir(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id="fhir") + + df = pd.DataFrame({ + 'col 1': range(5), + 'start': range(5), + 'end': range(5) + }) + denormalize_mock.return_value = df + + returned = length_of_stay(es) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(dataframe, pd.DataFrame) + assert isinstance(meta, dict) + + +@patch('cardea.data_labeling.utils.denormalize') +def test_length_of_stay_mimic(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id="mimic") + + df = pd.DataFrame({ + 'col 1': range(5), + 'admittime': range(5), + 'dischtime': range(5) + }) + denormalize_mock.return_value = df + + returned = length_of_stay(es) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(dataframe, 
pd.DataFrame) + assert isinstance(meta, dict) + + +@patch('cardea.data_labeling.utils.denormalize') +def test_length_of_stay_classification(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id="mimic") + + df = pd.DataFrame({ + 'col 1': range(5), + 'admittime': range(5), + 'dischtime': range(5) + }) + denormalize_mock.return_value = df + + returned = length_of_stay(es, k=7) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(dataframe, pd.DataFrame) + assert isinstance(meta, dict) + assert meta['type'] == 'classification' + assert meta['thresh'] == 7 diff --git a/tests/data_labeling/test_predicting_diagnosis.py b/tests/data_labeling/test_mortality.py similarity index 100% rename from tests/data_labeling/test_predicting_diagnosis.py rename to tests/data_labeling/test_mortality.py diff --git a/tests/data_labeling/test_show_noshow_appointment.py b/tests/data_labeling/test_show_noshow_appointment.py deleted file mode 100644 index faa18be5..00000000 --- a/tests/data_labeling/test_show_noshow_appointment.py +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- From 4aa6c5d8c93aa6e69827ad97eab9f5dbec21b8bd Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 20 Apr 2021 00:44:24 -0400 Subject: [PATCH 12/13] fix lint --- cardea/data.py | 2 +- tests/data_labeling/test_data_labeler.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cardea/data.py b/cardea/data.py index 45452112..eb04677f 100644 --- a/cardea/data.py +++ b/cardea/data.py @@ -26,7 +26,7 @@ BUCKET = 'dai-cardea' S3_URL = 'https://{}.s3.amazonaws.com/{}' -DEMO_DATA = ("kaggle", "mimic", "dummy") +DEMO_DATA = ("kaggle", "mimic") def download(name, data_path=DATA_PATH): diff --git a/tests/data_labeling/test_data_labeler.py b/tests/data_labeling/test_data_labeler.py index 21aa44a2..bb8c5a34 100644 --- a/tests/data_labeling/test_data_labeler.py +++ 
b/tests/data_labeling/test_data_labeler.py @@ -18,5 +18,7 @@ def setup_class(cls): cls.verbose = False def test_data_labeler(self): - def function(x): return x + def function(x): + return x + DataLabeler(function) From f837f3c34be8b6e673bdcddae6e2c8286a5b98fb Mon Sep 17 00:00:00 2001 From: sarahmish Date: Tue, 20 Apr 2021 17:52:16 -0400 Subject: [PATCH 13/13] added ignore variables to mortality and readmission --- cardea/data_labeling/data_labeler.py | 11 ++--- cardea/data_labeling/diagnosis.py | 6 +-- cardea/data_labeling/mortality.py | 19 +++++---- cardea/data_labeling/readmission.py | 25 +++++++----- tests/data_labeling/test_diagnosis.py | 25 ++++++++++++ tests/data_labeling/test_mortality.py | 25 ++++++++++++ tests/data_labeling/test_readmission.py | 54 +++++++++++++++++++++++++ 7 files changed, 139 insertions(+), 26 deletions(-) diff --git a/cardea/data_labeling/data_labeler.py b/cardea/data_labeling/data_labeler.py index a8d1925c..0636cc54 100644 --- a/cardea/data_labeling/data_labeler.py +++ b/cardea/data_labeling/data_labeler.py @@ -20,12 +20,16 @@ class DataLabeler: def __init__(self, function): self.function = function - def generate_label_times(self, es, subset, verbose, **kwargs): + def generate_label_times(self, es, subset=None, verbose=False, **kwargs): """Searches the data to calculate label times. Args: es (featuretools.EntitySet): - Entityset to extract `label_times` from. + Entityset to extract `label_times` from. + subset (float or int): + Portion of the data to select for searching. + verbose: + An indicator to the verbosity of searching. 
Returns: composeml.LabelTimes: @@ -37,9 +41,6 @@ def generate_label_times(self, es, subset, verbose, **kwargs): if isinstance(subset, float) or isinstance(subset, int): data = data.sample(subset) - if isinstance(subset, list): - data = data[data['isinstance'].isin(subset)] - target_entity = meta.get('target_entity') time_index = meta.get('time_index') window_size = meta.get('window_size') diff --git a/cardea/data_labeling/diagnosis.py b/cardea/data_labeling/diagnosis.py index 81ee1817..842c90a0 100644 --- a/cardea/data_labeling/diagnosis.py +++ b/cardea/data_labeling/diagnosis.py @@ -1,5 +1,5 @@ -from cardea.data_labeling.utils import denormalize +from cardea.data_labeling import utils MIMIC_META = { 'entity': 'admissions', @@ -28,7 +28,7 @@ def label(ds, **kwargs): entities = ['admissions'] column = 'diagnosis' - elif es.id == 'fhir': + else: meta = FHIR_META entities = ['encounter', 'encounter_diagnosis', 'condition', 'codeableconcept', 'coding', 'period'] @@ -37,6 +37,6 @@ def label(ds, **kwargs): meta['type'] = 'classification' meta['num_examples_per_instance'] = 1 - df = denormalize(es, entities=entities) + df = utils.denormalize(es, entities=entities) return label, df, meta diff --git a/cardea/data_labeling/mortality.py b/cardea/data_labeling/mortality.py index a1069302..1d02fb72 100644 --- a/cardea/data_labeling/mortality.py +++ b/cardea/data_labeling/mortality.py @@ -1,10 +1,17 @@ -from cardea.data_labeling.utils import denormalize +from cardea.data_labeling import utils MIMIC_META = { 'entity': 'admissions', 'target_entity': 'hadm_id', 'time_index': 'admittime', + 'ignore_variables': {'admissions': [ + 'hospital_expire_flag', + 'deathtime', + 'discharge_location', + 'dischtime'], + 'patients': ['expire_flag'], + 'callout': ['discharge_wardid']} } FHIR_META = { @@ -15,10 +22,8 @@ def mortality_prediction(es): - """Defines the labeling task of length of stay. - - Predict how many days the patient will be in the hospital. 
For - a classification version of the problem, specify k. + """Defines the labeling task of mortality prediction. + Predict patient mortality from the point of admission. """ def label(ds, **kwargs): return ds['hospital_expire_flag'].sum() > 0 @@ -27,7 +32,7 @@ def label(ds, **kwargs): meta = MIMIC_META entities = ['admissions'] - elif es.id == 'fhir': + else: meta = FHIR_META entities = ['encounter', 'encounter_diagnosis', 'condition', 'codeableconcept', 'coding', 'period'] @@ -35,7 +40,7 @@ def label(ds, **kwargs): meta['type'] = 'classification' meta['num_examples_per_instance'] = 1 - df = denormalize(es, entities=entities) + df = utils.denormalize(es, entities=entities) # generate label if es.id == 'fhir': diff --git a/cardea/data_labeling/readmission.py b/cardea/data_labeling/readmission.py index 12a622e7..2090f9a2 100644 --- a/cardea/data_labeling/readmission.py +++ b/cardea/data_labeling/readmission.py @@ -1,10 +1,10 @@ import pandas as pd -from cardea.data_labeling.utils import denormalize +from cardea.data_labeling import utils MIMIC_META = { 'entity': 'admissions', - 'target_entity': 'patient_id', + 'target_entity': 'subject_id', 'time_index': 'dischtime' } @@ -16,15 +16,17 @@ def readmission(es, k=30): - """Defines the labeling task of length of stay. - - Predict how many days the patient will be in the hospital. For - a classification version of the problem, specify k. + """Defines the labeling task of readmission. + Predict whether or not the patient will get readmitted + into the hospital, you can specify the number of days + between one visit and another using k. 
""" def label(ds, **kwargs): - initial_discharge = min(ds[end].values) - second_admission = sorted(ds[start].values)[1] - return (second_admission - initial_discharge).dt.days + if len(ds) < 2: + return 0 + initial_discharge = min(ds.index) + second_admission = sorted(ds[start])[1] + return (second_admission - initial_discharge).days if es.id == 'mimic': meta = MIMIC_META @@ -32,7 +34,7 @@ def label(ds, **kwargs): start = 'admittime' end = 'dischtime' - elif es.id == 'fhir': + else: meta = FHIR_META entities = ['encounter', 'period'] start = 'start' @@ -41,8 +43,9 @@ def label(ds, **kwargs): meta['type'] = 'classification' meta['thresh'] = k meta['num_examples_per_instance'] = 2 + meta['window_size'] = 2 - df = denormalize(es, entities=entities) + df = utils.denormalize(es, entities=entities) # generate label df[end] = pd.to_datetime(df[end]) diff --git a/tests/data_labeling/test_diagnosis.py b/tests/data_labeling/test_diagnosis.py index faa18be5..b0e56543 100644 --- a/tests/data_labeling/test_diagnosis.py +++ b/tests/data_labeling/test_diagnosis.py @@ -1,2 +1,27 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +from types import FunctionType +from unittest.mock import Mock, patch + +import featuretools as ft +import pandas as pd + +from cardea.data_labeling import diagnosis_prediction + + +@patch('cardea.data_labeling.utils.denormalize') +def test_mortality_prediction_mimic(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id='mimic') + df = Mock(autospec=pd.DataFrame) + denormalize_mock.return_value = df + + returned = diagnosis_prediction(es, 'disease') + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(meta, dict) diff --git a/tests/data_labeling/test_mortality.py b/tests/data_labeling/test_mortality.py index faa18be5..2f971c21 100644 --- a/tests/data_labeling/test_mortality.py +++ b/tests/data_labeling/test_mortality.py @@ -1,2 +1,27 
@@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +from types import FunctionType +from unittest.mock import Mock, patch + +import featuretools as ft +import pandas as pd + +from cardea.data_labeling import mortality_prediction + + +@patch('cardea.data_labeling.utils.denormalize') +def test_mortality_prediction_mimic(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id='mimic') + df = Mock(autospec=pd.DataFrame) + denormalize_mock.return_value = df + + returned = mortality_prediction(es) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(meta, dict) diff --git a/tests/data_labeling/test_readmission.py b/tests/data_labeling/test_readmission.py index faa18be5..f27fc3ce 100644 --- a/tests/data_labeling/test_readmission.py +++ b/tests/data_labeling/test_readmission.py @@ -1,2 +1,56 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + +from types import FunctionType +from unittest.mock import Mock, patch + +import featuretools as ft +import pandas as pd + +from cardea.data_labeling import readmission + + +@patch('cardea.data_labeling.utils.denormalize') +def test_readmission_fhir(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id="fhir") + + df = pd.DataFrame({ + 'col 1': range(5), + 'start': range(5), + 'end': range(5) + }) + denormalize_mock.return_value = df + + returned = readmission(es) + + assert isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(dataframe, pd.DataFrame) + assert isinstance(meta, dict) + + +@patch('cardea.data_labeling.utils.denormalize') +def test_readmission_mimic(denormalize_mock): + es = Mock(autospec=ft.EntitySet, id="mimic") + + df = pd.DataFrame({ + 'col 1': range(5), + 'admittime': range(5), + 'dischtime': range(5) + }) + denormalize_mock.return_value = df + + returned = readmission(es) + + assert 
isinstance(returned, tuple) + assert len(returned) == 3 + + function, dataframe, meta = returned + + assert isinstance(function, FunctionType) + assert isinstance(dataframe, pd.DataFrame) + assert isinstance(meta, dict)