modularize split_n_preprocessor file #6

Open
wants to merge 10 commits into main
96 changes: 25 additions & 71 deletions scripts/split_n_preprocess.py
@@ -6,90 +6,44 @@
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.split_train_test import split_train_test_data, validate_split_data
from src.clean_data import write_data
from src.preprocessor import create_save_preprocessor

@click.command()
@click.option('--raw-data', type=str, help="Path to raw data")
@click.option('--cleaned-data', type=str, help="Path to cleaned data")
@click.option('--train-data-size', type=float, help="Proportion of the dataset to include in the train split")
@click.option('--data-to', type=str, help="Path to directory where processed data will be written to")
@click.option('--preprocessor-to', type=str, help="Path to directory where the preprocessor object will be written to")
@click.option('--seed', type=int, help="Random seed", default=123)

def main(raw_data, data_to, preprocessor_to, seed):
def main(cleaned_data, train_data_size, data_to, preprocessor_to, seed):
'''This script splits the cleaned data into train and test sets,
and then preprocesses the data to be used in exploratory data analysis.
It also saves the preprocessor to be used in the model training script.'''
np.random.seed(seed)
set_config(transform_output="pandas")

colnames = [
"id",
"class",
"mean_radius",
"mean_texture",
"mean_perimeter",
"mean_area",
"mean_smoothness",
"mean_compactness",
"mean_concavity",
"mean_concave_points",
"mean_symmetry",
"mean_fractal_dimension",
"se_radius",
"se_texture",
"se_perimeter",
"se_area",
"se_smoothness",
"se_compactness",
"se_concavity",
"se_concave_points",
"se_symmetry",
"se_fractal_dimension",
"max_radius",
"max_texture",
"max_perimeter",
"max_area",
"max_smoothness",
"max_compactness",
"max_concavity",
"max_concave_points",
"max_symmetry",
"max_fractal_dimension"
]

cancer = pd.read_csv(raw_data, names=colnames, header=None).drop(columns=['id'])
# re-label Class 'M' as 'Malignant', and Class 'B' as 'Benign'
cancer['class'] = cancer['class'].replace({
'M' : 'Malignant',
'B' : 'Benign'
})

# create the split
cancer_train, cancer_test = train_test_split(
cancer, train_size=0.70, stratify=cancer["class"]
)

cancer_train.to_csv(os.path.join(data_to, "cancer_train.csv"), index=False)
cancer_test.to_csv(os.path.join(data_to, "cancer_test.csv"), index=False)

cancer_preprocessor = make_column_transformer(
(StandardScaler(), make_column_selector(dtype_include='number')),
remainder='passthrough',
verbose_feature_names_out=False
)
pickle.dump(cancer_preprocessor, open(os.path.join(preprocessor_to, "cancer_preprocessor.pickle"), "wb"))

cancer_preprocessor.fit(cancer_train)
scaled_cancer_train = cancer_preprocessor.transform(cancer_train)
scaled_cancer_test = cancer_preprocessor.transform(cancer_test)

scaled_cancer_train.to_csv(os.path.join(data_to, "scaled_cancer_train.csv"), index=False)
scaled_cancer_test.to_csv(os.path.join(data_to, "scaled_cancer_test.csv"), index=False)
cleaned_data = pd.read_csv(cleaned_data)
cancer_train, cancer_test = split_train_test_data(cleaned_data, train_data_size,
                                                  stratify_by="diagnosis")
validate_split_data(cancer_train, cancer_test)

# make sure the output directory exists, then write both splits
os.makedirs(data_to, exist_ok=True)
write_data(cancer_train, data_to)
write_data(cancer_test, data_to)

os.makedirs(preprocessor_to, exist_ok=True)
create_save_preprocessor(cancer_train, cancer_test, data_to, preprocessor_to)

if __name__ == '__main__':
main()
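For reviewers who want to exercise the refactored command, here is a minimal in-process sketch using click's built-in test runner; the data paths are illustrative assumptions, and it presumes scripts/split_n_preprocess.py is importable from the repo root:

from click.testing import CliRunner
from scripts.split_n_preprocess import main

runner = CliRunner()
result = runner.invoke(main, [
    "--cleaned-data", "data/cleaned_cancer.csv",   # assumed path
    "--train-data-size", "0.7",
    "--data-to", "data/processed",                 # assumed output directory
    "--preprocessor-to", "results/models",         # assumed output directory
    "--seed", "123",
])
assert result.exit_code == 0, result.output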
21 changes: 21 additions & 0 deletions src/preprocessor.py
@@ -0,0 +1,21 @@
import os
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector

def create_save_preprocessor(train_data, test_data, data_to, preprocessor_to):
"""Create and save preprocess using train and test data"""
cancer_preprocessor = make_column_transformer(
(StandardScaler(), make_column_selector(dtype_include='number')),
remainder='passthrough',
verbose_feature_names_out=False
)

# save the (still unfitted) preprocessor; it is fit below only to produce the scaled CSVs
with open(os.path.join(preprocessor_to, "cancer_preprocessor.pickle"), "wb") as f:
    pickle.dump(cancer_preprocessor, f)

cancer_preprocessor.fit(train_data)
scaled_cancer_train = cancer_preprocessor.transform(train_data)
scaled_cancer_test = cancer_preprocessor.transform(test_data)

scaled_cancer_train.to_csv(os.path.join(data_to, "scaled_cancer_train.csv"), index=False)
scaled_cancer_test.to_csv(os.path.join(data_to, "scaled_cancer_test.csv"), index=False)
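Because create_save_preprocessor pickles the column transformer before fitting it, whatever loads the pickle gets an unfitted object. A minimal sketch of reloading it in a later training step, assuming the paths and file names used above:

import pickle
import pandas as pd

with open("results/models/cancer_preprocessor.pickle", "rb") as f:  # assumed path
    preprocessor = pickle.load(f)

cancer_train = pd.read_csv("data/processed/cancer_train.csv")  # assumed path
# the saved object is unfitted, so fit on the training split before transforming
scaled_train = preprocessor.fit_transform(cancer_train)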
73 changes: 73 additions & 0 deletions src/split_train_test.py
@@ -0,0 +1,73 @@
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import DatasetsSizeComparison, TrainTestSamplesMix, MultivariateDrift, LabelDrift, FeatureDrift

def split_train_test_data(cleaned_data, train_data_size, stratify_by=None):
"""Split train test data using cleaned data"""

# Ensure cleaned_data is a dataframe
if not isinstance(cleaned_data, pd.DataFrame):
raise TypeError("The cleaned_data must be a pandas DataFrame.")

# Ensure train_data_size is a valid proportion
if not (0 < train_data_size < 1):
raise ValueError("train_data_size must be a float between 0 and 1.")

# Handle stratify_by: accept a column name and resolve it to the column's values
if stratify_by is not None:
    if stratify_by not in cleaned_data.columns:
        raise KeyError(f"The column '{stratify_by}' does not exist in the cleaned_data.")
    stratify_by = cleaned_data[stratify_by]

# create the split
data_train, data_test = train_test_split(
    cleaned_data, train_size=train_data_size, stratify=stratify_by
)

return data_train, data_test
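A quick usage sketch for the splitter with a toy frame (column names are illustrative); passing the label column name keeps class proportions equal across the two splits:

import pandas as pd

toy = pd.DataFrame({
    "diagnosis": ["M", "B"] * 50,
    "mean_radius": range(100),
})
train, test = split_train_test_data(toy, train_data_size=0.7, stratify_by="diagnosis")
assert len(train) == 70 and len(test) == 30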

def validate_split_data(data_train, data_test):
"""Test split datasets using Deepchecks"""

# convert the train/test dataframes to Deepchecks Dataset format
# (the first column is the label, the remaining columns are features)
data_train = Dataset(data_train, features=data_train.columns[1:], label=data_train.columns[0])
data_test = Dataset(data_test, features=data_test.columns[1:], label=data_test.columns[0])

# Datasets Size Comparison Check
check_instance = (
DatasetsSizeComparison()
.add_condition_train_dataset_greater_or_equal_test()
.add_condition_test_train_size_ratio_greater_than(0.2)
)
data_size_comp = check_instance.run(data_train, data_test)
if not data_size_comp.passed_conditions():
    raise ValueError("Train/test size check failed: train must be at least as large as test, and the test/train ratio must exceed 0.2")

# Train Test Samples Mix Check
check = TrainTestSamplesMix().add_condition_duplicates_ratio_less_or_equal(0)
sample_mix_check = check.run(test_dataset=data_test, train_dataset=data_train)
if not sample_mix_check.passed_conditions():
raise ValueError("Data from Test dataset also present in Train dataset")

# Label Drift Check
check = LabelDrift().add_condition_drift_score_less_than(0.4)
label_drift_check = check.run(train_dataset=data_train, test_dataset=data_test)
# drift_score = label_drift_check.reduce_output()
if not label_drift_check.passed_conditions():
raise ValueError(f"Label drift score above threshold: 0.4")

# Feature Drift Check
check = FeatureDrift().add_condition_drift_score_less_than(0.4)
feature_drift_check = check.run(train_dataset=data_train, test_dataset=data_test)
# drift_score = feature_drift_check.reduce_output()
if not feature_drift_check.passed_conditions():
raise ValueError(f"Feature drift score above threshold: 0.4")

# Multivariate Drift Check
check = MultivariateDrift().add_condition_overall_drift_value_less_than(0.4)
multivariate_drift_check = check.run(train_dataset=data_train, test_dataset=data_test)
# drift_score = multivariate_drift_check.reduce_output()
if not multivariate_drift_check.passed_conditions():
raise ValueError(f"Multivariate drift score above threshold: 0.4")