
Commit

added two scripts
ttimbers committed Nov 28, 2023
1 parent d902d5e commit c90f289
Showing 9 changed files with 1,266 additions and 3 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -11,4 +11,5 @@ RUN conda install -y \
requests=2.31.0 \
notebook=7.0.6 \
pytest=7.4.3 \
-responses=0.24.1
+responses=0.24.1 \
+click=8.0.3
9 changes: 7 additions & 2 deletions README.md
@@ -79,8 +79,13 @@ Copy and paste that URL into your browser.
<img src="img/jupyter-container-web-app-launch-url.png" width=400>

3. To run the analysis,
-open `src/breast_cancer_predict_report.ipynb` in Jupyter Lab you just launched
-and under the "Kernel" menu click "Restart Kernel and Run All Cells...".
+enter the following commands in the terminal in the project root:
+
+```
+python scripts/download_data.py --url="https://archive.ics.uci.edu/static/public/15/breast+cancer+wisconsin+original.zip" --write-to="data/raw"
+python scripts/split_n_preprocess.py --raw_data=data/raw/wdbc.data --write_to=results
+```

#### Clean up

Binary file added results/cancer_preprocessor.pickle
Binary file not shown.
172 changes: 172 additions & 0 deletions results/cancer_test.csv

Large diffs are not rendered by default.

399 changes: 399 additions & 0 deletions results/cancer_train.csv

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions results/scaled_cancer_test.csv

Large diffs are not rendered by default.

399 changes: 399 additions & 0 deletions results/scaled_cancer_train.csv

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions scripts/download_data.py
@@ -0,0 +1,24 @@
# download_data.py
# author: Tiffany Timbers
# date: 2023-11-27

import click
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.read_zip import read_zip

@click.command()
@click.option('--url', type=str, help="URL of dataset to be downloaded")
@click.option('--write-to', type=str, help="Path to directory where raw data will be written to")

def main(url, write_to):
    """Downloads zip data from the web to a local filepath and extracts it."""
    try:
        read_zip(url, write_to)
    except Exception:
        # most likely the target directory does not exist yet; create it and retry
        os.makedirs(write_to)
        read_zip(url, write_to)

if __name__ == '__main__':
    main()
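The script imports `read_zip` from `src/read_zip.py`, which is not part of this commit. A minimal sketch of what such a helper could look like, assuming it fetches the archive with `requests` and extracts it with the standard-library `zipfile` module (the actual helper in `src/` may differ):

```
# sketch of a read_zip helper (the real src/read_zip.py may differ)
import io
import os
import zipfile

import requests


def read_zip(url, directory):
    """Download the zip file at `url` and extract its contents into `directory`."""
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError('The URL provided does not point to a downloadable file.')
    if not os.path.isdir(directory):
        raise FileNotFoundError('The directory provided does not exist.')
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(directory)
```

Raising when the directory is missing matches the retry logic in `main` above, which creates the directory and calls `read_zip` again.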
91 changes: 91 additions & 0 deletions scripts/split_n_preprocess.py
@@ -0,0 +1,91 @@
# split_n_preprocess.py
# author: Tiffany Timbers
# date: 2023-11-27

import click
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector
np.random.seed(522)
set_config(transform_output="pandas")

@click.command()
@click.option('--raw_data', type=str, help="Path to raw data")
@click.option('--write_to', type=str, help="Path to directory where processed data will be written to")

def main(raw_data, write_to):
    '''This script splits the raw data into train and test sets,
    and then preprocesses the data to be used in exploratory data analysis.
    It also saves the preprocessor to be used in the model training script.'''
    colnames = [
        "id",
        "class",
        "mean_radius",
        "mean_texture",
        "mean_perimeter",
        "mean_area",
        "mean_smoothness",
        "mean_compactness",
        "mean_concavity",
        "mean_concave_points",
        "mean_symmetry",
        "mean_fractal_dimension",
        "se_radius",
        "se_texture",
        "se_perimeter",
        "se_area",
        "se_smoothness",
        "se_compactness",
        "se_concavity",
        "se_concave_points",
        "se_symmetry",
        "se_fractal_dimension",
        "max_radius",
        "max_texture",
        "max_perimeter",
        "max_area",
        "max_smoothness",
        "max_compactness",
        "max_concavity",
        "max_concave_points",
        "max_symmetry",
        "max_fractal_dimension"
    ]

    cancer = pd.read_csv(raw_data, names=colnames, header=None).drop(columns=['id'])
    # re-label class 'M' as 'Malignant', and class 'B' as 'Benign'
    cancer['class'] = cancer['class'].replace({
        'M' : 'Malignant',
        'B' : 'Benign'
    })

    # create the train/test split
    cancer_train, cancer_test = train_test_split(
        cancer, train_size=0.70, stratify=cancer["class"]
    )

    cancer_train.to_csv(os.path.join(write_to, "cancer_train.csv"))
    cancer_test.to_csv(os.path.join(write_to, "cancer_test.csv"))

    # preprocessor: standardize all numeric columns, pass everything else through
    cancer_preprocessor = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include='number')),
        remainder='passthrough',
        verbose_feature_names_out=False
    )
    # save the (unfitted) preprocessor for the model training script
    with open(os.path.join(write_to, "cancer_preprocessor.pickle"), "wb") as f:
        pickle.dump(cancer_preprocessor, f)

    cancer_preprocessor.fit(cancer_train)
    scaled_cancer_train = cancer_preprocessor.transform(cancer_train)
    scaled_cancer_test = cancer_preprocessor.transform(cancer_test)

    scaled_cancer_train.to_csv(os.path.join(write_to, "scaled_cancer_train.csv"))
    scaled_cancer_test.to_csv(os.path.join(write_to, "scaled_cancer_test.csv"))

if __name__ == '__main__':
    main()
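The docstring above refers to a model training script that is not part of this commit. A minimal sketch of how such a downstream step might consume these outputs, assuming a hypothetical training script that loads the pickled preprocessor and chains it with a scikit-learn classifier (the file paths and the choice of model are illustrative, not from this commit):

```
# hypothetical downstream usage (not part of this commit)
import pickle

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# the splits were written with their index, so read it back as the index column
cancer_train = pd.read_csv("results/cancer_train.csv", index_col=0)

# load the unfitted preprocessor saved by split_n_preprocess.py
with open("results/cancer_preprocessor.pickle", "rb") as f:
    cancer_preprocessor = pickle.load(f)

# chain preprocessing and a classifier; fitting the pipeline fits both steps
knn_pipeline = make_pipeline(cancer_preprocessor, KNeighborsClassifier())
knn_pipeline.fit(
    cancer_train.drop(columns=["class"]),
    cancer_train["class"]
)
```

Because the pickle stores the preprocessor unfitted, the downstream script controls exactly which data it is fitted on.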
