
Commit

added two scripts
ttimbers committed Nov 28, 2023
1 parent d902d5e commit c90f289
Showing 9 changed files with 1,266 additions and 3 deletions.
3 changes: 2 additions & 1 deletion Dockerfile
@@ -11,4 +11,5 @@ RUN conda install -y \
requests=2.31.0 \
notebook=7.0.6 \
pytest=7.4.3 \
-responses=0.24.1
+responses=0.24.1 \
+click=8.0.3
9 changes: 7 additions & 2 deletions README.md
@@ -79,8 +79,13 @@ Copy and paste that URL into your browser.
<img src="img/jupyter-container-web-app-launch-url.png" width=400>

3. To run the analysis,
-open `src/breast_cancer_predict_report.ipynb` in Jupyter Lab you just launched
-and under the "Kernel" menu click "Restart Kernel and Run All Cells...".
+enter the following commands in the terminal in the project root:
+
+```
+python scripts/download_data.py --url="https://archive.ics.uci.edu/static/public/15/breast+cancer+wisconsin+original.zip" --write-to="data/raw"
+python scripts/split_n_preprocess.py --raw_data=data/raw/wdbc.data --write_to=results
+```

#### Clean up

Binary file added results/cancer_preprocessor.pickle
Binary file not shown.
172 changes: 172 additions & 0 deletions results/cancer_test.csv

Large diffs are not rendered by default.

399 changes: 399 additions & 0 deletions results/cancer_train.csv

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions results/scaled_cancer_test.csv

Large diffs are not rendered by default.

399 changes: 399 additions & 0 deletions results/scaled_cancer_train.csv

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions scripts/download_data.py
@@ -0,0 +1,24 @@
# download_data.py
# author: Tiffany Timbers
# date: 2023-11-27

import click
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.read_zip import read_zip

@click.command()
@click.option('--url', type=str, help="URL of dataset to be downloaded")
@click.option('--write-to', type=str, help="Path to directory where raw data will be written to")

def main(url, write_to):
    """Downloads zip data from the web to a local filepath and extracts it."""
    try:
        read_zip(url, write_to)
    except Exception:
        # most likely the target directory does not exist yet; create it and retry
        os.makedirs(write_to)
        read_zip(url, write_to)

if __name__ == '__main__':
    main()
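The script imports `read_zip` from `src/read_zip.py`, which is not part of this commit. A minimal sketch of what such a helper could look like, assuming it fetches the archive with `requests` and extracts it with the standard-library `zipfile` module (the actual helper in `src/` may differ):

```
# sketch of a read_zip helper (the real src/read_zip.py may differ)
import io
import os
import zipfile

import requests


def read_zip(url, directory):
    """Download the zip file at `url` and extract its contents into `directory`."""
    response = requests.get(url)
    if response.status_code != 200:
        raise ValueError('The URL provided does not point to a downloadable file.')
    if not os.path.isdir(directory):
        raise FileNotFoundError('The directory provided does not exist.')
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(directory)
```

Raising when the directory is missing matches the retry logic in `main` above, which creates the directory and calls `read_zip` again.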
91 changes: 91 additions & 0 deletions scripts/split_n_preprocess.py
@@ -0,0 +1,91 @@
# split_n_preprocess.py
# author: Tiffany Timbers
# date: 2023-11-27

import click
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector
np.random.seed(522)
set_config(transform_output="pandas")

@click.command()
@click.option('--raw_data', type=str, help="Path to raw data")
@click.option('--write_to', type=str, help="Path to directory where processed data will be written to")

def main(raw_data, write_to):
    '''This script splits the raw data into train and test sets,
    and then preprocesses the data to be used in exploratory data analysis.
    It also saves the preprocessor to be used in the model training script.'''
    colnames = [
        "id",
        "class",
        "mean_radius",
        "mean_texture",
        "mean_perimeter",
        "mean_area",
        "mean_smoothness",
        "mean_compactness",
        "mean_concavity",
        "mean_concave_points",
        "mean_symmetry",
        "mean_fractal_dimension",
        "se_radius",
        "se_texture",
        "se_perimeter",
        "se_area",
        "se_smoothness",
        "se_compactness",
        "se_concavity",
        "se_concave_points",
        "se_symmetry",
        "se_fractal_dimension",
        "max_radius",
        "max_texture",
        "max_perimeter",
        "max_area",
        "max_smoothness",
        "max_compactness",
        "max_concavity",
        "max_concave_points",
        "max_symmetry",
        "max_fractal_dimension"
    ]

    cancer = pd.read_csv(raw_data, names=colnames, header=None).drop(columns=['id'])
    # re-label class 'M' as 'Malignant', and class 'B' as 'Benign'
    cancer['class'] = cancer['class'].replace({
        'M' : 'Malignant',
        'B' : 'Benign'
    })

    # create the train/test split
    cancer_train, cancer_test = train_test_split(
        cancer, train_size=0.70, stratify=cancer["class"]
    )

    cancer_train.to_csv(os.path.join(write_to, "cancer_train.csv"))
    cancer_test.to_csv(os.path.join(write_to, "cancer_test.csv"))

    # preprocessor: standardize all numeric columns, pass everything else through
    cancer_preprocessor = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include='number')),
        remainder='passthrough',
        verbose_feature_names_out=False
    )
    # save the (unfitted) preprocessor for the model training script
    with open(os.path.join(write_to, "cancer_preprocessor.pickle"), "wb") as f:
        pickle.dump(cancer_preprocessor, f)

    cancer_preprocessor.fit(cancer_train)
    scaled_cancer_train = cancer_preprocessor.transform(cancer_train)
    scaled_cancer_test = cancer_preprocessor.transform(cancer_test)

    scaled_cancer_train.to_csv(os.path.join(write_to, "scaled_cancer_train.csv"))
    scaled_cancer_test.to_csv(os.path.join(write_to, "scaled_cancer_test.csv"))

if __name__ == '__main__':
    main()
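The docstring above refers to a model training script that is not part of this commit. A minimal sketch of how such a downstream step might consume these outputs, assuming a hypothetical training script that loads the pickled preprocessor and chains it with a scikit-learn classifier (the file paths and the choice of model are illustrative, not from this commit):

```
# hypothetical downstream usage (not part of this commit)
import pickle

import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# the splits were written with their index, so read it back as the index column
cancer_train = pd.read_csv("results/cancer_train.csv", index_col=0)

# load the unfitted preprocessor saved by split_n_preprocess.py
with open("results/cancer_preprocessor.pickle", "rb") as f:
    cancer_preprocessor = pickle.load(f)

# chain preprocessing and a classifier; fitting the pipeline fits both steps
knn_pipeline = make_pipeline(cancer_preprocessor, KNeighborsClassifier())
knn_pipeline.fit(
    cancer_train.drop(columns=["class"]),
    cancer_train["class"]
)
```

Because the pickle stores the preprocessor unfitted, the downstream script controls exactly which data it is fitted on.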
