Wl clean validate #5

Open · wants to merge 21 commits into base: main
Changes from 11 commits (21 commits total)
25 changes: 13 additions & 12 deletions Dockerfile
@@ -3,16 +3,17 @@ FROM quay.io/jupyter/minimal-notebook:notebook-7.0.6

 # install necessary packages for analysis
 RUN conda install -y \
-    python=3.11.6 \
-    altair=5.1.2 \
-    pandas=2.1.2 \
-    ipykernel=6.26.0 \
-    scikit-learn=1.3.2 \
-    requests=2.31.0 \
-    notebook=7.0.6 \
-    pytest=7.4.3 \
-    responses=0.24.1 \
+    python=3.11.7 \
+    altair=5.4.1 \
+    pandas=1.5.3 \
+    ipykernel=6.29.5 \
+    scikit-learn=1.5.2 \
+    requests=2.32.3 \
+    notebook=7.0.8 \
+    pytest=8.3.3 \
+    responses=0.25.3 \
     click=8.1.7 \
-    vl-convert-python=1.1.0 \
-    jupyter-book=0.15.1 \
-    make
+    vl-convert-python=1.7.0 \
+    jupyter-book=1.0.3 \
+    make
+RUN pip install great-expectations==1.1.3
95 changes: 95 additions & 0 deletions scripts/clean_validate.py
@@ -0,0 +1,95 @@
# clean_validate.py
# author: Weilin Han
# date: 2024-10-20

import click
import os
import sys
import re
import pandas as pd
import great_expectations as gx
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.clean_data import *
from src.validate_data import *
ttimbers marked this conversation as resolved.

@click.command()
@click.option('--raw-data', type=str, help="Path to directory where raw data resides")
Member:
I think this should be the filename, not the path. What if the filename changes? Then this script would break.

Member:
try:

@click.option('--raw-data-file', type=str, help="Path to raw data file")

@click.option('--name-file', type=str, help="Path to dirctory where names file resides")  # [sic: "dirctory" — see review thread below]
Member:
point to the file for names file, not just the directory

Member:
Oh, it's just the comment that is wrong, not the code.

@click.option('--write-to', type=str, help="Path to directory where cleaned data will be written to")

def main(raw_data, name_file, write_to):
    """Clean raw data and validate it."""
    # Extract column names from .names file
    colnames = extract_column_name(name_file)

    # Read raw data
    imported_data = read_raw_data(raw_data, colnames)

    # Remove id column and relabel diagnosis column
    cleaned_data = clean_data(imported_data)

    # Create Great Expectations batch object for data validation
    batch = create_data_batch(cleaned_data)

    # Validate cleaned data
    # Validates that the dataframe contains specific columns.
    exp_column_exsist(batch, colnames)

    # Validates that the distinct values in specified columns of a dataset contain the expected values.
    col_set = {
        'diagnosis': ['Malignant', 'Benign']
    }
    exp_value_set_in_col(batch, col_set)

    # Validates that the type of values of each specified column is correct.
    col_type = {'diagnosis': 'string'}
    for key in colnames[2:]:
        col_type[key] = 'number'
    exp_type_of_col_values(batch, col_type)

    # Validates that the number of nulls in specific columns is within the tolerable limit.
    # For numerical columns, no more than 10% of values may be null.
    col_percent = {'diagnosis': 0}
    for key in colnames[2:]:
        col_percent[key] = 0.1
    exp_col_not_null(batch, col_percent)

    # Validates that the numeric values of specified columns are within the specified range.
    col_range = {
        'mean_radius': [6, 30, False, False],
        'mean_texture': [9, 40, False, False],
        'mean_perimeter': [40, 200, False, False],
        'mean_area': [140, 2510, False, False],
        'mean_smoothness': [0, 1, False, False],
        'mean_compactness': [0, 1, False, False],
        'mean_concavity': [0, 1, False, False],
        'mean_concave': [0, 1, False, False],
        'mean_symmetry': [0, 1, False, False],
        'mean_fractal': [0, 1, False, False],
        'se_radius': [0, 3, False, False],
        'se_texture': [0, 5, False, False],
        'se_perimeter': [0, 22, False, False],
        'se_area': [6, 550, False, False],
        'se_smoothness': [0, 1, False, False],
        'se_compactness': [0, 1, False, False],
        'se_concavity': [0, 1, False, False],
        'se_concave': [0, 1, False, False],
        'se_symmetry': [0, 1, False, False],
        'se_fractal': [0, 1, False, False],
        'max_radius': [7, 40, False, False],
        'max_texture': [12, 50, False, False],
        'max_perimeter': [50, 260, False, False],
        'max_area': [180, 4300, False, False],
        'max_smoothness': [0, 1, False, False],
        'max_compactness': [0, 2, False, False],
        'max_concavity': [0, 2, False, False],
        'max_concave': [0, 1, False, False],
        'max_symmetry': [0, 1, False, False],
        'max_fractal': [0, 1, False, False]
    }
Member:
When I see something this long, and with values like this hard-coded into a script I start to think how we can do this better... One thing is that the range of possible values is similar across most measures, whether it's the mean or max (which makes sense). And even for some se's.... Another is the repeat of all the Falses. Finally, if we wanted to update these values, editing the script feels wrong... I feel like a config file or a data file might be best?

Member:
Do the other expectations have similar large amounts of data or is it just this one?
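Acting on the config-file suggestion above could look like the sketch below (not part of the PR; the `load_col_range` name and the `column,low,high` layout are assumptions). The bounds live in one small table, the repeated `False` flags become a shared default, and only a few rows are shown for illustration — in practice the table would sit in its own `.csv` file rather than an in-memory string:

```python
import io
import pandas as pd

# Stand-in for a hypothetical value_ranges.csv config file.
# Strict/non-strict bounds are uniform here, so the two False
# flags become a shared default instead of 30 repeats.
RANGES_CSV = io.StringIO("""column,low,high
mean_radius,6,30
mean_texture,9,40
se_radius,0,3
max_radius,7,40
""")

def load_col_range(source, strict_min=False, strict_max=False):
    """Build the {column: [low, high, strict_min, strict_max]} mapping
    that exp_value_range expects from a tabular config."""
    ranges = pd.read_csv(source)
    return {
        row.column: [int(row.low), int(row.high), strict_min, strict_max]
        for row in ranges.itertuples(index=False)
    }

col_range = load_col_range(RANGES_CSV)
print(col_range['mean_radius'])  # [6, 30, False, False]
```

Updating a bound then means editing one row of data instead of the script itself.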

    exp_value_range(batch, col_range)

    write_data(cleaned_data, write_to)

if __name__ == '__main__':
    main()
104 changes: 104 additions & 0 deletions src/clean_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# clean_data.py
# author: Weilin Han
# date: 2024-10-15

import pandas as pd
import re
import os


def extract_column_name(raw_name_file):
Member:
I think this function does too much. A function should do just one thing. I would move the open command to the script (so read in the whole file) and then just have the regular expressions as what is modularized to the function.

"""Extract and clean column names from .names file."""

# Test 1: Ensure the raw name file exists, if not raise error
ttimbers (Member), Oct 22, 2024:
These are not "tests" per se... I would refer to them as "input validation", "input validation checks" or "defensive programming" in general. And here, probably "input validation checks". This comment applies to all the comments you have named this way.

    if not os.path.exists(raw_name_file):
        raise FileNotFoundError("The raw_name file does not exist.")
Member, on lines +14 to +15:

I am fairly certain that open returns FileNotFoundError: [Errno 2] No such file or directory: 'filename.txt' when the file doesn't exist, so we don't need this, nor the test for this? I think we have to be careful to not over-test for this "demonstration" of what good code should look like.


    # Test 2: Ensure the raw name file is a .names file, if not raise error
    if not raw_name_file.endswith('.names'):
        raise ValueError("The raw_name file must be a .names file.")
Member:
I don't think we need to be so strict here. Any file name could be fine, it's more important that it exists and that content that matters.


    # Extracting column names from downloaded raw file
    text_lines = []
    with open(raw_name_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line.startswith('#') and line:  # Skip comments and blank lines
                text_lines.append(line)
    start = text_lines.index('7. Attribute information')
    end = text_lines.index('8. Missing attribute values: none')
    text_lines = text_lines[start:end]

    pattern = re.compile(r'^[1-9a-z]\)\s*')
    text_lines = [item for item in text_lines if pattern.match(item)]
    text_lines = [pattern.sub('', item) for item in text_lines]
    text_lines = [item.split()[0].lower() for item in text_lines]

    statistics = ['mean', 'se', 'max']
    # se is standard error, and max is the worst or largest (mean of the three largest values)

    # please refer to the original file for an explanation of the features
    colnames = text_lines[0:2]
    for stat in statistics:
        for feature in text_lines[2:]:
            colnames.append(stat + '_' + feature)
ttimbers marked this conversation as resolved.

    return colnames
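The split the reviewer describes above might look like the following sketch (the `parse_column_names` name and the toy input are hypothetical, not part of the PR): the calling script keeps the file I/O, and the function becomes a pure parser over a list of lines, which also makes it easy to unit test without touching the filesystem:

```python
import re

def parse_column_names(text_lines):
    """Pure function: turn the attribute-information lines of a .names
    file (already read into a list of stripped strings) into column names."""
    start = text_lines.index('7. Attribute information')
    end = text_lines.index('8. Missing attribute values: none')
    section = text_lines[start:end]

    # Keep only enumerated attribute lines like "1) ..." or "a) ..."
    pattern = re.compile(r'^[1-9a-z]\)\s*')
    features = [pattern.sub('', item).split()[0].lower()
                for item in section if pattern.match(item)]

    colnames = features[0:2]  # id and diagnosis
    for stat in ['mean', 'se', 'max']:
        for feature in features[2:]:
            colnames.append(stat + '_' + feature)
    return colnames

# The caller (e.g. the script) handles opening the file; here a toy input:
lines = [
    '7. Attribute information',
    '1) ID number',
    '2) Diagnosis (M = malignant, B = benign)',
    'a) radius (mean of distances)',
    'b) texture (standard deviation)',
    '8. Missing attribute values: none',
]
print(parse_column_names(lines))
# ['id', 'diagnosis', 'mean_radius', 'mean_texture',
#  'se_radius', 'se_texture', 'max_radius', 'max_texture']
```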

def read_raw_data(raw_data, col_name):
    """Read data from .data file."""

    # Test 1: Ensure the raw data file exists, if not raise error
    if not os.path.exists(raw_data):
        raise FileNotFoundError("The raw_data file does not exist.")

    # Test 2: Ensure the raw_data file's extension is .data, if not raise error
    if not raw_data.endswith('.data'):
        raise ValueError("The raw_data file must be a .data file.")

    # Test 3: Ensure the col_name is a list, if not raise error
    if not isinstance(col_name, list):
        raise TypeError("col_name must be a list.")

    # Test 4: Ensure the list has 32 items, if not raise error
    if len(col_name) != 32:
        raise ValueError("col_name must contain exactly 32 items.")
Member, on lines +58 to +59:
The magic numbers here are a brittle and confusing to others that won't know where they come from. This is the number of columns in raw_data, right, and so you can get this number from raw_data.shape[1].


    # Test 5: Ensure the list only contains strings, if not raise error
    if not all(isinstance(item, str) for item in col_name):
        raise ValueError("col_name must only contain strings.")

    imported_data = pd.read_csv(raw_data, names=col_name, header=None)
    return imported_data

def clean_data(imported_data, drop_columns=['id'], relabel={'M': 'Malignant', 'B': 'Benign'}):
    """Clean imported data"""
    # Test 1: Ensure the imported_data is a dataframe
    if not isinstance(imported_data, pd.DataFrame):
        raise TypeError("imported_data must be a data frame.")

    # Test 2: Ensure the drop_columns is a list
    if not isinstance(drop_columns, list):
        raise TypeError("drop_columns must be a list.")

    # Test 3: Ensure the relabel is a dictionary
    if not isinstance(relabel, dict):
        raise TypeError("relabel must be a dictionary")

    cleaned_data = imported_data.drop(columns=drop_columns)
    cleaned_data['diagnosis'] = cleaned_data['diagnosis'].replace(relabel)
    return cleaned_data
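For readers following along, here is a toy run of `clean_data`'s core behaviour (the function body is repeated minus the input checks so the example is self-contained; the sample values are made up):

```python
import pandas as pd

def clean_data(imported_data, drop_columns=['id'],
               relabel={'M': 'Malignant', 'B': 'Benign'}):
    # Core of the clean_data function above, without the input
    # validation, repeated here so this example runs on its own.
    cleaned_data = imported_data.drop(columns=drop_columns)
    cleaned_data['diagnosis'] = cleaned_data['diagnosis'].replace(relabel)
    return cleaned_data

# Hypothetical two-row sample in the shape of the raw data
toy = pd.DataFrame({
    'id': [8510426, 8510653],
    'diagnosis': ['M', 'B'],
    'mean_radius': [13.54, 13.08],
})
result = clean_data(toy)
print(list(result.columns))       # ['diagnosis', 'mean_radius']
print(list(result['diagnosis']))  # ['Malignant', 'Benign']
```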

def write_data(cleaned_data, data_to):
    """Write cleaned and validated data to directory"""
    # Test 1: Ensure the cleaned_data is a dataframe, if not raise an error
    if not isinstance(cleaned_data, pd.DataFrame):
        raise TypeError("cleaned_data must be a data frame.")

    # Test 2: check if the directory path exists, if not raise an error
    if not os.path.exists(data_to):
        raise FileNotFoundError('The directory provided does not exist.')
ttimbers (Member), Dec 12, 2024, on lines +93 to +94:
If you don't have this in your function, will Python's .to_csv give you a FileNotFoundError or some other path error anyways? If so, we should remove this as it has already been handled by os.path.exists.
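The reviewer's hunch is easy to check: `to_csv` into a missing directory raises an `OSError` on its own (the exact subclass and message vary by pandas version), so the `os.path.exists` guard duplicates built-in behaviour. A quick sketch:

```python
import pandas as pd

df = pd.DataFrame({'diagnosis': ['Malignant'], 'mean_radius': [13.54]})
try:
    # 'no_such_dir' is a deliberately nonexistent directory
    df.to_csv('no_such_dir/cleaned.csv', index=False)
except OSError as err:
    # An OSError (or a subclass such as FileNotFoundError) arrives
    # without any explicit existence check in our own code.
    print(type(err).__name__)
```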


    # Test 3: check if the directory path provided is a directory, if not raise an error
    if not os.path.isdir(data_to):
        raise NotADirectoryError('The directory path provided is not a directory, it is an existing file path. Please provide a path to a new, or existing directory.')