Wl clean validate #5

Open · wants to merge 21 commits into main
Changes from 12 commits
25 changes: 13 additions & 12 deletions Dockerfile
@@ -3,16 +3,17 @@ FROM quay.io/jupyter/minimal-notebook:notebook-7.0.6

 # install necessary packages for analysis
 RUN conda install -y \
-    python=3.11.6 \
-    altair=5.1.2 \
-    pandas=2.1.2 \
-    ipykernel=6.26.0 \
-    scikit-learn=1.3.2 \
-    requests=2.31.0 \
-    notebook=7.0.6 \
-    pytest=7.4.3 \
-    responses=0.24.1 \
+    python=3.11.7 \
+    altair=5.4.1 \
+    pandas=1.5.3 \
+    ipykernel=6.29.5 \
+    scikit-learn=1.5.2 \
+    requests=2.32.3 \
+    notebook=7.0.8 \
+    pytest=8.3.3 \
+    responses=0.25.3 \
     click=8.1.7 \
-    vl-convert-python=1.1.0 \
-    jupyter-book=0.15.1 \
-    make
+    vl-convert-python=1.7.0 \
+    jupyter-book=1.0.3 \
+    make
+RUN pip install great-expectations==1.1.3
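Since every package above is pinned to an exact version, drift between the Dockerfile and the running environment is worth catching early. The sketch below (an editor's illustration, not part of the PR; the `pins` dict shows only a few of the pins) compares installed versions against expected ones using the standard library:

```python
# Hedged sketch: sanity-check that installed package versions match the
# Dockerfile pins. The pins dict is illustrative, not the full list.
from importlib.metadata import PackageNotFoundError, version

def find_pin_mismatches(pins):
    """Return {package: (expected, installed_or_None)} for any drifted pins."""
    mismatches = {}
    for pkg, expected in pins.items():
        try:
            installed = version(pkg)
        except PackageNotFoundError:
            installed = None  # package missing entirely
        if installed != expected:
            mismatches[pkg] = (expected, installed)
    return mismatches

# Example call: find_pin_mismatches({"pandas": "1.5.3", "click": "8.1.7"})
```

Running this inside the built image would flag any package whose installed version no longer matches its pin.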
32 changes: 32 additions & 0 deletions data/processed/data_config.csv
@@ -0,0 +1,32 @@
column,type,min,max
diagnosis,str,,
Review comment (Member): Should we include column labels in this config, or in another? It seems strange to only have numerical data in the config file.

mean_radius,float,6,40
mean_texture,float,9,50
mean_perimeter,float,40,260
mean_area,float,140,4300
mean_smoothness,float,0,1
mean_compactness,float,0,2
mean_concavity,float,0,2
mean_concave,float,0,1
mean_symmetry,float,0,1
mean_fractal,float,0,1
se_radius,float,0,3
se_texture,float,0,5
se_perimeter,float,0,22
se_area,float,6,550
se_smoothness,float,0,1
se_compactness,float,0,1
se_concavity,float,0,1
se_concave,float,0,1
se_symmetry,float,0,1
se_fractal,float,0,1
max_radius,float,6,40
max_texture,float,9,50
max_perimeter,float,40,260
max_area,float,140,4300
max_smoothness,float,0,1
max_compactness,float,0,2
max_concavity,float,0,2
max_concave,float,0,1
max_symmetry,float,0,1
max_fractal,float,0,1
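A quick way to sanity-check this config before building a schema is to load it with pandas and confirm every numeric bound is ordered. The sketch below (editor's illustration; it inlines a small slice of the rows above, in the `column,type,min,max` layout that `build_schema_from_csv` expects):

```python
import io
import pandas as pd

# A small slice of the config file, inlined for illustration.
config_csv = """column,type,min,max
diagnosis,str,,
mean_radius,float,6,40
mean_texture,float,9,50
"""
config = pd.read_csv(io.StringIO(config_csv))

# Every numeric row should have min strictly below max;
# rows with no bounds (like diagnosis) are skipped.
numeric = config.dropna(subset=["min", "max"])
assert (numeric["min"] < numeric["max"]).all()
```

The same check run over the full file would catch a swapped min/max pair before it silently produced an unsatisfiable schema.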
41 changes: 41 additions & 0 deletions scripts/clean_validate.py
@@ -0,0 +1,41 @@
# clean_validate.py
# author: Weilin Han
# date: 2024-10-20

import click
import os
import sys
import pandas as pd
import pandera as pa
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.clean_data import extract_column_name, read_raw_data, clean_data, write_data
from src.validate_data import build_schema_from_csv

@click.command()
@click.option('--raw-data-file', type=str, help="Path to raw data file")
@click.option('--name-file', type=str, help="Path to names file")
Review comment (Member): point to the file for names file, not just the directory
Review comment (Member): Oh, it's just the comment that is wrong, not the code.
@click.option('--write-to', type=str, help="Path to directory where cleaned data will be written to")

def main(raw_data_file, name_file, write_to):
    """Clean raw data and validate it."""
    # Extract column names from .names file
    colnames = extract_column_name(name_file)

    # Read raw data
    imported_data = read_raw_data(raw_data_file, colnames)

    # Remove the id column and relabel the diagnosis column
    cleaned_data = clean_data(imported_data)

    # Validate cleaned data
    # Load the CSV config file
    data_config_file = '/data/processed/data_config.csv'
Review comment (Member): data_config_file should be a command line argument; its value should be the path to this file.
    # define schema
    schema = build_schema_from_csv(data_config=data_config_file, expected_columns=colnames)
Review comment (Member), suggested change:
- schema = build_schema_from_csv(data_config=data_config_file,expected_columns=colnames)
+ schema = build_schema_from_csv(data_config=data_config_file, expected_columns=colnames)

    schema.validate(cleaned_data)

    # Write data to specified directory
    write_data(cleaned_data, write_to)

if __name__ == '__main__':
    main()
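Click commands like `main` can be exercised in-process with `click.testing.CliRunner`, which is handy for smoke-testing the option wiring. The toy command below mirrors the option names above; its echo body is a stand-in (the real script cleans, validates, and writes):

```python
import click
from click.testing import CliRunner

@click.command()
@click.option('--raw-data-file', type=str)
@click.option('--name-file', type=str)
@click.option('--write-to', type=str)
def toy_main(raw_data_file, name_file, write_to):
    # Stand-in body; the real script would clean, validate, and write.
    click.echo(f"clean {raw_data_file} -> {write_to}")

runner = CliRunner()
result = runner.invoke(
    toy_main,
    ["--raw-data-file", "wdbc.data", "--name-file", "wdbc.names", "--write-to", "out"],
)
```

`result.exit_code` and `result.output` can then be asserted on without spawning a subprocess.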
96 changes: 96 additions & 0 deletions src/clean_data.py
@@ -0,0 +1,96 @@
# clean_data.py
# author: Weilin Han
# date: 2024-10-15

import pandas as pd
import re
import os


def extract_column_name(raw_name_file):
Review comment (Member): I think this function does too much. A function should do just one thing. I would move the open command to the script (so read in the whole file) and then just have the regular expressions as what is modularized to the function.

"""Extract and clean column names from .names file."""

# Input Validation Checks 1: Ensure the raw name file exists, if not raise error
if not os.path.exists(raw_name_file):
raise FileNotFoundError(f"The raw_name file does not exist.")
Comment on lines +14 to +15
Review comment (Member): I am fairly certain that open returns FileNotFoundError: [Errno 2] No such file or directory: 'filename.txt' when the file doesn't exist, so we don't need this, nor the test for this? I think we have to be careful to not over-test for this "demonstration" of what good code should look like.

    # Extracting column names from downloaded raw file
    text_lines = []
    with open(raw_name_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line.startswith('#') and line:  # skip comments and blank lines
                text_lines.append(line)
    start = text_lines.index('7. Attribute information')
    end = text_lines.index('8. Missing attribute values: none')
    text_lines = text_lines[start:end]

    pattern = re.compile(r'^[1-9a-z]\)\s*')
    text_lines = [item for item in text_lines if pattern.match(item)]
    text_lines = [pattern.sub('', item) for item in text_lines]
    text_lines = [item.split()[0].lower() for item in text_lines]

    statistics = ['mean', 'se', 'max']
    # se is standard error; max is the worst or largest (mean of the three largest values)

    # please refer to the original file for an explanation of the features
    colnames = text_lines[0:2]
    for stat in statistics:
        for feature in text_lines[2:]:
            colnames.append(stat + '_' + feature)

    return colnames
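The review above suggests splitting file I/O from parsing. A minimal sketch of what the parsing-only helper could look like (the name `parse_attribute_names` is the editor's invention; the script would do the `open()` and pass lines in):

```python
import re

def parse_attribute_names(text_lines):
    """Pull attribute names out of pre-read .names lines.

    Sketch of the review suggestion: keep open() in the calling script
    and modularize only the regular-expression parsing.
    """
    pattern = re.compile(r'^[1-9a-z]\)\s*')  # lines like "1) radius ..." or "b) texture ..."
    items = [pattern.sub('', line) for line in text_lines if pattern.match(line)]
    # keep only the first word of each attribute description, lowercased
    return [item.split()[0].lower() for item in items]
```

This keeps the function pure (lines in, names out), which also makes it trivially testable without fixture files.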

def read_raw_data(raw_data, col_name):
    """Read data from .data file."""

    # Input Validation Check 1: Ensure the raw data file exists, if not raise error
    if not os.path.exists(raw_data):
        raise FileNotFoundError("The raw_data file does not exist.")

    # Input Validation Check 2: Ensure col_name is a list, if not raise error
    if not isinstance(col_name, list):
        raise TypeError("col_name must be a list.")

    # Input Validation Check 3: Ensure the list has 32 items, if not raise error
    if len(col_name) != 32:
        raise ValueError("col_name must contain exactly 32 items.")
Comment on lines +58 to +59
Review comment (Member): The magic numbers here are brittle and confusing to others who won't know where they come from. This is the number of columns in raw_data, right? If so, you can get this number from raw_data.shape[1].

    # Input Validation Check 4: Ensure the list only contains strings, if not raise error
    if not all(isinstance(item, str) for item in col_name):
        raise ValueError("col_name must only contain strings.")

    imported_data = pd.read_csv(raw_data, names=col_name, header=None)
    return imported_data
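Following the reviewer's point about the hard-coded 32, the column count can be derived from the file itself. A hedged sketch (the `_v2` name is the editor's, to keep it distinct from the PR's function):

```python
import pandas as pd

def read_raw_data_v2(raw_data, col_name):
    """Sketch of the review suggestion: derive the expected column count
    from the data itself instead of hard-coding 32."""
    # peek at the first row to learn how many columns the file actually has
    n_cols = pd.read_csv(raw_data, header=None, nrows=1).shape[1]
    if len(col_name) != n_cols:
        raise ValueError(f"col_name must contain exactly {n_cols} items.")
    return pd.read_csv(raw_data, names=col_name, header=None)
```

This removes the magic number while keeping the same error behaviour when the name list and the file disagree.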

def clean_data(imported_data, drop_columns=['id'], relabel={'M': 'Malignant', 'B': 'Benign'}):
    """Clean imported data."""
    # Input Validation Check 1: Ensure imported_data is a dataframe
    if not isinstance(imported_data, pd.DataFrame):
        raise TypeError("imported_data must be a data frame.")

    # Input Validation Check 2: Ensure drop_columns is a list
    if not isinstance(drop_columns, list):
        raise TypeError("drop_columns must be a list.")

    # Input Validation Check 3: Ensure relabel is a dictionary
    if not isinstance(relabel, dict):
        raise TypeError("relabel must be a dictionary")

    cleaned_data = imported_data.drop(columns=drop_columns)
    cleaned_data['diagnosis'] = cleaned_data['diagnosis'].replace(relabel)
    return cleaned_data
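With its default arguments, `clean_data` boils down to two pandas calls; a toy run (the data values below are invented for illustration):

```python
import pandas as pd

raw = pd.DataFrame({
    "id": [1, 2],
    "diagnosis": ["M", "B"],
    "mean_radius": [14.1, 12.3],  # illustrative values, not real measurements
})

# what clean_data does with its defaults: drop 'id', relabel 'diagnosis'
cleaned = raw.drop(columns=["id"])
cleaned["diagnosis"] = cleaned["diagnosis"].replace({"M": "Malignant", "B": "Benign"})
```

After this, `cleaned` has columns `diagnosis` and `mean_radius`, with the single-letter labels expanded.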

def write_data(cleaned_data, data_to):
    """Write cleaned and validated data to a directory."""
    # Input Validation Check 1: Ensure cleaned_data is a dataframe, if not raise an error
    if not isinstance(cleaned_data, pd.DataFrame):
        raise TypeError("cleaned_data must be a data frame.")

    # Input Validation Check 2: Check the directory path exists, if not raise an error
    if not os.path.exists(data_to):
        raise FileNotFoundError('The directory provided does not exist.')
Comment on lines +93 to +94
Review comment (Member, @ttimbers, Dec 12, 2024): If you don't have this in your function, will Python's .to_csv give you a FileNotFoundError or some other path error anyways? If so, we should remove this as it has already been handled by os.path.exists.

    # Input Validation Check 3: Check the directory path provided is a directory, if not raise an error
    if not os.path.isdir(data_to):
        raise NotADirectoryError('The directory path provided is not a directory, it is an existing file path. Please provide a path to a new, or existing directory.')

    # Write out the cleaned data (the file name used here is an assumption;
    # adjust to the project's naming convention)
    cleaned_data.to_csv(os.path.join(data_to, 'cleaned_data.csv'), index=False)
63 changes: 63 additions & 0 deletions src/validate_data.py
@@ -0,0 +1,63 @@
# validate_data.py
# author: Weilin Han
# date: 2024-10-03

import pandas as pd
import pandera as pa
import os

# Function to build schema from the config file
def build_schema_from_csv(data_config, expected_columns):
    """Build a schema to validate data using pandera."""

    # Input Validation Check 1: Ensure the data_config file exists, if not raise error
    if not os.path.exists(data_config):
        raise FileNotFoundError("The data_config file does not exist.")

    config_df = pd.read_csv(data_config)

    # Ensure the dataframe has exactly four columns: column, type, min, max
    required_columns = ['column', 'type', 'min', 'max']
    if list(config_df.columns) != required_columns:
        raise ValueError("The configuration file must have exactly four columns: 'column', 'type', 'min', 'max'.")

    # Ensure the values of 'column' match the column names extracted from the name file
    if expected_columns is not None:
        actual_columns = config_df['column'].str.strip("'").tolist()  # clean up stray quotation marks
        if set(actual_columns) != set(expected_columns):
            raise ValueError("Column names in the config file do not match the expected columns.")

    schema_dict = {}

    # Loop through each row in the config DataFrame
    for _, row in config_df.iterrows():
        column_name = row['column'].strip()  # remove potential extra spaces
        column_type = row['type'].strip()
        min_value = row['min'] if pd.notna(row['min']) else None
        max_value = row['max'] if pd.notna(row['max']) else None

        # Map the config type string to a Pandera data type
        if column_type == 'int':
            dtype = pa.Int
        elif column_type == 'float':
            dtype = pa.Float
        elif column_type == 'str':
            dtype = pa.String
        else:
            raise ValueError(f"Unsupported column type: {column_type}")
Comment on lines +40 to +48
Review comment (Member): None of this is needed. Pandera's schemas work with int, or float, or str. See an example here: https://github.com/ttimbers/breast-cancer-predictor/blob/206d1c2ba56583e87dbc359538c007698df4772c/src/validate_data.py#L45

        # Create validation checks
        checks = []
        if min_value is not None:
            checks.append(pa.Check.greater_than_or_equal_to(float(min_value)))
        if max_value is not None:
            checks.append(pa.Check.less_than_or_equal_to(float(max_value)))

        # Add the column schema to the schema dictionary
        schema_dict[column_name] = pa.Column(dtype, checks=checks, nullable=False)

    # Return the DataFrameSchema object
    return pa.DataFrameSchema(schema_dict)
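Per the review, pandera's `pa.Column` accepts the Python builtins `int`, `float`, and `str` directly, so the if/elif ladder above could collapse to a lookup. A sketch of that simplification (the `resolve_dtype` helper and `DTYPES` dict are the editor's names, shown without pandera so it stands alone):

```python
# Sketch: map config type strings straight to the builtins pandera accepts,
# e.g. pa.Column(resolve_dtype(row['type']), checks=checks, nullable=False).
DTYPES = {"int": int, "float": float, "str": str}

def resolve_dtype(column_type):
    """Return the builtin type for a config type string, or raise ValueError."""
    try:
        return DTYPES[column_type.strip()]
    except KeyError:
        raise ValueError(f"Unsupported column type: {column_type}")
```

The dict lookup keeps the same unsupported-type error while making the supported set a one-line table.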



107 changes: 107 additions & 0 deletions tests/test_clean_data.py
@@ -0,0 +1,107 @@
import pytest
import pandas as pd
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.clean_data import *

# Test files setup
col_name1 = ["col" + str(i) for i in range(32)] # 32 strings
col_name2 = {"1":"apple"}
col_name3 = ['1','2','3']
col_name4 = ["col" + str(i) for i in range(31)] + [123] # 31 strings + 1 integer

imported_data1 = pd.DataFrame({
'id': [1, 2, 3],
'class': ['M', 'B', 'M']
})
imported_data2 = [1, 2, 3, 4, 5]
drop_columns1=['id']
drop_columns2={'1':'id'}
relabel1={'M' : 'Malignant','B' : 'Benign'}
relabel2=['M','B']

cleaned_data1 = pd.DataFrame({
'id': [1, 2, 3],
'class': ['M', 'B', 'M']
})
cleaned_data2 = [1, 2, 3, 4, 5]
# set up an empty directory for the write_data tests
if not os.path.exists('tests/test_write_data1'):
    os.makedirs('tests/test_write_data1')

# Tests

# Tests for extract_column_name

# test extract_column_name function throws an error
# if the raw name file does not exist
def test_extract_column_name_error_on_missing_file():
    with pytest.raises(FileNotFoundError, match='The raw_name file does not exist.'):
        extract_column_name('tests/test_name_data.name')

# Tests for read_raw_data

# test read_raw_data function throws an error
# if the raw data file does not exist
def test_read_raw_data_error_on_missing_file():
    with pytest.raises(FileNotFoundError, match='The raw_data file does not exist.'):
        read_raw_data('tests/test_raw_data.data', col_name1)

# test read_raw_data function throws an error
# if col_name is not a list
def test_read_raw_data_error_on_non_list():
    with pytest.raises(TypeError, match="col_name must be a list."):
        read_raw_data('tests/test_wdbc.data', col_name2)

# test read_raw_data function throws an error
# if col_name does not have 32 values
def test_read_raw_data_error_on_insufficient_list_item():
    with pytest.raises(ValueError, match="col_name must contain exactly 32 items."):
        read_raw_data('tests/test_wdbc.data', col_name3)

# test read_raw_data function throws an error
# if col_name contains items other than strings
def test_read_raw_data_error_on_wrong_item_type():
    with pytest.raises(ValueError, match="col_name must only contain strings."):
        read_raw_data('tests/test_wdbc.data', col_name4)

# Tests for clean_data

# test clean_data function throws an error
# if imported_data is not a dataframe
def test_clean_data_error_on_wrong_imported_data_format():
    with pytest.raises(TypeError, match="imported_data must be a data frame."):
        clean_data(imported_data2, drop_columns1, relabel1)

# test clean_data function throws an error
# if drop_columns is not a list
def test_clean_data_error_on_wrong_drop_columns_format():
    with pytest.raises(TypeError, match="drop_columns must be a list."):
        clean_data(imported_data1, drop_columns2, relabel1)

# test clean_data function throws an error
# if relabel is not a dictionary
def test_clean_data_error_on_wrong_relabel_format():
    with pytest.raises(TypeError, match="relabel must be a dictionary"):
        clean_data(imported_data1, drop_columns1, relabel2)

# Tests for write_data

# test write_data function throws an error
# if cleaned_data is not a dataframe
def test_write_data_error_on_wrong_cleaned_data_format():
    with pytest.raises(TypeError, match="cleaned_data must be a data frame."):
        write_data(cleaned_data2, 'tests/test_write_data1')

# test write_data function throws an error
# if the write_to path provided does not exist
def test_write_data_error_on_nonexistent_dir():
    with pytest.raises(FileNotFoundError, match='The directory provided does not exist.'):
        write_data(cleaned_data1, 'tests/test_write_data3')

# test write_data function throws an error
# if the path provided is an existing file, not a directory
def test_write_data_error_on_non_directory_path():
    with pytest.raises(NotADirectoryError, match='The directory path provided is not a directory'):
        write_data(cleaned_data1, 'tests/conftest.py')
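A hedged suggestion on the test setup: pytest's `tmp_path` fixture would remove the need for the import-time `os.makedirs('tests/test_write_data1')` above, since each test gets a fresh, auto-cleaned directory. A sketch (using a plain `to_csv` as a stand-in for the write step):

```python
import pandas as pd

# Sketch (assumption): with tmp_path, no shared on-disk fixture directory
# is needed; pytest injects a unique pathlib.Path per test.
def test_write_creates_csv(tmp_path):
    out = tmp_path / "cleaned.csv"
    pd.DataFrame({"diagnosis": ["Malignant"]}).to_csv(out, index=False)
    assert out.exists()
```

This also avoids test-order coupling, since nothing persists between runs.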