Wl clean validate #5
base: main
Changes from 12 commits
data/processed/data_config.csv
@@ -0,0 +1,32 @@
column,type,min,max
diagnosis,str,,
mean_radius,float,6,40
mean_texture,float,9,50
mean_perimeter,float,40,260
mean_area,float,140,4300
mean_smoothness,float,0,1
mean_compactness,float,0,2
mean_concavity,float,0,2
mean_concave,float,0,1
mean_symmetry,float,0,1
mean_fractal,float,0,1
se_radius,float,0,3
se_texture,float,0,5
se_perimeter,float,0,22
se_area,float,6,550
se_smoothness,float,0,1
se_compactness,float,0,1
se_concavity,float,0,1
se_concave,float,0,1
se_symmetry,float,0,1
se_fractal,float,0,1
max_radius,float,6,40
max_texture,float,9,50
max_perimeter,float,40,260
max_area,float,140,4300
max_smoothness,float,0,1
max_compactness,float,0,2
max_concavity,float,0,2
max_concave,float,0,1
max_symmetry,float,0,1
max_fractal,float,0,1
@@ -0,0 +1,41 @@
# clean_validate.py
# author: Weilin Han
# date: 2024-10-20

import click
import os
import sys
import pandas as pd
import pandera as pa
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.clean_data import extract_column_name, read_raw_data, clean_data, write_data
from src.validate_data import build_schema_from_csv

@click.command()
@click.option('--raw-data-file', type=str, help="Path to raw data file")
@click.option('--name-file', type=str, help="Path to the .names file")
Review comment: point to the file for the names file, not just the directory.
Reply: Oh, it's just the comment that is wrong, not the code.
@click.option('--write-to', type=str, help="Path to directory where cleaned data will be written to")

def main(raw_data_file, name_file, write_to):
    """Clean raw data and validate it."""
    # Extract column names from .names file
    colnames = extract_column_name(name_file)

    # Read raw data
    imported_data = read_raw_data(raw_data_file, colnames)

    # Remove the id column and relabel the diagnosis column
    cleaned_data = clean_data(imported_data)

    # Validate cleaned data
    # Load the CSV config file (a path relative to the project root; a leading '/' would resolve from the filesystem root)
    data_config_file = 'data/processed/data_config.csv'

    # Define schema; compare against the cleaned data's columns, since 'id' has already been dropped
    schema = build_schema_from_csv(data_config=data_config_file, expected_columns=cleaned_data.columns.tolist())

    schema.validate(cleaned_data)

    # Write data to specified directory
    write_data(cleaned_data, write_to)

if __name__ == '__main__':
    main()
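For reference, a hedged usage sketch (the script location and the data paths are assumptions, not taken from this diff): running python scripts/clean_validate.py --raw-data-file data/raw/wdbc.data --name-file data/raw/wdbc.names --write-to data/processed would clean the raw WDBC data, validate it against data/processed/data_config.csv, and write the cleaned CSV into data/processed.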
@@ -0,0 +1,96 @@
# clean_data.py
# author: Weilin Han
# date: 2024-10-15

import pandas as pd
import re
import os


def extract_column_name(raw_name_file):
Review comment: I think this function does too much. A function should do just one thing. I would move the open command to the script (so it reads in the whole file) and have just the regular-expression parsing be what is modularized into the function. (A sketch of this refactor follows the function below.)
"""Extract and clean column names from .names file.""" | ||
|
||
# Input Validation Checks 1: Ensure the raw name file exists, if not raise error | ||
if not os.path.exists(raw_name_file): | ||
raise FileNotFoundError(f"The raw_name file does not exist.") | ||
Review comment on lines +14 to +15: I am fairly certain that …
    # Extract column names from the downloaded raw file
    text_lines = []
    with open(raw_name_file, 'r') as file:
        for line in file:
            line = line.strip()
            if not line.startswith('#') and line:  # skip comment lines and blank lines
                text_lines.append(line)
    start = text_lines.index('7. Attribute information')
    end = text_lines.index('8. Missing attribute values: none')
    text_lines = text_lines[start:end]

    pattern = re.compile(r'^[1-9a-z]\)\s*')
    text_lines = [item for item in text_lines if pattern.match(item)]
    text_lines = [pattern.sub('', item) for item in text_lines]
    text_lines = [item.split()[0].lower() for item in text_lines]

    statistics = ['mean', 'se', 'max']
    # se is standard error; max is the worst or largest (mean of the three largest values)

    # please refer to the original file for an explanation of the features
    colnames = text_lines[0:2]
    for stat in statistics:
        for feature in text_lines[2:]:
            colnames.append(stat + '_' + feature)

    return colnames
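A minimal sketch of the refactor the reviewer suggests, under assumptions: the helper name extract_column_name_from_lines and the exact split of responsibilities are illustrative, not part of this PR. The file I/O moves to the calling script and only the parsing stays in the function.

# Sketch: file reading stays in the script; only the regex parsing is modularized.
import re

def extract_column_name_from_lines(text_lines):
    """Extract and clean column names from pre-read, pre-stripped .names lines."""
    start = text_lines.index('7. Attribute information')
    end = text_lines.index('8. Missing attribute values: none')
    section = text_lines[start:end]

    pattern = re.compile(r'^[1-9a-z]\)\s*')
    features = [pattern.sub('', item) for item in section if pattern.match(item)]
    features = [item.split()[0].lower() for item in features]

    colnames = features[0:2]  # 'id' and 'diagnosis'
    for stat in ['mean', 'se', 'max']:
        for feature in features[2:]:
            colnames.append(stat + '_' + feature)
    return colnames

# The script would then own the reading:
# with open(name_file, 'r') as f:
#     lines = [ln.strip() for ln in f if ln.strip() and not ln.strip().startswith('#')]
# colnames = extract_column_name_from_lines(lines)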
def read_raw_data(raw_data, col_name):
    """Read data from .data file."""

    # Input Validation Check 1: Ensure the raw data file exists; if not, raise an error
    if not os.path.exists(raw_data):
        raise FileNotFoundError("The raw_data file does not exist.")

    # Input Validation Check 2: Ensure col_name is a list; if not, raise an error
    if not isinstance(col_name, list):
        raise TypeError("col_name must be a list.")

    # Input Validation Check 3: Ensure the list has 32 items; if not, raise an error
    if len(col_name) != 32:
        raise ValueError("col_name must contain exactly 32 items.")
Review comment on lines +58 to +59: The magic numbers here are brittle and confusing to others who won't know where they come from. This is the number of columns in … (a sketch of a named constant follows this function).
    # Input Validation Check 4: Ensure the list only contains strings; if not, raise an error
    if not all(isinstance(item, str) for item in col_name):
        raise ValueError("col_name must only contain strings.")

    imported_data = pd.read_csv(raw_data, names=col_name, header=None)
    return imported_data
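A small sketch of the fix the reviewer is pointing at: name the magic number as a module-level constant so readers know where 32 comes from (the constant name is an assumption).

# Sketch: a named constant instead of a bare 32.
N_EXPECTED_COLUMNS = 32  # id + diagnosis + 30 feature columns in the raw WDBC .data file

def read_raw_data(raw_data, col_name):
    ...
    if len(col_name) != N_EXPECTED_COLUMNS:
        raise ValueError(f"col_name must contain exactly {N_EXPECTED_COLUMNS} items.")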
def clean_data(imported_data, drop_columns=['id'], relabel={'M': 'Malignant', 'B': 'Benign'}):
    """Clean imported data"""
    # Input Validation Check 1: Ensure imported_data is a dataframe
    if not isinstance(imported_data, pd.DataFrame):
        raise TypeError("imported_data must be a data frame.")

    # Input Validation Check 2: Ensure drop_columns is a list
    if not isinstance(drop_columns, list):
        raise TypeError("drop_columns must be a list.")

    # Input Validation Check 3: Ensure relabel is a dictionary
    if not isinstance(relabel, dict):
        raise TypeError("relabel must be a dictionary")

    cleaned_data = imported_data.drop(columns=drop_columns)
    cleaned_data['diagnosis'] = cleaned_data['diagnosis'].replace(relabel)
    return cleaned_data
def write_data(cleaned_data, data_to):
    """Write cleaned and validated data to directory"""
    # Input Validation Check 1: Ensure cleaned_data is a dataframe; if not, raise an error
    if not isinstance(cleaned_data, pd.DataFrame):
        raise TypeError("cleaned_data must be a data frame.")

    # Input Validation Check 2: Check that the directory path exists; if not, raise an error
    if not os.path.exists(data_to):
        raise FileNotFoundError('The directory provided does not exist.')
Review comment on lines +93 to +94: If you don't have this in your function, will Python's …
    # Input Validation Check 3: Check that the directory path provided is a directory; if not, raise an error
    if not os.path.isdir(data_to):
        raise NotADirectoryError('The directory path provided is not a directory, it is an existing file path. Please provide a path to a new, or existing directory.')

    # Write the cleaned data as CSV into the target directory
    # (the file name cleaned_data.csv is an assumption; the diff shown here ends before the write call)
    cleaned_data.to_csv(os.path.join(data_to, 'cleaned_data.csv'), index=False)
@@ -0,0 +1,63 @@
# validate_data.py
# author: Weilin Han
# date: 2024-10-03

import pandas as pd
import pandera as pa
import os

# Function to build schema from the config file
def build_schema_from_csv(data_config, expected_columns):
    """Building schema to validate data using pandera"""

    # Input Validation Check 1: Ensure the data_config file exists; if not, raise an error
    if not os.path.exists(data_config):
        raise FileNotFoundError("The data_config file does not exist.")

    config_df = pd.read_csv(data_config)

    # Ensure the pandas dataframe has four columns: column, type, min, max
    required_columns = ['column', 'type', 'min', 'max']
    if list(config_df.columns) != required_columns:
        raise ValueError("The configuration file must have exactly four columns: 'column', 'type', 'min', 'max'.")

    # Ensure the values of 'column' match the column names extracted from the name file
    if expected_columns is not None:
        actual_columns = config_df['column'].str.strip("'").tolist()  # clean up any extra quotation marks in 'column'
        if set(actual_columns) != set(expected_columns):
            raise ValueError("Column names in the config file do not match the expected columns.")

    schema_dict = {}

    # Loop through each row in the config DataFrame
    for _, row in config_df.iterrows():
        column_name = row['column'].strip()  # remove potential extra spaces
        column_type = row['type'].strip()  # strip any spaces
        min_value = row['min'] if pd.notna(row['min']) else None
        max_value = row['max'] if pd.notna(row['max']) else None

        # Map the config's type string to the corresponding Pandera data type
        if column_type == 'int':
            dtype = pa.Int
        elif column_type == 'float':
            dtype = pa.Float
        elif column_type == 'str':
            dtype = pa.String
        else:
            raise ValueError(f"Unsupported column type: {column_type}")
Review comment on lines +40 to +48: None of this is needed. Pandera's schemas work with … (see the sketch after this function).
        # Create validation checks
        checks = []
        if min_value is not None:
            checks.append(pa.Check.greater_than_or_equal_to(float(min_value)))
        if max_value is not None:
            checks.append(pa.Check.less_than_or_equal_to(float(max_value)))

        # Add the column schema to the schema dictionary
        schema_dict[column_name] = pa.Column(dtype, checks=checks, nullable=False)

    # Return the DataFrameSchema object
    return pa.DataFrameSchema(schema_dict)
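The reviewer's comment above is truncated, but the likely point is that pandera accepts Python builtin types (str, int, float) and dtype-alias strings directly in pa.Column, so the if/elif mapping to pa.Int/pa.Float/pa.String may be unnecessary. A minimal sketch, assuming the config's 'type' values ('int', 'float', 'str') are resolvable dtype aliases:

# Sketch: pass the config's type string straight to pa.Column.
import pandera as pa

def column_from_config(column_type, checks):
    # pandera resolves dtype aliases like 'int', 'float', and 'str' through its dtype engine
    return pa.Column(column_type, checks=checks, nullable=False)

# e.g. column_from_config('float', [pa.Check.greater_than_or_equal_to(6.0),
#                                   pa.Check.less_than_or_equal_to(40.0)])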
@@ -0,0 +1,107 @@
import pytest
import pandas as pd
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from src.clean_data import *
# Test files setup
col_name1 = ["col" + str(i) for i in range(32)]  # 32 strings
col_name2 = {"1": "apple"}
col_name3 = ['1', '2', '3']
col_name4 = ["col" + str(i) for i in range(31)] + [123]  # 31 strings + 1 integer

imported_data1 = pd.DataFrame({
    'id': [1, 2, 3],
    'class': ['M', 'B', 'M']
})
imported_data2 = [1, 2, 3, 4, 5]
drop_columns1 = ['id']
drop_columns2 = {'1': 'id'}
relabel1 = {'M': 'Malignant', 'B': 'Benign'}
relabel2 = ['M', 'B']

cleaned_data1 = pd.DataFrame({
    'id': [1, 2, 3],
    'class': ['M', 'B', 'M']
})
cleaned_data2 = [1, 2, 3, 4, 5]
# set up an empty directory for data files to be written to
if not os.path.exists('tests/test_write_data1'):
    os.makedirs('tests/test_write_data1')
||
# Tests | ||
|
||
# Tests for extract_column_name | ||
|
||
# test extract_column_name function throws an error | ||
# if the raw name file does not exist | ||
def test_extract_column_name_error_on_missing_file(): | ||
with pytest.raises(FileNotFoundError, match='The raw_name file does not exist.'): | ||
extract_column_name('tests/test_name_data.name') | ||
|
||
# Tests for read_raw_data | ||
|
||
# test read_raw_data function throws an error | ||
# if the raw data file does not exist | ||
def test_read_raw_data_error_on_missing_file(): | ||
with pytest.raises(FileNotFoundError, match='The raw_data file does not exist.'): | ||
read_raw_data('tests/test_raw_data.data',col_name1) | ||
|
||
# test read_raw_data function throws an error | ||
# if the col_name is not a list | ||
def test_read_raw_data_error_on_non_list(): | ||
with pytest.raises(TypeError, match="col_name must be a list."): | ||
read_raw_data('tests/test_wdbc.data',col_name2) | ||
|
||
# test read_raw_data function throws an error | ||
# if the col_name does not have 32 values | ||
def test_read_raw_data_error_on_insufficient_list_item(): | ||
with pytest.raises(ValueError, match="col_name must contain exactly 32 items."): | ||
read_raw_data('tests/test_wdbc.data', col_name3) | ||
|
||
# test read_raw_data function throws an error | ||
# if the col_name contains items other than string | ||
def test_read_raw_data_error_on_wrong_item_type(): | ||
with pytest.raises(ValueError, match="col_name must only contain strings."): | ||
read_raw_data('tests/test_wdbc.data', col_name4) | ||
|
||
# Tests for clean_data | ||
|
||
# test clean_data function throws an error | ||
# if the imported_data is not a dataframe | ||
def test_clean_data_error_on_wrong_imported_data_format(): | ||
with pytest.raises(TypeError, match="imported_data must be a data frame."): | ||
clean_data(imported_data2, drop_columns1, relabel1) | ||
|
||
# test clean_data function throws an error | ||
# if the drop_columns is not a list | ||
def test_clean_data_error_on_wrong_drop_columns_format(): | ||
with pytest.raises(TypeError, match="drop_columns must be a list."): | ||
clean_data(imported_data1, drop_columns2, relabel1) | ||
|
||
|
||
# test clean_data function throws an error | ||
# if the relabel is not a dictionary | ||
def test_clean_data_error_on_wrong_relabel_format(): | ||
with pytest.raises(TypeError, match="relabel must be a dictionary"): | ||
clean_data(imported_data1, drop_columns1, relabel2) | ||
|
||
# Tests for write_data

# test write_data function throws an error
# if cleaned_data is not a dataframe
def test_write_data_error_on_wrong_cleaned_data_format():
    with pytest.raises(TypeError, match="cleaned_data must be a data frame."):
        write_data(cleaned_data2, 'tests/test_write_data1')

# test write_data function throws an error
# if the write_to path provided does not exist
def test_write_data_error_on_nonexistent_dir():
    with pytest.raises(FileNotFoundError, match='The directory provided does not exist.'):
        write_data(cleaned_data1, 'tests/test_write_data3')

# test write_data function throws an error
# if the directory path provided is not a directory
def test_write_data_error_on_not_a_directory():
    with pytest.raises(NotADirectoryError, match='The directory path provided is not a directory, it is an existing file path. Please provide a path to a new, or existing directory.'):
        write_data(cleaned_data1, 'tests/conftest.py')
Review comment: Should we include column labels in this config, or in another? It seems strange to only have numerical data in the config file. (A sketch of one option follows.)
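One hedged sketch of how the config could carry categorical labels as well; the fifth 'allowed' column, the semicolon separator, and the helper below are illustrative, not part of this PR. A row like diagnosis,str,,,Malignant;Benign would then translate into a pandera isin check.

# Sketch: turn a hypothetical 'allowed' config field into an isin check.
import pandas as pd
import pandera as pa

def checks_for_row(row):
    checks = []
    if pd.notna(row.get('min')):
        checks.append(pa.Check.greater_than_or_equal_to(float(row['min'])))
    if pd.notna(row.get('max')):
        checks.append(pa.Check.less_than_or_equal_to(float(row['max'])))
    if isinstance(row.get('allowed'), str):  # 'allowed' is an assumed extra config column
        checks.append(pa.Check.isin(row['allowed'].split(';')))
    return checks

# e.g. for the diagnosis row above, this yields pa.Check.isin(['Malignant', 'Benign'])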