diff --git a/Dockerfile b/Dockerfile index d9dd6d2..c709802 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,16 +3,17 @@ FROM quay.io/jupyter/minimal-notebook:notebook-7.0.6 # install necessary packages for analysis RUN conda install -y \ - python=3.11.6 \ - altair=5.1.2 \ - pandas=2.1.2 \ - ipykernel=6.26.0 \ - scikit-learn=1.3.2 \ - requests=2.31.0 \ - notebook=7.0.6 \ - pytest=7.4.3 \ - responses=0.24.1 \ + python=3.11.7 \ + altair=5.4.1 \ + pandas=1.5.3 \ + ipykernel=6.29.5 \ + scikit-learn=1.5.2 \ + requests=2.32.3 \ + notebook=7.0.8 \ + pytest=8.3.3 \ + responses=0.25.3 \ click=8.1.7 \ - vl-convert-python=1.1.0 \ - jupyter-book=0.15.1 \ - make + vl-convert-python=1.7.0 \ + jupyter-book=1.0.3 \ + make +RUN pip install great-expectations==1.1.3 pandera==0.20.4 diff --git a/data/processed/data_config.csv b/data/processed/data_config.csv new file mode 100644 index 0000000..c00e11b --- /dev/null +++ b/data/processed/data_config.csv @@ -0,0 +1,32 @@ +column,type,min,max,category,max_nullable +diagnosis,str,,,"Malignant,Benign",0 +mean_radius,float,6,40,,0.1 +mean_texture,float,9,50,,0.1 +mean_perimeter,float,40,260,,0.1 +mean_area,float,140,4300,,0.1 +mean_smoothness,float,0,1,,0.1 +mean_compactness,float,0,2,,0.1 +mean_concavity,float,0,2,,0.1 +mean_concave_points,float,0,1,,0.1 +mean_symmetry,float,0,1,,0.1 +mean_fractal_dimension,float,0,1,,0.1 +se_radius,float,0,3,,0.1 +se_texture,float,0,5,,0.1 +se_perimeter,float,0,22,,0.1 +se_area,float,6,550,,0.1 +se_smoothness,float,0,1,,0.1 +se_compactness,float,0,1,,0.1 +se_concavity,float,0,1,,0.1 +se_concave_points,float,0,1,,0.1 +se_symmetry,float,0,1,,0.1 +se_fractal_dimension,float,0,1,,0.1 +max_radius,float,6,40,,0.1 +max_texture,float,9,50,,0.1 +max_perimeter,float,40,260,,0.1 +max_area,float,140,4300,,0.1 +max_smoothness,float,0,1,,0.1 +max_compactness,float,0,2,,0.1 +max_concavity,float,0,2,,0.1 +max_concave_points,float,0,1,,0.1 +max_symmetry,float,0,1,,0.1 +max_fractal_dimension,float,0,1,,0.1 \ No newline at end of file diff --git a/scripts/clean_validate.py 
b/scripts/clean_validate.py new file mode 100644 index 0000000..5964079 --- /dev/null +++ b/scripts/clean_validate.py @@ -0,0 +1,44 @@ +# clean_validate.py +# author: Weilin Han +# date: 2024-10-20 + +import click +import os +import sys +import pandas as pd +import pandera as pa +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from src.clean_data import extract_column_name, read_data, clean_data, write_data +from src.validate_data import build_schema_from_csv, validate_data + +@click.command() +@click.option('--raw-data-file', type=str, help="Path to raw data file") +@click.option('--name-file', type=str, help="Path to names file") +@click.option('--data-config-file', type=str, help="Path to data configuration file") +@click.option('--write-to', type=str, help="Path to directory where cleaned data will be written to") +@click.option('--written-file-name', type=str, help="The name of the file will be written") + +def main(raw_data_file, name_file, data_config_file, write_to, written_file_name): + """Clean raw data and validate it.""" + # Extract column names from .names file + colnames = extract_column_name(name_file) + + # Read raw data + imported_data = read_data(raw_data_file, colnames) + + # Removing id column and relabel diagnosis column + cleaned_data = clean_data(imported_data) + + + # Create schema + config_df = pd.read_csv(data_config_file) + + schema=build_schema_from_csv(data_config=config_df, expected_columns=colnames[1:]) #removing id colnames list + # Validate cleaned data + validate_data(schema=schema, dataframe=cleaned_data) + + # Write data to specified directory + write_data(cleaned_data, write_to, written_file_name) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/src/clean_data.py b/src/clean_data.py new file mode 100644 index 0000000..1120dec --- /dev/null +++ b/src/clean_data.py @@ -0,0 +1,104 @@ +# clean_data.py +# author: Weilin Han +# date: 2024-10-15 + +import pandas as pd +import re 
+import os + + +def extract_column_name(raw_name_file): + """Extract and clean column names from .names file.""" + + # Ensure the raw name file exists, if not raise error + if not os.path.exists(raw_name_file): + raise FileNotFoundError(f"The raw_name file does not exist.") + + # Extracting column names from downloaded raw file + text_lines = [] + with open(raw_name_file, 'r') as file: + for line in file: + line = line.strip() + if not line.startswith('#') and line: # Skip comment lines and blank lines + text_lines.append(line) + start = text_lines.index('7. Attribute information') + end = text_lines.index('8. Missing attribute values: none') + text_lines = text_lines[start:end] + + pattern = re.compile(r'^[1-9a-z]\)\s*') + text_lines = [item for item in text_lines if pattern.match(item)] + text_lines = [pattern.sub('', item) for item in text_lines] + pattern = re.compile(r'\(.*?\)') + text_lines = [re.sub(r"\s+", "_", pattern.sub('', item).strip()) for item in text_lines] + + statistics = ['mean','se','max'] + #se is standard error, and max is the worst or largest (mean of three largest values) + + # please refer to original file for explanation of features + colnames = text_lines[0:2] + for stat in statistics: + for feature in text_lines[2:]: + colnames.append(stat+'_'+feature) + colnames = [col.lower() for col in colnames] + + return colnames + +def read_data(raw_data, col_name): + """Read data from .data or .csv file.""" + + # Ensure the raw data file exists, if not raise error + if not os.path.exists(raw_data): + raise FileNotFoundError(f"The raw_data file does not exist.") + + # Ensure the col_name is a list, if not raise error + if not isinstance(col_name, list): + raise TypeError("col_name must be a list.") + + # Ensure the list has 32 items, if not raise error + if len(col_name) != 32: + raise ValueError("col_name must contain exactly 32 items.") + + # Ensure the list only contains strings, if not raise error + if not all(isinstance(item, str) for item in col_name): + raise 
ValueError("col_name must only contain strings.") + + imported_data = pd.read_csv(raw_data, names=col_name, header=None) + return imported_data + +def clean_data(imported_data, drop_columns=['id'], relabel={'M' : 'Malignant','B' : 'Benign'}): + """Clean imported data""" + # Ensure the imported_data is a dataframe + if not isinstance(imported_data, pd.DataFrame): + raise TypeError("imported_data must be a data frame.") + + # Ensure the drop_columns is a list + if not isinstance(drop_columns, list): + raise TypeError("drop_columns must be a list.") + + # Ensure the relabel is a dictionary + if not isinstance(relabel, dict): + raise TypeError("relabel must be a dictionary") + + cleaned_data = imported_data.drop(columns=drop_columns) + cleaned_data['diagnosis'] = cleaned_data['diagnosis'].replace(relabel) + return cleaned_data + +def write_data(dataframe, data_to, name_of_file): + """Write data to directory""" + # Ensure the data_frame is a dataframe, if not raise an error + if not isinstance(dataframe, pd.DataFrame): + raise TypeError("dataframe must be a pandas data frame.") + + # Ensure directory path exists, if not raise an error + if not os.path.exists(data_to): + raise FileNotFoundError('The directory provided does not exist.') + + # Ensure the directory path provided is a directory, if not raise an error + if not os.path.isdir(data_to): + raise NotADirectoryError('The directory path provided is not a directory, it is an existing file path. 
Please provide a path to a new, or existing directory.') + + # Ensure the name of file is string, if not raise an error + if not isinstance(name_of_file, str): + raise TypeError("name_of_file must be string.") + + dataframe.to_csv(os.path.join(data_to, name_of_file), index=False) \ No newline at end of file diff --git a/src/validate_data.py b/src/validate_data.py new file mode 100644 index 0000000..b440eef --- /dev/null +++ b/src/validate_data.py @@ -0,0 +1,98 @@ +# validate_data.py +# author: Weilin Han +# date: 2024-10-03 + +import pandas as pd +import pandera as pa + +# Function to build schema from the config file +def build_schema_from_csv(data_config, expected_columns): + """Building schema for validation""" + + # Ensure the data_config is a pandas dataframe + if not isinstance(data_config, pd.DataFrame): + raise TypeError("data_config must be a pandas dataframe.") + + # Ensure the data_config has following columns: column,type,max,min,category + required_columns = ['column', 'type', 'min', 'max','category', 'max_nullable'] + missing_columns = [col for col in required_columns if col not in data_config.columns] + if missing_columns: + raise ValueError(f"The data_config must have following columns: 'column', 'type', 'min', 'max', 'category', 'max_nullable'.") + + # Ensure the values of 'column' match the column names extracted from name file + if expected_columns is not None: + actual_columns = data_config['column'].str.strip("'").tolist() # Clean up any extra quotation marks in 'column' + if actual_columns != expected_columns: + raise ValueError("Column names in the config file do not match the expected columns.") + + + schema_dict = {} + + # Loop through each row in the config DataFrame + for _, row in data_config.iterrows(): + column_name = row['column'].strip() # Removing potential extra spaces + column_type = row['type'].strip() # Strip any spaces + min_value = row['min'] if pd.notna(row['min']) else None + max_value = row['max'] if pd.notna(row['max']) 
else None + category_in = row['category'] if pd.notna(row['category']) else None + max_nullable = row['max_nullable'] if pd.notna(row['max_nullable']) else None + + # Define the correct Pandera data type + if column_type == 'int': + dtype = pa.Int + elif column_type == 'float': + dtype = pa.Float + elif column_type == 'str': + dtype = pa.String + else: + raise ValueError(f"Unsupported column type: {column_type}") + + # Create value range validation checks + value_range_checks = [] + if min_value is not None: + value_range_checks.append(pa.Check.greater_than_or_equal_to(float(min_value), + error=f'Value is smaller than {min_value}')) + if max_value is not None: + value_range_checks.append(pa.Check.less_than_or_equal_to(float(max_value), + error=f'Value is larger than {max_value}')) + if category_in is not None: + category_list = category_in.split(',') + value_range_checks.append(pa.Check.isin(category_list, + error=f'Value not in {category_list}')) + if max_nullable is not None: + value_range_checks.append(pa.Check(lambda s: s.isna().mean() <= max_nullable, + error=f'Too many missing values, must have at least {(1-max_nullable)*100}% non-null values.')) + + # Add the column schema to the schema dictionary + schema_dict[column_name] = pa.Column(dtype,nullable=True, checks=value_range_checks) + + global_checks=[ + pa.Check(lambda df: ~df.duplicated().any(), error="Duplicate rows found."), + pa.Check(lambda df: ~(df.isna().all(axis=1)).any(), error="Empty rows found.") + ] + + return pa.DataFrameSchema(schema_dict, checks=global_checks) + + +# Function to validate schema +def validate_data(schema, dataframe): + """Building schema to validate data using pandera""" + + # Ensure the schema is a pandera schema, if not raise an error + if not isinstance(schema, pa.DataFrameSchema): + raise TypeError("schema must be a pandera dataframe schema.") + + # Ensure the data_frame is a dataframe, if not raise an error + if not isinstance(dataframe, pd.DataFrame): + raise 
TypeError("dataframe must be a pandas data frame.") + + # Ensure the data_frame has observations, if not raise an error + if dataframe.empty: + raise ValueError("dataframe must contain observations.") + + schema.validate(dataframe, lazy=True) + # return print(f"Expected Columns: {expected_columns}, Actual Columns: {actual_columns}") + + + + diff --git a/tests/test_clean_data.py b/tests/test_clean_data.py new file mode 100644 index 0000000..7ed2632 --- /dev/null +++ b/tests/test_clean_data.py @@ -0,0 +1,112 @@ +import pytest +import pandas as pd +import os +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from src.clean_data import extract_column_name,read_data,clean_data,write_data + +# Test files setup +col_name1 = ["col" + str(i) for i in range(32)] # 32 strings +col_name2 = {"1":"apple"} +col_name3 = ['1','2','3'] +col_name4 = ["col" + str(i) for i in range(31)] + [123] # 31 strings + 1 integer + +imported_data1 = pd.DataFrame({ + 'id': [1, 2, 3], + 'class': ['M', 'B', 'M'] + }) +imported_data2 = [1, 2, 3, 4, 5] +drop_columns1=['id'] +drop_columns2={'1':'id'} +relabel1={'M' : 'Malignant','B' : 'Benign'} +relabel2=['M','B'] + +cleaned_data1 = pd.DataFrame({ + 'diagnosis': ['Malignant','Benign','Malignant'], + 'mean_raius': [1, 2, 3] + }) +cleaned_data2 = [1, 2, 3, 4, 5] +# setup empty directory for data files to be downloaded to +if not os.path.exists('tests/test_write_data1'): + os.makedirs('tests/test_write_data1') + +# Tests + +# Tests for extract_column_name + +# test extract_column_name function throws an error +# if the raw name file does not exist +def test_extract_column_name_error_on_missing_file(): + with pytest.raises(FileNotFoundError, match='The raw_name file does not exist.'): + extract_column_name('tests/test_name_data.name') + +# Tests for read_data + +# test read_data function throws an error +# if the raw data file does not exist +def test_read_data_error_on_missing_file(): + with pytest.raises(FileNotFoundError, 
match='The raw_data file does not exist.'): + read_data('tests/test_data.data',col_name1) + +# test read_data function throws an error +# if the col_name is not a list +def test_read_data_error_on_non_list(): + with pytest.raises(TypeError, match="col_name must be a list."): + read_data('tests/test_wdbc.data',col_name2) + +# test read_data function throws an error +# if the col_name does not have 32 values +def test_read_data_error_on_insufficient_list_item(): + with pytest.raises(ValueError, match="col_name must contain exactly 32 items."): + read_data('tests/test_wdbc.data', col_name3) + +# test read_data function throws an error +# if the col_name contains items other than string +def test_read_data_error_on_wrong_item_type(): + with pytest.raises(ValueError, match="col_name must only contain strings."): + read_data('tests/test_wdbc.data', col_name4) + +# Tests for clean_data + +# test clean_data function throws an error +# if the imported_data is not a dataframe +def test_clean_data_error_on_wrong_imported_data_format(): + with pytest.raises(TypeError, match="imported_data must be a data frame."): + clean_data(imported_data2, drop_columns1, relabel1) + +# test clean_data function throws an error +# if the drop_columns is not a list +def test_clean_data_error_on_wrong_drop_columns_format(): + with pytest.raises(TypeError, match="drop_columns must be a list."): + clean_data(imported_data1, drop_columns2, relabel1) + + +# test clean_data function throws an error +# if the relabel is not a dictionary +def test_clean_data_error_on_wrong_relabel_format(): + with pytest.raises(TypeError, match="relabel must be a dictionary"): + clean_data(imported_data1, drop_columns1, relabel2) + +# Tests for write_data + +# test write_data function throws an error +# if the dataframe is not a dataframe +def test_write_data_error_on_wrong_cleaned_data_format(): + with pytest.raises(TypeError, match="dataframe must be a pandas data frame."): + write_data(cleaned_data2, 'tests/', 
'test_write_data1') + +# test write_data function throws an error +# if the write_to path provided does not exist +def test_write_data_error_on_nonexistent_dir(): + with pytest.raises(FileNotFoundError, match='The directory provided does not exist.'): + write_data(cleaned_data1, 'test/', 'test_write_data2') + +# if the directory path provided is not directory +def test_write_data_error_on_missing_dir(): + with pytest.raises(NotADirectoryError, match='The directory path provided is not a directory, it is an existing file path. Please provide a path to a new, or existing directory.'): + write_data(cleaned_data1, 'tests/conftest.py','test_write_data3') + +# if the name_of_file is not a string +def test_read_data_error_on_wrong_name_of_file_format(): + with pytest.raises(TypeError, match='name_of_file must be string.'): + write_data(cleaned_data1, 'tests/', 1) \ No newline at end of file diff --git a/tests/test_cleaned_data.csv b/tests/test_cleaned_data.csv new file mode 100644 index 0000000..d7fb5ea --- /dev/null +++ b/tests/test_cleaned_data.csv @@ -0,0 +1,101 @@ +diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,se_radius,se_texture,se_perimeter,se_area,se_smoothness,se_compactness,se_concavity,se_concave_points,se_symmetry,se_fractal_dimension,max_radius,max_texture,max_perimeter,max_area,max_smoothness,max_compactness,max_concavity,max_concave_points,max_symmetry,max_fractal_dimension +Benign,11.16,21.41,70.95,380.3,0.1018,0.05978,0.008955,0.01076,0.1615,0.06144,0.2865,1.678,1.968,18.99,0.006908,0.009442,0.006972,0.006159,0.02694,0.00206,12.36,28.92,79.26,458.0,0.1282,0.1108,0.03582,0.04306,0.2976,0.07123 +Malignant,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,1.176,1.256,7.673,158.7,0.0103,0.02891,0.05198,0.02454,0.01114,0.004239,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115 
+Malignant,18.45,21.91,120.2,1075.0,0.0943,0.09709,0.1153,0.06847,0.1692,0.05727,0.5959,1.202,3.766,68.35,0.006001,0.01422,0.02855,0.009148,0.01492,0.002205,22.52,31.39,145.6,1590.0,0.1465,0.2275,0.3965,0.1379,0.3109,0.0761 +Benign,10.94,18.59,70.39,370.0,0.1004,0.0746,0.04944,0.02932,0.1486,0.06615,0.3796,1.743,3.018,25.78,0.009519,0.02134,0.0199,0.01155,0.02079,0.002701,12.4,25.58,82.76,472.4,0.1363,0.1644,0.1412,0.07887,0.2251,0.07732 +Malignant,17.08,27.15,111.2,930.9,0.09898,0.111,0.1007,0.06431,0.1793,0.06281,0.9291,1.152,6.051,115.2,0.00874,0.02219,0.02721,0.01458,0.02045,0.004417,22.96,34.49,152.1,1648.0,0.16,0.2444,0.2639,0.1555,0.301,0.0906 +Malignant,16.24,18.77,108.8,805.1,0.1066,0.1802,0.1948,0.09052,0.1876,0.06684,0.2873,0.9173,2.464,28.09,0.004563,0.03481,0.03872,0.01209,0.01388,0.004081,18.55,25.09,126.9,1031.0,0.1365,0.4706,0.5026,0.1732,0.277,0.1063 +Benign,11.74,14.69,76.31,426.0,0.08099,0.09661,0.06726,0.02639,0.1499,0.06758,0.1924,0.6417,1.345,13.04,0.006982,0.03916,0.04017,0.01528,0.0226,0.006822,12.45,17.6,81.25,473.8,0.1073,0.2793,0.269,0.1056,0.2604,0.09879 +Malignant,20.51,27.81,134.4,1319.0,0.09159,0.1074,0.1554,0.0834,0.1448,0.05592,0.524,1.189,3.767,70.01,0.00502,0.02062,0.03457,0.01091,0.01298,0.002887,24.47,37.38,162.7,1872.0,0.1223,0.2761,0.4146,0.1563,0.2437,0.08328 +Benign,11.6,24.49,74.23,417.2,0.07474,0.05688,0.01974,0.01313,0.1935,0.05878,0.2512,1.786,1.961,18.21,0.006122,0.02337,0.01596,0.006998,0.03194,0.002211,12.44,31.62,81.39,476.5,0.09545,0.1361,0.07239,0.04815,0.3244,0.06745 +Malignant,18.63,25.11,124.8,1088.0,0.1064,0.1887,0.2319,0.1244,0.2183,0.06197,0.8307,1.466,5.574,105.0,0.006248,0.03374,0.05196,0.01158,0.02007,0.00456,23.15,34.01,160.5,1670.0,0.1491,0.4257,0.6133,0.1848,0.3444,0.09782 +Benign,11.95,14.96,77.23,426.7,0.1158,0.1206,0.01171,0.01787,0.2459,0.06581,0.361,1.05,2.455,26.65,0.0058,0.02417,0.007816,0.01052,0.02734,0.003114,12.81,17.72,83.09,496.2,0.1293,0.1885,0.03122,0.04766,0.3124,0.0759 
+Benign,9.676,13.14,64.12,272.5,0.1255,0.2204,0.1188,0.07038,0.2057,0.09575,0.2744,1.39,1.787,17.67,0.02177,0.04888,0.05189,0.0145,0.02632,0.01148,10.6,18.04,69.47,328.1,0.2006,0.3663,0.2913,0.1075,0.2848,0.1364 +Malignant,17.54,19.32,115.1,951.6,0.08968,0.1198,0.1036,0.07488,0.1506,0.05491,0.3971,0.8282,3.088,40.73,0.00609,0.02569,0.02713,0.01345,0.01594,0.002658,20.42,25.84,139.5,1239.0,0.1381,0.342,0.3508,0.1939,0.2928,0.07867 +Benign,13.34,15.86,86.49,520.0,0.1078,0.1535,0.1169,0.06987,0.1942,0.06902,0.286,1.016,1.535,12.96,0.006794,0.03575,0.0398,0.01383,0.02134,0.004603,15.53,23.19,96.66,614.9,0.1536,0.4791,0.4858,0.1708,0.3527,0.1016 +Benign,10.82,24.21,68.89,361.6,0.08192,0.06602,0.01548,0.00816,0.1976,0.06328,0.5196,1.918,3.564,33.0,0.008263,0.0187,0.01277,0.005917,0.02466,0.002977,13.03,31.45,83.9,505.6,0.1204,0.1633,0.06194,0.03264,0.3059,0.07626 +Malignant,18.31,20.58,120.8,1052.0,0.1068,0.1248,0.1569,0.09451,0.186,0.05941,0.5449,0.9225,3.218,67.36,0.006176,0.01877,0.02913,0.01046,0.01559,0.002725,21.86,26.2,142.2,1493.0,0.1492,0.2536,0.3759,0.151,0.3074,0.07863 +Benign,11.49,14.59,73.99,404.9,0.1046,0.08228,0.05308,0.01969,0.1779,0.06574,0.2034,1.166,1.567,14.34,0.004957,0.02114,0.04156,0.008038,0.01843,0.003614,12.4,21.9,82.04,467.6,0.1352,0.201,0.2596,0.07431,0.2941,0.0918 +Malignant,19.4,18.18,127.2,1145.0,0.1037,0.1442,0.1626,0.09464,0.1893,0.05892,0.4709,0.9951,2.903,53.16,0.005654,0.02199,0.03059,0.01499,0.01623,0.001965,23.79,28.65,152.4,1628.0,0.1518,0.3749,0.4316,0.2252,0.359,0.07787 +Malignant,14.9,22.53,102.1,685.0,0.09947,0.2225,0.2733,0.09711,0.2041,0.06898,0.253,0.8749,3.466,24.19,0.006965,0.06213,0.07926,0.02234,0.01499,0.005784,16.35,27.57,125.4,832.7,0.1419,0.709,0.9019,0.2475,0.2866,0.1155 +Benign,14.81,14.7,94.66,680.7,0.08472,0.05016,0.03416,0.02541,0.1659,0.05348,0.2182,0.6232,1.677,20.72,0.006708,0.01197,0.01482,0.01056,0.0158,0.001779,15.61,17.58,101.7,760.2,0.1139,0.1011,0.1101,0.07955,0.2334,0.06142 
+Benign,13.11,22.54,87.02,529.4,0.1002,0.1483,0.08705,0.05102,0.185,0.0731,0.1931,0.9223,1.491,15.09,0.005251,0.03041,0.02526,0.008304,0.02514,0.004198,14.55,29.16,99.48,639.3,0.1349,0.4402,0.3162,0.1126,0.4128,0.1076 +Malignant,17.6,23.33,119.0,980.5,0.09289,0.2004,0.2136,0.1002,0.1696,0.07369,0.9289,1.465,5.801,104.9,0.006766,0.07025,0.06591,0.02311,0.01673,0.0113,21.57,28.87,143.6,1437.0,0.1207,0.4785,0.5165,0.1996,0.2301,0.1224 +Benign,10.16,19.59,64.73,311.7,0.1003,0.07504,0.005025,0.01116,0.1791,0.06331,0.2441,2.09,1.648,16.8,0.01291,0.02222,0.004174,0.007082,0.02572,0.002278,10.65,22.88,67.88,347.3,0.1265,0.12,0.01005,0.02232,0.2262,0.06742 +Benign,8.888,14.64,58.79,244.0,0.09783,0.1531,0.08606,0.02872,0.1902,0.0898,0.5262,0.8522,3.168,25.44,0.01721,0.09368,0.05671,0.01766,0.02541,0.02193,9.733,15.67,62.56,284.4,0.1207,0.2436,0.1434,0.04786,0.2254,0.1084 +Benign,10.48,14.98,67.49,333.6,0.09816,0.1013,0.06335,0.02218,0.1925,0.06915,0.3276,1.127,2.564,20.77,0.007364,0.03867,0.05263,0.01264,0.02161,0.00483,12.13,21.57,81.41,440.4,0.1327,0.2996,0.2939,0.0931,0.302,0.09646 +Malignant,17.57,15.05,115.0,955.1,0.09847,0.1157,0.09875,0.07953,0.1739,0.06149,0.6003,0.8225,4.655,61.1,0.005627,0.03033,0.03407,0.01354,0.01925,0.003742,20.01,19.52,134.9,1227.0,0.1255,0.2812,0.2489,0.1456,0.2756,0.07919 +Benign,12.78,16.49,81.37,502.5,0.09831,0.05234,0.03653,0.02864,0.159,0.05653,0.2368,0.8732,1.471,18.33,0.007962,0.005612,0.01585,0.008662,0.02254,0.001906,13.46,19.76,85.67,554.9,0.1296,0.07061,0.1039,0.05882,0.2383,0.0641 +Benign,11.63,29.29,74.87,415.1,0.09357,0.08574,0.0716,0.02017,0.1799,0.06166,0.3135,2.426,2.15,23.13,0.009861,0.02418,0.04275,0.009215,0.02475,0.002128,13.12,38.81,86.04,527.8,0.1406,0.2031,0.2923,0.06835,0.2884,0.0722 +Benign,11.2,29.37,70.67,386.0,0.07449,0.03558,0.0,0.0,0.106,0.05502,0.3141,3.896,2.041,22.81,0.007594,0.008878,0.0,0.0,0.01989,0.001773,11.92,38.3,75.19,439.6,0.09267,0.05494,0.0,0.0,0.1566,0.05905 
+Benign,13.37,16.39,86.1,553.5,0.07115,0.07325,0.08092,0.028,0.1422,0.05823,0.1639,1.14,1.223,14.66,0.005919,0.0327,0.04957,0.01038,0.01208,0.004076,14.26,22.75,91.99,632.1,0.1025,0.2531,0.3308,0.08978,0.2048,0.07628 +Malignant,17.42,25.56,114.5,948.0,0.1006,0.1146,0.1682,0.06597,0.1308,0.05866,0.5296,1.667,3.767,58.53,0.03113,0.08555,0.1438,0.03927,0.02175,0.01256,18.07,28.07,120.4,1021.0,0.1243,0.1793,0.2803,0.1099,0.1603,0.06818 +Malignant,13.81,23.75,91.56,597.8,0.1323,0.1768,0.1558,0.09176,0.2251,0.07421,0.5648,1.93,3.909,52.72,0.008824,0.03108,0.03112,0.01291,0.01998,0.004506,19.2,41.85,128.5,1153.0,0.2226,0.5209,0.4646,0.2013,0.4432,0.1086 +Benign,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,0.06373,0.3961,1.044,2.497,30.29,0.006953,0.01911,0.02701,0.01037,0.01782,0.003586,14.97,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875 +Benign,14.4,26.99,92.25,646.1,0.06995,0.05223,0.03476,0.01737,0.1707,0.05433,0.2315,0.9112,1.727,20.52,0.005356,0.01679,0.01971,0.00637,0.01414,0.001892,15.4,31.98,100.4,734.6,0.1017,0.146,0.1472,0.05563,0.2345,0.06464 +Benign,11.75,17.56,75.89,422.9,0.1073,0.09713,0.05282,0.0444,0.1598,0.06677,0.4384,1.907,3.149,30.66,0.006587,0.01815,0.01737,0.01316,0.01835,0.002318,13.5,27.98,88.52,552.3,0.1349,0.1854,0.1366,0.101,0.2478,0.07757 +Benign,12.36,21.8,79.78,466.1,0.08772,0.09445,0.06015,0.03745,0.193,0.06404,0.2978,1.502,2.203,20.95,0.007112,0.02493,0.02703,0.01293,0.01958,0.004463,13.83,30.5,91.46,574.7,0.1304,0.2463,0.2434,0.1205,0.2972,0.09261 +Malignant,15.13,29.81,96.71,719.5,0.0832,0.04605,0.04686,0.02739,0.1852,0.05294,0.4681,1.627,3.043,45.38,0.006831,0.01427,0.02489,0.009087,0.03151,0.00175,17.26,36.91,110.1,931.4,0.1148,0.09866,0.1547,0.06575,0.3233,0.06165 +Malignant,21.09,26.57,142.7,1311.0,0.1141,0.2832,0.2487,0.1496,0.2395,0.07398,0.6298,0.7629,4.414,81.46,0.004253,0.04759,0.03872,0.01567,0.01798,0.005295,26.68,33.48,176.5,2089.0,0.1491,0.7584,0.678,0.2903,0.4098,0.1284 
+Malignant,17.35,23.06,111.0,933.1,0.08662,0.0629,0.02891,0.02837,0.1564,0.05307,0.4007,1.317,2.577,44.41,0.005726,0.01106,0.01246,0.007671,0.01411,0.001578,19.85,31.47,128.2,1218.0,0.124,0.1486,0.1211,0.08235,0.2452,0.06515 +Benign,8.671,14.45,54.42,227.2,0.09138,0.04276,0.0,0.0,0.1722,0.06724,0.2204,0.7873,1.435,11.36,0.009172,0.008007,0.0,0.0,0.02711,0.003399,9.262,17.04,58.36,259.2,0.1162,0.07057,0.0,0.0,0.2592,0.07848 +Benign,13.78,15.79,88.37,585.9,0.08817,0.06718,0.01055,0.009937,0.1405,0.05848,0.3563,0.4833,2.235,29.34,0.006432,0.01156,0.007741,0.005657,0.01227,0.002564,15.27,17.5,97.9,706.6,0.1072,0.1071,0.03517,0.03312,0.1859,0.0681 +Malignant,17.95,20.01,114.2,982.0,0.08402,0.06722,0.07293,0.05596,0.2129,0.05025,0.5506,1.214,3.357,54.04,0.004024,0.008422,0.02291,0.009863,0.05014,0.001902,20.58,27.83,129.2,1261.0,0.1072,0.1202,0.2249,0.1185,0.4882,0.06111 +Benign,10.66,15.15,67.49,349.6,0.08792,0.04302,0.0,0.0,0.1928,0.05975,0.3309,1.925,2.155,21.98,0.008713,0.01017,0.0,0.0,0.03265,0.001002,11.54,19.2,73.2,408.3,0.1076,0.06791,0.0,0.0,0.271,0.06164 +Malignant,17.93,24.48,115.2,998.9,0.08855,0.07027,0.05699,0.04744,0.1538,0.0551,0.4212,1.433,2.765,45.81,0.005444,0.01169,0.01622,0.008522,0.01419,0.002751,20.92,34.69,135.1,1320.0,0.1315,0.1806,0.208,0.1136,0.2504,0.07948 +Benign,12.25,22.44,78.18,466.5,0.08192,0.052,0.01714,0.01261,0.1544,0.05976,0.2239,1.139,1.577,18.04,0.005096,0.01205,0.00941,0.004551,0.01608,0.002399,14.17,31.99,92.74,622.9,0.1256,0.1804,0.123,0.06335,0.31,0.08203 +Benign,12.49,16.85,79.19,481.6,0.08511,0.03834,0.004473,0.006423,0.1215,0.05673,0.1716,0.7151,1.047,12.69,0.004928,0.003012,0.00262,0.00339,0.01393,0.001344,13.34,19.71,84.48,544.2,0.1104,0.04953,0.01938,0.02784,0.1917,0.06174 +Benign,11.6,12.84,74.34,412.6,0.08983,0.07525,0.04196,0.0335,0.162,0.06582,0.2315,0.5391,1.475,15.75,0.006153,0.0133,0.01693,0.006884,0.01651,0.002551,13.06,17.16,82.96,512.5,0.1431,0.1851,0.1922,0.08449,0.2772,0.08756 
+Malignant,17.02,23.98,112.8,899.3,0.1197,0.1496,0.2417,0.1203,0.2248,0.06382,0.6009,1.398,3.999,67.78,0.008268,0.03082,0.05042,0.01112,0.02102,0.003854,20.88,32.09,136.1,1344.0,0.1634,0.3559,0.5588,0.1847,0.353,0.08482 +Benign,14.2,20.53,92.41,618.4,0.08931,0.1108,0.05063,0.03058,0.1506,0.06009,0.3478,1.018,2.749,31.01,0.004107,0.03288,0.02821,0.0135,0.0161,0.002744,16.45,27.26,112.1,828.5,0.1153,0.3429,0.2512,0.1339,0.2534,0.07858 +Benign,11.9,14.65,78.11,432.8,0.1152,0.1296,0.0371,0.03003,0.1995,0.07839,0.3962,0.6538,3.021,25.03,0.01017,0.04741,0.02789,0.0111,0.03127,0.009423,13.15,16.51,86.26,509.6,0.1424,0.2517,0.0942,0.06042,0.2727,0.1036 +Benign,9.0,14.4,56.36,246.3,0.07005,0.03116,0.003681,0.003472,0.1788,0.06833,0.1746,1.305,1.144,9.789,0.007389,0.004883,0.003681,0.003472,0.02701,0.002153,9.699,20.07,60.9,285.5,0.09861,0.05232,0.01472,0.01389,0.2991,0.07804 +Benign,13.94,13.17,90.31,594.2,0.1248,0.09755,0.101,0.06615,0.1976,0.06457,0.5461,2.635,4.091,44.74,0.01004,0.03247,0.04763,0.02853,0.01715,0.005528,14.62,15.38,94.52,653.3,0.1394,0.1364,0.1559,0.1015,0.216,0.07253 +Benign,14.97,19.76,95.5,690.2,0.08421,0.05352,0.01947,0.01939,0.1515,0.05266,0.184,1.065,1.286,16.64,0.003634,0.007983,0.008268,0.006432,0.01924,0.00152,15.98,25.82,102.3,782.1,0.1045,0.09995,0.0775,0.05754,0.2646,0.06085 +Benign,10.05,17.53,64.41,310.8,0.1007,0.07326,0.02511,0.01775,0.189,0.06331,0.2619,2.015,1.778,16.85,0.007803,0.01449,0.0169,0.008043,0.021,0.002778,11.16,26.84,71.98,384.0,0.1402,0.1402,0.1055,0.06499,0.2894,0.07664 +Benign,12.22,20.04,79.47,453.1,0.1096,0.1152,0.08175,0.02166,0.2124,0.06894,0.1811,0.7959,0.9857,12.58,0.006272,0.02198,0.03966,0.009894,0.0132,0.003813,13.16,24.17,85.13,515.3,0.1402,0.2315,0.3535,0.08088,0.2709,0.08839 +Malignant,14.68,20.13,94.74,684.5,0.09867,0.072,0.07395,0.05259,0.1586,0.05922,0.4727,1.24,3.195,45.4,0.005718,0.01162,0.01998,0.01109,0.0141,0.002085,19.07,30.88,123.4,1138.0,0.1464,0.1871,0.2914,0.1609,0.3029,0.08216 
+Malignant,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902 +Benign,11.43,15.39,73.06,399.8,0.09639,0.06889,0.03503,0.02875,0.1734,0.05865,0.1759,0.9938,1.143,12.67,0.005133,0.01521,0.01434,0.008602,0.01501,0.001588,12.32,22.02,79.93,462.0,0.119,0.1648,0.1399,0.08476,0.2676,0.06765 +Malignant,19.55,28.77,133.6,1207.0,0.0926,0.2063,0.1784,0.1144,0.1893,0.06232,0.8426,1.199,7.158,106.4,0.006356,0.04765,0.03863,0.01519,0.01936,0.005252,25.05,36.27,178.6,1926.0,0.1281,0.5329,0.4251,0.1941,0.2818,0.1005 +Benign,9.72,18.22,60.73,288.1,0.0695,0.02344,0.0,0.0,0.1653,0.06447,0.3539,4.885,2.23,21.69,0.001713,0.006736,0.0,0.0,0.03799,0.001688,9.968,20.83,62.25,303.8,0.07117,0.02729,0.0,0.0,0.1909,0.06559 +Malignant,13.28,20.28,87.32,545.2,0.1041,0.1436,0.09847,0.06158,0.1974,0.06782,0.3704,0.8249,2.427,31.33,0.005072,0.02147,0.02185,0.00956,0.01719,0.003317,17.38,28.0,113.1,907.2,0.153,0.3724,0.3664,0.1492,0.3739,0.1027 +Malignant,14.25,22.15,96.42,645.7,0.1049,0.2008,0.2135,0.08653,0.1949,0.07292,0.7036,1.268,5.373,60.78,0.009407,0.07056,0.06899,0.01848,0.017,0.006113,17.67,29.51,119.1,959.5,0.164,0.6247,0.6922,0.1785,0.2844,0.1132 +Malignant,18.77,21.43,122.9,1092.0,0.09116,0.1402,0.106,0.0609,0.1953,0.06083,0.6422,1.53,4.369,88.25,0.007548,0.03897,0.03914,0.01816,0.02168,0.004445,24.54,34.37,161.1,1873.0,0.1498,0.4827,0.4634,0.2048,0.3679,0.0987 +Benign,11.26,19.83,71.3,388.1,0.08511,0.04413,0.005067,0.005664,0.1637,0.06343,0.1344,1.083,0.9812,9.332,0.0042,0.0059,0.003846,0.004065,0.01487,0.002295,11.93,26.43,76.38,435.9,0.1108,0.07723,0.02533,0.02832,0.2557,0.07613 +Benign,7.691,25.44,48.34,170.4,0.08668,0.1199,0.09252,0.01364,0.2037,0.07751,0.2196,1.479,1.445,11.73,0.01547,0.06457,0.09252,0.01364,0.02105,0.007551,8.678,31.89,54.49,223.6,0.1596,0.3064,0.3393,0.05,0.279,0.1066 
+Malignant,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072 +Malignant,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,0.4467,0.7732,3.18,53.91,0.004314,0.01382,0.02254,0.01039,0.01369,0.002179,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368 +Benign,13.05,18.59,85.09,512.0,0.1082,0.1304,0.09603,0.05603,0.2035,0.06501,0.3106,1.51,2.59,21.57,0.007807,0.03932,0.05112,0.01876,0.0286,0.005715,14.19,24.85,94.22,591.2,0.1343,0.2658,0.2573,0.1258,0.3113,0.08317 +Benign,14.92,14.93,96.45,686.9,0.08098,0.08549,0.05539,0.03221,0.1687,0.05669,0.2446,0.4334,1.826,23.31,0.003271,0.0177,0.0231,0.008399,0.01148,0.002379,17.18,18.22,112.0,906.6,0.1065,0.2791,0.3151,0.1147,0.2688,0.08273 +Benign,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,0.06072,0.3384,1.343,1.851,26.33,0.01127,0.03498,0.02187,0.01965,0.0158,0.003442,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784 +Malignant,13.82,24.49,92.33,595.9,0.1162,0.1681,0.1357,0.06759,0.2275,0.07237,0.4751,1.528,2.974,39.05,0.00968,0.03856,0.03476,0.01616,0.02434,0.006995,16.01,32.94,106.0,788.0,0.1794,0.3966,0.3381,0.1521,0.3651,0.1183 +Benign,10.25,16.18,66.52,324.2,0.1061,0.1111,0.06726,0.03965,0.1743,0.07279,0.3677,1.471,1.597,22.68,0.01049,0.04265,0.04004,0.01544,0.02719,0.007596,11.28,20.61,71.53,390.4,0.1402,0.236,0.1898,0.09744,0.2608,0.09702 +Benign,14.05,27.15,91.38,600.4,0.09929,0.1126,0.04462,0.04304,0.1537,0.06171,0.3645,1.492,2.888,29.84,0.007256,0.02678,0.02071,0.01626,0.0208,0.005304,15.3,33.17,100.2,706.7,0.1241,0.2264,0.1326,0.1048,0.225,0.08321 +Benign,11.41,14.92,73.53,402.0,0.09059,0.08155,0.06181,0.02361,0.1167,0.06217,0.3344,1.108,1.902,22.77,0.007356,0.03728,0.05915,0.01712,0.02165,0.004784,12.37,17.7,79.12,467.2,0.1121,0.161,0.1648,0.06296,0.1811,0.07427 
+Benign,13.71,18.68,88.73,571.0,0.09916,0.107,0.05385,0.03783,0.1714,0.06843,0.3191,1.249,2.284,26.45,0.006739,0.02251,0.02086,0.01352,0.0187,0.003747,15.11,25.63,99.43,701.9,0.1425,0.2566,0.1935,0.1284,0.2849,0.09031 +Benign,12.3,19.02,77.88,464.4,0.08313,0.04202,0.007756,0.008535,0.1539,0.05945,0.184,1.532,1.199,13.24,0.007881,0.008432,0.007004,0.006522,0.01939,0.002222,13.35,28.46,84.53,544.3,0.1222,0.09052,0.03619,0.03983,0.2554,0.07207 +Benign,11.3,18.19,73.93,389.4,0.09592,0.1325,0.1548,0.02854,0.2054,0.07669,0.2428,1.642,2.369,16.39,0.006663,0.05914,0.0888,0.01314,0.01995,0.008675,12.58,27.96,87.16,472.9,0.1347,0.4848,0.7436,0.1218,0.3308,0.1297 +Benign,8.598,20.98,54.66,221.8,0.1243,0.08963,0.03,0.009259,0.1828,0.06757,0.3582,2.067,2.493,18.39,0.01193,0.03162,0.03,0.009259,0.03357,0.003048,9.565,27.04,62.06,273.9,0.1639,0.1698,0.09001,0.02778,0.2972,0.07712 +Malignant,12.83,22.33,85.26,503.2,0.1088,0.1799,0.1695,0.06861,0.2123,0.07254,0.3061,1.069,2.257,25.13,0.006983,0.03858,0.04683,0.01499,0.0168,0.005617,15.2,30.15,105.3,706.0,0.1777,0.5343,0.6282,0.1977,0.3407,0.1243 +Malignant,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678 +Malignant,10.95,21.35,71.9,371.1,0.1227,0.1218,0.1044,0.05669,0.1895,0.0687,0.2366,1.428,1.822,16.97,0.008064,0.01764,0.02595,0.01037,0.01357,0.00304,12.84,35.34,87.22,514.0,0.1909,0.2698,0.4023,0.1424,0.2964,0.09606 +Benign,12.89,15.7,84.08,516.6,0.07818,0.0958,0.1115,0.0339,0.1432,0.05935,0.2913,1.389,2.347,23.29,0.006418,0.03961,0.07927,0.01774,0.01878,0.003696,13.9,19.69,92.12,595.6,0.09926,0.2317,0.3344,0.1017,0.1999,0.07127 +Malignant,20.73,31.12,135.7,1419.0,0.09469,0.1143,0.1367,0.08646,0.1769,0.05674,1.172,1.617,7.749,199.7,0.004551,0.01478,0.02143,0.00928,0.01367,0.002299,32.49,47.16,214.0,3432.0,0.1401,0.2644,0.3442,0.1659,0.2868,0.08218 
+Malignant,14.42,19.77,94.48,642.5,0.09752,0.1141,0.09388,0.05839,0.1879,0.0639,0.2895,1.851,2.376,26.85,0.008005,0.02895,0.03321,0.01424,0.01462,0.004452,16.33,30.86,109.5,826.4,0.1431,0.3026,0.3194,0.1565,0.2718,0.09353 +Malignant,15.46,11.89,102.5,736.9,0.1257,0.1555,0.2032,0.1097,0.1966,0.07069,0.4209,0.6583,2.805,44.64,0.005393,0.02321,0.04303,0.0132,0.01792,0.004168,18.79,17.04,125.0,1102.0,0.1531,0.3583,0.583,0.1827,0.3216,0.101 +Malignant,11.84,18.7,77.93,440.6,0.1109,0.1516,0.1218,0.05182,0.2301,0.07799,0.4825,1.03,3.475,41.0,0.005551,0.03414,0.04205,0.01044,0.02273,0.005667,16.82,28.12,119.4,888.7,0.1637,0.5775,0.6956,0.1546,0.4761,0.1402 +Benign,8.571,13.1,54.53,221.3,0.1036,0.07632,0.02565,0.0151,0.1678,0.07126,0.1267,0.6793,1.069,7.254,0.007897,0.01762,0.01801,0.00732,0.01592,0.003925,9.473,18.45,63.3,275.6,0.1641,0.2235,0.1754,0.08512,0.2983,0.1049 +Benign,10.03,21.28,63.19,307.3,0.08117,0.03912,0.00247,0.005159,0.163,0.06439,0.1851,1.341,1.184,11.6,0.005724,0.005697,0.002074,0.003527,0.01445,0.002411,11.11,28.94,69.92,376.3,0.1126,0.07094,0.01235,0.02579,0.2349,0.08061 +Benign,11.51,23.93,74.52,403.5,0.09261,0.1021,0.1112,0.04105,0.1388,0.0657,0.2388,2.904,1.936,16.97,0.0082,0.02982,0.05738,0.01267,0.01488,0.004738,12.48,37.16,82.28,474.2,0.1298,0.2517,0.363,0.09653,0.2112,0.08732 +Benign,13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766,0.2699,0.7886,2.058,23.56,0.008462,0.0146,0.02387,0.01315,0.0198,0.0023,15.11,19.26,99.7,711.2,0.144,0.1773,0.239,0.1288,0.2977,0.07259 +Benign,11.68,16.17,75.49,420.5,0.1128,0.09263,0.04279,0.03132,0.1853,0.06401,0.3713,1.154,2.554,27.57,0.008998,0.01292,0.01851,0.01167,0.02152,0.003213,13.32,21.59,86.57,549.8,0.1526,0.1477,0.149,0.09815,0.2804,0.08024 +Malignant,19.18,22.49,127.5,1148.0,0.08523,0.1428,0.1114,0.06772,0.1767,0.05529,0.4357,1.073,3.833,54.22,0.005524,0.03698,0.02706,0.01221,0.01415,0.003397,23.36,32.06,166.4,1688.0,0.1322,0.5601,0.3865,0.1708,0.3193,0.09221 
+Benign,12.34,14.95,78.29,469.1,0.08682,0.04571,0.02109,0.02054,0.1571,0.05708,0.3833,0.9078,2.602,30.15,0.007702,0.008491,0.01307,0.0103,0.0297,0.001432,13.18,16.85,84.11,533.1,0.1048,0.06744,0.04921,0.04793,0.2298,0.05974 +Malignant,20.44,21.78,133.8,1293.0,0.0915,0.1131,0.09799,0.07785,0.1618,0.05557,0.5781,0.9168,4.218,72.44,0.006208,0.01906,0.02375,0.01461,0.01445,0.001906,24.31,26.37,161.2,1780.0,0.1327,0.2376,0.2702,0.1765,0.2609,0.06735 +Benign,14.04,15.98,89.78,611.2,0.08458,0.05895,0.03534,0.02944,0.1714,0.05898,0.3892,1.046,2.644,32.74,0.007976,0.01295,0.01608,0.009046,0.02005,0.00283,15.66,21.58,101.2,750.0,0.1195,0.1252,0.1117,0.07453,0.2725,0.07234 +Benign,9.755,28.2,61.68,290.9,0.07984,0.04626,0.01541,0.01043,0.1621,0.05952,0.1781,1.687,1.243,11.28,0.006588,0.0127,0.0145,0.006104,0.01574,0.002268,10.67,36.92,68.03,349.9,0.111,0.1109,0.0719,0.04866,0.2321,0.07211 +Benign,12.34,12.27,78.94,468.5,0.09003,0.06307,0.02958,0.02647,0.1689,0.05808,0.1166,0.4957,0.7714,8.955,0.003681,0.009169,0.008732,0.00574,0.01129,0.001366,13.61,19.27,87.22,564.9,0.1292,0.2074,0.1791,0.107,0.311,0.07592 +Malignant,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504,1.214,2.188,8.077,106.0,0.006883,0.01094,0.01818,0.01917,0.007882,0.001754,14.99,25.2,95.54,698.8,0.09387,0.05131,0.02398,0.02899,0.1565,0.05504 +Benign,10.57,20.22,70.15,338.3,0.09073,0.166,0.228,0.05941,0.2188,0.0845,0.1115,1.231,2.363,7.228,0.008499,0.07643,0.1535,0.02919,0.01617,0.0122,10.85,22.82,76.51,351.9,0.1143,0.3619,0.603,0.1465,0.2597,0.12 +Benign,13.14,20.74,85.98,536.9,0.08675,0.1089,0.1085,0.0351,0.1562,0.0602,0.3152,0.7884,2.312,27.4,0.007295,0.03179,0.04615,0.01254,0.01561,0.00323,14.8,25.46,100.9,689.1,0.1351,0.3549,0.4504,0.1181,0.2563,0.08174 \ No newline at end of file diff --git a/tests/test_data_config.csv b/tests/test_data_config.csv new file mode 100644 index 0000000..93225de --- /dev/null +++ b/tests/test_data_config.csv @@ -0,0 +1,32 @@ 
+column,type,min,max,category,max_nullable +diagnosis,str,,,"Malignant,Benign",0 +mean_radius,float,6,40,,0.1 +mean_texture,float,9,50,,0.1 +mean_perimeter,float,40,260,,0.1 +mean_area,float,140,4300,,0.1 +mean_smoothness,float,0,1,,0.1 +mean_compactness,float,0,2,,0.1 +mean_concavity,float,0,2,,0.1 +mean_concave_points,float,0,1,,0.1 +mean_symmetry,float,0,1,,0.1 +mean_fractal_dimension,float,0,1,,0.1 +se_radius,float,0,3,,0.1 +se_texture,float,0,5,,0.1 +se_perimeter,float,0,22,,0.1 +se_area,float,6,550,,0.1 +se_smoothness,float,0,1,,0.1 +se_compactness,float,0,1,,0.1 +se_concavity,float,0,1,,0.1 +se_concave_points,float,0,1,,0.1 +se_symmetry,float,0,1,,0.1 +se_fractal_dimension,float,0,1,,0.1 +max_radius,float,6,40,,0.1 +max_texture,float,9,50,,0.1 +max_perimeter,float,40,260,,0.1 +max_area,float,140,4300,,0.1 +max_smoothness,float,0,1,,0.1 +max_compactness,float,0,2,,0.1 +max_concavity,float,0,2,,0.1 +max_concave_points,float,0,1,,0.1 +max_symmetry,float,0,1,,0.1 +max_fractal_dimension,float,0,1,,0.1 \ No newline at end of file diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py new file mode 100644 index 0000000..15606ed --- /dev/null +++ b/tests/test_validate_data.py @@ -0,0 +1,154 @@ +import pytest +import os +import pandas as pd +import pandera as pa +import numpy as np +from pandera import Column, DataFrameSchema +import sys +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) +from src.validate_data import build_schema_from_csv, validate_data +from src.clean_data import extract_column_name + +# Test setup for build_schema_from_csv + +invalid_data_config1 = pd.DataFrame({ + 'column':['diagnosis','mean_radius'], + 'type':['str','float'], + 'min':[None,4], + 'max':[None,30] +}) +invalid_data_config2 = pd.DataFrame({ + 'column':['diagnosis','mean_texture'], + 'type':['str','float'], + 'min':[None,4], + 'max':[None,30], + 'category':["Malignant,Benign",None], + 'max_nullable':0.1 +}) +valid_data_config = pd.DataFrame({ + 
'column':['diagnosis','mean_radius'], + 'type':['str','float'], + 'min':[None,6], + 'max':[None,40], + 'category':["Malignant,Benign",None], + 'max_nullable':0.1 +}) + +valid_colnames = ['diagnosis','mean_radius'] +invalid_data_type = [1, 2, 3, 4, 5] + +# Tests for build_schema_from_csv + +# test build_schema_from_csv function throws an error +# if the data_config is not a dataframe +def test_build_schema_from_csv_error_on_wrong_data_config_type(): + with pytest.raises(TypeError, match="data_config must be a pandas dataframe."): + build_schema_from_csv(data_config=invalid_data_type, expected_columns=valid_colnames) + +# if pandas dataframe doesn't have following columns: column,type,max,min,category +def test_build_schema_from_csv_error_on_incorrect_columns(): + with pytest.raises(ValueError, match=f"The data_config must have following columns: 'column', 'type', 'min', 'max', 'category'."): + build_schema_from_csv(data_config=invalid_data_config1, expected_columns=valid_colnames) + +# if the values of 'column' match the column names extracted from name file +def test_build_schedma_from_csv_error_on_mismatch_column_names(): + with pytest.raises(ValueError, match="Column names in the config file do not match the expected columns."): + build_schema_from_csv(data_config=invalid_data_config2, expected_columns=valid_colnames) + +# Tests setup for validate_data function + +data_config_df = pd.read_csv('tests/test_data_config.csv') +colnames = extract_column_name('tests/test_wdbc.names')[1:] #removing column name: 'id' + +valid_schema = build_schema_from_csv(data_config=data_config_df,expected_columns=colnames) +invalid_schema = [1] + +valid_data = pd.read_csv('tests/test_cleaned_data.csv', nrows=3) +empty_data_frame = valid_data.copy().iloc[0:0] + +# Setup list of invalid data cases +invalid_data_cases = [] + +# Case: missing "diagnosis" column +case_missing_class_col = valid_data.copy() +case_missing_class_col = case_missing_class_col.drop("diagnosis", axis=1) # drop 
class column +invalid_data_cases.append((case_missing_class_col, "`diagnosis` from DataFrameSchema")) + +# Case: label in "diagnosis" column encoded as 0 and 1, instead of Benign and Malignant +case_wrong_label_type = valid_data.copy() +case_wrong_label_type["diagnosis"] = case_wrong_label_type["diagnosis"].map({'Benign': 0, 'Malignant': 1}) +invalid_data_cases.append((case_wrong_label_type, "Check incorrect type for'diagnosis' values is missing or incorrect")) + +# Case: wrong string value/category in "diagnosis" column +case_wrong_category_label = valid_data.copy() +case_wrong_category_label.loc[0, "diagnosis"] = "benign" +invalid_data_cases.append((case_wrong_category_label, "Check absent or incorrect for wrong string value/category in 'diagnosis' column")) + +# Case: missing value in "diagnosis" column +case_missing_class = valid_data.copy() +case_missing_class.loc[0, "diagnosis"] = None +invalid_data_cases.append((case_missing_class, "Check absent or incorrect for missing/null 'diagnosis' value")) + +# Case: missing numeric columns (one for each numeric column) where column is missing +numeric_columns = valid_data.select_dtypes(include=np.number).columns +for col in numeric_columns: + case_missing_col = valid_data.copy() + case_missing_col = case_missing_col.drop(col, axis=1) # drop column + invalid_data_cases.append((case_missing_col, f"'{col}' is missing from DataFrameSchema")) + +# Generate 30 cases (one for each numeric column) where data is out of range (too large) +numeric_columns = valid_data.select_dtypes(include=np.number).columns +for col in numeric_columns: + case_too_big = valid_data.copy() + case_too_big[col] = case_too_big[col] + 5000 # Adding an arbitrary value to make it out of range + invalid_data_cases.append((case_too_big, f"Check absent or incorrect for numeric values in '{col}' being too large")) + +# Generate 30 cases (one for each numeric column) where data is out of range (too small) +numeric_columns = 
valid_data.select_dtypes(include=np.number).columns +for col in numeric_columns: + case_too_small = valid_data.copy() + case_too_small[col] = case_too_small[col] - 1000 # Adding an arbitrary value to make it out of range + invalid_data_cases.append((case_too_small, f"Check absent or incorrect for numeric values in '{col}' being too small")) + +# Generate 30 cases (one for each numeric column) where data is the wrong type +numeric_columns = valid_data.select_dtypes(include=np.number).columns +for col in numeric_columns: + case_wrong_type = valid_data.copy() + case_wrong_type[col] = case_wrong_type[col].fillna(0.0).astype(int) # convert from float to int + invalid_data_cases.append((case_wrong_type, f"Check incorrect type for float values in '{col}' is missing or incorrect")) + +# Case: duplicate observations +case_duplicate = valid_data.copy() +case_duplicate = pd.concat([case_duplicate, case_duplicate.iloc[[0], :]], ignore_index=True) +invalid_data_cases.append((case_duplicate, f"Check absent or incorrect for duplicate rows")) + +# Case: entire missing observation +case_missing_obs = valid_data.copy() +nan_row = pd.DataFrame([[np.nan] * (case_missing_obs.shape[1] - 1) + [np.nan]], columns=case_missing_obs.columns) +case_missing_obs = pd.concat([case_missing_obs, nan_row], ignore_index=True) +invalid_data_cases.append((case_missing_obs, f"Check absent or incorrect for missing observations (e.g., a row of all missing values)")) + + +# Tests for validate_data function + +# test build_schema_from_csv function throws an error +# if the schema is invalid pandera dataframe schema +def test_validate_data_error_on_invalid_schema_type(): + with pytest.raises(TypeError, match='schema must be a pandera dataframe schema.'): + validate_data(schema=invalid_schema, dataframe=valid_data) + +# if the dataframe is not a pandas data frame +def test_validate_data_error_on_invalid_dataframe_type(): + with pytest.raises(TypeError, match='dataframe must be a pandas data frame.'): + 
validate_data(schema=valid_schema, dataframe=invalid_data_type) + +# if the dataframe has no observations +def test_validate_data_error_on_empty_dataframe_type(): + with pytest.raises(ValueError): + validate_data(schema=valid_schema, dataframe=empty_data_frame) + +# if the dataframe has invalid data +@pytest.mark.parametrize("invalid_data, description", invalid_data_cases) +def test_valid_w_invalid_data(invalid_data, description): + with pytest.raises(pa.errors.SchemaErrors): + validate_data(schema=valid_schema, dataframe=invalid_data) diff --git a/tests/test_wdbc.data b/tests/test_wdbc.data new file mode 100644 index 0000000..2daaa02 --- /dev/null +++ b/tests/test_wdbc.data @@ -0,0 +1,10 @@ +842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189 +842517,M,20.57,17.77,132.9,1326,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956,0.1238,0.1866,0.2416,0.186,0.275,0.08902 +84300903,M,19.69,21.25,130,1203,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709,0.1444,0.4245,0.4504,0.243,0.3613,0.08758 +84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173 +84358402,M,20.29,14.34,135.1,1297,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575,0.1374,0.205,0.4,0.1625,0.2364,0.07678 +843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,0.3345,0.8902,2.217,27.19,0.00751,0.03345,0.03672,0.01137,0.02165,0.005082,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244 
+844359,M,18.25,19.98,119.6,1040,0.09463,0.109,0.1127,0.074,0.1794,0.05742,0.4467,0.7732,3.18,53.91,0.004314,0.01382,0.02254,0.01039,0.01369,0.002179,22.88,27.66,153.2,1606,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368 +84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,0.5835,1.377,3.856,50.96,0.008805,0.03029,0.02488,0.01448,0.01486,0.005412,17.06,28.14,110.6,897,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151 +844981,M,13,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,0.3063,1.002,2.406,24.32,0.005731,0.03502,0.03553,0.01226,0.02143,0.003749,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072 +84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,0.2976,1.599,2.039,23.94,0.007149,0.07217,0.07743,0.01432,0.01789,0.01008,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075 diff --git a/tests/test_wdbc.names b/tests/test_wdbc.names new file mode 100644 index 0000000..3af8990 --- /dev/null +++ b/tests/test_wdbc.names @@ -0,0 +1,140 @@ +1. Title: Wisconsin Diagnostic Breast Cancer (WDBC) + +2. Source Information + +a) Creators: + + Dr. William H. Wolberg, General Surgery Dept., University of + Wisconsin, Clinical Sciences Center, Madison, WI 53792 + wolberg@eagle.surgery.wisc.edu + + W. Nick Street, Computer Sciences Dept., University of + Wisconsin, 1210 West Dayton St., Madison, WI 53706 + street@cs.wisc.edu 608-262-6619 + + Olvi L. Mangasarian, Computer Sciences Dept., University of + Wisconsin, 1210 West Dayton St., Madison, WI 53706 + olvi@cs.wisc.edu + +b) Donor: Nick Street + +c) Date: November 1995 + +3. Past Usage: + +first usage: + + W.N. Street, W.H. Wolberg and O.L. Mangasarian + Nuclear feature extraction for breast tumor diagnosis. + IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science + and Technology, volume 1905, pages 861-870, San Jose, CA, 1993. + +OR literature: + + O.L. Mangasarian, W.N. Street and W.H. Wolberg. 
+ Breast cancer diagnosis and prognosis via linear programming. + Operations Research, 43(4), pages 570-577, July-August 1995. + +Medical literature: + + W.H. Wolberg, W.N. Street, and O.L. Mangasarian. + Machine learning techniques to diagnose breast cancer from + fine-needle aspirates. + Cancer Letters 77 (1994) 163-171. + + W.H. Wolberg, W.N. Street, and O.L. Mangasarian. + Image analysis and machine learning applied to breast cancer + diagnosis and prognosis. + Analytical and Quantitative Cytology and Histology, Vol. 17 + No. 2, pages 77-87, April 1995. + + W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. + Computerized breast cancer diagnosis and prognosis from fine + needle aspirates. + Archives of Surgery 1995;130:511-516. + + W.H. Wolberg, W.N. Street, D.M. Heisey, and O.L. Mangasarian. + Computer-derived nuclear features distinguish malignant from + benign breast cytology. + Human Pathology, 26:792--796, 1995. + +See also: + http://www.cs.wisc.edu/~olvi/uwmp/mpml.html + http://www.cs.wisc.edu/~olvi/uwmp/cancer.html + +Results: + + - predicting field 2, diagnosis: B = benign, M = malignant + - sets are linearly separable using all 30 input features + - best predictive accuracy obtained using one separating plane + in the 3-D space of Worst Area, Worst Smoothness and + Mean Texture. Estimated accuracy 97.5% using repeated + 10-fold crossvalidations. Classifier has correctly + diagnosed 176 consecutive new patients as of November + 1995. + +4. Relevant information + + Features are computed from a digitized image of a fine needle + aspirate (FNA) of a breast mass. They describe + characteristics of the cell nuclei present in the image. + A few of the images can be found at + http://www.cs.wisc.edu/~street/images/ + + Separating plane described above was obtained using + Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree + Construction Via Linear Programming." 
Proceedings of the 4th + Midwest Artificial Intelligence and Cognitive Science Society, + pp. 97-101, 1992], a classification method which uses linear + programming to construct a decision tree. Relevant features + were selected using an exhaustive search in the space of 1-4 + features and 1-3 separating planes. + + The actual linear program used to obtain the separating plane + in the 3-dimensional space is that described in: + [K. P. Bennett and O. L. Mangasarian: "Robust Linear + Programming Discrimination of Two Linearly Inseparable Sets", + Optimization Methods and Software 1, 1992, 23-34]. + + + This database is also available through the UW CS ftp server: + + ftp ftp.cs.wisc.edu + cd math-prog/cpo-dataset/machine-learn/WDBC/ + +5. Number of instances: 569 + +6. Number of attributes: 32 (ID, diagnosis, 30 real-valued input features) + +7. Attribute information + +1) ID number +2) Diagnosis (M = malignant, B = benign) +3-32) + +Ten real-valued features are computed for each cell nucleus: + + a) radius (mean of distances from center to points on the perimeter) + b) texture (standard deviation of gray-scale values) + c) perimeter + d) area + e) smoothness (local variation in radius lengths) + f) compactness (perimeter^2 / area - 1.0) + g) concavity (severity of concave portions of the contour) + h) concave points (number of concave portions of the contour) + i) symmetry + j) fractal dimension ("coastline approximation" - 1) + +Several of the papers listed above contain detailed descriptions of +how these features are computed. + +The mean, standard error, and "worst" or largest (mean of the three +largest values) of these features were computed for each image, +resulting in 30 features. For instance, field 3 is Mean Radius, field +13 is Radius SE, field 23 is Worst Radius. + +All feature values are recoded with four significant digits. + +8. Missing attribute values: none + +9. Class distribution: 357 benign, 212 malignant \ No newline at end of file