diff --git a/examples/example_files/dataset_categorical_missing.csv b/examples/example_files/dataset_categorical_missing.csv new file mode 100644 index 0000000..6012798 --- /dev/null +++ b/examples/example_files/dataset_categorical_missing.csv @@ -0,0 +1,100 @@ +11.18628795,8.553093649,2.337681394,7.740221556,2.076186822,12.53763102,a,Class 1 +10.14583771,8.771100076,9.617580733,11.28898328,3.731375463,3.61496461,a,Class 1 +7.608378163,9.215208082,12.21721984,0.217171385,10.84786072,0.068879981,b,Class 1 +9.741381036,4.373282412,0.46661895,9.670759021,3.769897645,3.166427585,b,Class 1 +6.326460397,4.950181274,0.955970145,13.27044224,13.3803711,6.7042629,b,Class 2 +6.614221714,5.139908792,6.322918249,7.613060017,2.965112605,8.757576779,c,Class 1 +9.77079694,13.22809724,12.27910247,4.486662894,1.637543898,3.145057057,b,Class 2 +5.986399993,3.575034439,4.436729038,10.27403804,6.085388637,9.638152072,a,Class 2 +9.306449023,0.419825519,5.207460405,3.352657815,8.955085451,13.74251068,c,Class 2 +12.85278467,0.554461277,13.25029775,7.70837554,12.81365721,8.448635991,c,Class 2 +10.32440339,3.195964543,1.215275549,3.741461311,11.6736581,6.435247906,a,Class 1 +12.45890387,6.754763706,3.568497268,6.787252535,4.528132031,11.24413956,,Class 1 +4.92766004,8.894839385,1.599234587,6.491628957,1.003488457,8.990811618,a,Class 1 +14.62465143,,12.02718728,3.868350191,5.604315633,13.4574281,a,Class 1 +5.548176135,6.507710991,0.798813746,13.28445746,7.037019737,13.71370375,b,Class 2 +4.414357119,9.250023962,8.716711231,7.733484723,3.661143919,14.63698956,b,Class 1 +7.794274494,0.554776276,6.717665128,3.422141362,12.80249591,3.744667173,b,Class 2 +10.46207608,14.78604433,11.14501886,13.28194261,13.35036026,8.342502238,c,Class 1 +11.38516345,11.33272181,1.919660335,4.978216028,8.668419104,6.052792392,c,Class 2 +11.19640261,10.3436337,0.939034115,14.91069148,7.269366966,12.53406348,c,Class 2 +11.06549373,,13.96718884,12.53348993,4.476687297,11.87992708,a,Class 1 
+5.721763439,10.70136406,4.677923895,12.04602629,6.630499903,13.04574224,c,Class 2 +14.87203026,4.717515614,12.16090195,10.17484858,1.258457287,3.762734746,c,Class 1 +9.517250388,14.61073986,10.55186687,12.13409641,4.195938316,14.9085867,b,Class 2 +5.490151571,11.07922707,2.912349404,11.26243041,6.909836863,12.93169762,b,Class 1 +3.597959325,7.3606205,1.89533481,8.407778067,12.94742999,9.956797585,a,Class 2 +13.99187099,6.16144391,4.430074749,10.48992388,6.724889945,11.63545045,a,Class 1 +13.87167852,11.47473231,12.91040409,5.329482463,12.41092153,9.923540019,c,Class 2 +6.884021,0.536048784,13.77495679,6.51467553,4.70254023,8.780237509,,Class 2 +2.208914531,12.70665676,13.62555578,9.598180651,7.438779306,7.81610053,b,Class 1 +8.659531425,8.209053873,6.907242925,9.847209807,7.643627147,1.24454444,b,Class 1 +1.448257785,10.22497998,1.269324615,3.714269901,13.03906827,2.870250771,b,Class 1 +10.19241437,,8.886001739,5.828695389,2.605134041,1.19188785,b,Class 1 +7.777359497,10.96783191,4.890083745,5.284618971,4.411218163,8.605757632,b,Class 2 +1.056011622,7.844004878,10.65020289,4.234763934,6.43943205,1.262495126,a,Class 2 +7.648844009,10.14403542,9.539688734,13.66072313,0.330411845,5.610949661,a,Class 1 +4.321962749,5.604955856,7.525456962,2.795185293,0.557651224,9.096120183,c,Class 2 +7.580303996,14.13657189,2.208779404,12.65807527,8.616258995,14.2742891,c,Class 1 +6.617679318,12.17838447,6.219814209,9.278219597,2.627013838,10.26198055,a,Class 1 +10.42636852,11.37466476,1.605370071,13.38238859,13.4486372,0.658796404,a,Class 1 +10.52477845,5.275716432,11.83515271,0.617870822,0.921374579,8.348557261,b,Class 2 +12.42122246,0.012249697,7.74555252,12.02705019,3.442939685,7.110063876,b,Class 1 +2.721191099,14.56211777,12.2194075,8.457083772,1.843488398,8.775189039,b,Class 2 +11.93193151,7.265208519,0.45505228,4.217468632,13.6978792,11.24703349,c,Class 2 +0.493376888,0.414245824,5.492426678,3.926579473,5.14363276,9.3274729,b,Class 2 
+11.99067679,2.224771613,14.51607498,13.038479,2.048398725,10.21056055,a,Class 2 +4.39009476,2.715892095,13.65208099,1.276459275,3.1947636,5.738578547,c,Class 2 +9.517850998,7.870570236,11.66133708,3.158457987,0.994101959,1.760078291,c,Class 2 +1.67226824,1.257990444,13.35645397,6.432977593,8.173353149,10.47964661,a,Class 2 +11.40110217,3.755922456,14.78250639,9.12235283,5.463228968,8.004612121,a,Class 1 +0.51828403,7.467344863,0.403372329,1.324884922,6.204846153,0.397427501,a,Class 1 +8.56890712,3.700288257,14.23433924,1.836880065,0.168958671,1.260377664,a,Class 2 +0.927565538,0.256079044,8.244925899,10.78666638,13.47379713,0.009413535,b,Class 2 +11.12587642,0.512971929,10.37022985,9.927926894,8.924001776,5.446182158,b,Class 1 +8.296166587,3.881765358,12.50788295,6.751744679,3.270039419,13.16076438,b,Class 2 +5.342454277,14.06289262,6.052115238,4.60660751,14.92130785,9.251614117,c,Class 1 +10.76413906,3.108242418,7.407200789,0.124640979,6.315064545,6.974791847,c,Class 2 +11.60334067,7.869475964,14.06285885,3.010169778,0.21862987,10.7119562,c,Class 1 +0.675107878,10.62554075,9.16909931,1.51930099,9.054828927,8.018314854,a,Class 2 +0.136391516,12.15438651,13.10410369,4.54379884,1.467941336,3.708272962,c,Class 2 +11.20314646,7.917973833,7.205146518,14.47482833,5.385158132,3.962870806,c,Class 1 +12.96777011,7.276652989,12.46734536,8.774357457,14.49755617,1.021454967,b,Class 1 +7.259751863,5.37753719,8.753494011,1.105904802,7.423842186,7.060245922,b,Class 1 +5.550401633,10.28344926,8.849268232,10.35224505,11.42901447,2.015178403,a,Class 2 +8.724250626,5.144158413,8.881589983,5.654339781,3.348767179,7.567443724,a,Class 2 +1.505308287,8.327887318,5.967980754,5.861512631,1.942362782,12.08752455,c,Class 1 +11.95352828,13.83709019,1.484043207,14.9990425,0.358430191,0.936128377,b,Class 2 +14.1424292,5.653091086,14.75697191,6.534335531,14.59624216,4.217427045,b,Class 1 +2.096214566,13.41972927,5.026757888,10.15382225,10.69199037,8.119000359,b,Class 2 
+4.658969577,8.152082829,10.69897004,3.807611391,9.432866697,7.469063458,b,Class 1 +9.880365248,6.857500577,9.50270486,6.185811124,6.801593649,1.426651215,b,Class 2 +11.38053935,5.64968146,13.16726558,3.969547861,3.409401613,6.754962952,b,Class 1 +2.549597194,3.81373774,3.381883424,12.54165021,7.238285696,5.014469506,a,Class 1 +2.149956112,14.18695148,4.495586504,1.193989236,0.629565843,10.71726557,a,Class 1 +0.633065458,10.57661883,3.911208047,4.737683148,10.67249983,11.44130896,c,Class 2 +10.98958055,8.538690522,2.221702088,7.94460522,7.268542052,13.0506614,c,Class 2 +14.05448371,5.906069731,11.02070992,14.78464345,1.395098041,12.45034592,a,Class 1 +4.849203233,14.92593789,14.83374088,13.33589083,10.91265222,6.015994872,a,Class 2 +1.788538553,1.189933547,13.37927743,7.078881338,0.115268965,1.102757553,b,Class 1 +1.520260264,4.390949317,8.961363089,9.116191933,4.902286012,13.82917543,b,Class 2 +5.143515013,1.626830627,5.011771888,14.53607373,9.254769126,5.987742339,b,Class 2 +0.383485623,5.893120492,2.198084919,3.607295516,11.2909701,14.19259294,c,Class 1 +3.543625982,1.817300049,12.79701902,9.150819857,4.270171936,1.046802952,b,Class 2 +9.014121301,8.894615211,14.32697976,12.05396604,6.610724668,12.9453385,a,Class 1 +2.178293829,11.00240774,3.4661733,4.216419592,14.36522422,3.571201671,c,Class 1 +9.218901263,8.682081223,12.48795288,8.796277452,13.72799658,1.414017549,c,Class 2 +1.417376539,13.2588434,13.00750995,9.108292367,5.332117011,3.7214796,a,Class 2 +11.40541996,10.59274384,11.90631845,4.497592473,4.532009755,4.117336922,a,Class 1 +5.547732807,6.107428176,13.30160131,8.442144861,9.854871343,3.268384157,a,Class 2 +2.558435481,12.36056669,7.777967112,6.812644994,8.532351866,6.71817697,a,Class 2 +2.349328005,11.73919423,11.20515163,11.47196866,13.24600243,1.770939874,b,Class 1 +13.34706077,13.86142631,0.291296401,0.12119829,5.885044406,8.475403207,b,Class 2 +5.351503888,6.40942837,11.07531808,8.972571254,3.233818614,12.43439266,b,Class 2 
+6.693621558,13.96686031,1.475546478,12.35803005,0.873347546,0.688133753,c,Class 2 +10.48922559,6.646089272,7.4076759,7.873827219,5.742578275,1.806450848,c,Class 2 +1.365010518,0.840426698,6.044826791,12.33437799,5.33827304,14.55706457,c,Class 2 +6.145883127,12.20161505,1.162956248,11.67002394,6.279495076,5.709716727,a,Class 1 +12.99028641,3.448828215,4.946279072,10.87002826,14.83427318,9.154544604,c,Class 2 +1.109266891,2.564645156,10.64938657,7.677215295,8.625541169,8.960849049,c,Class 1 +6.891117595,13.9566784,0.952437927,6.585976751,13.16019122,7.78218351,b,Class 1 diff --git a/examples/factories.py b/examples/factories.py index a49901f..7ada8e8 100644 --- a/examples/factories.py +++ b/examples/factories.py @@ -3,6 +3,7 @@ from niaaml.preprocessing.feature_transform import FeatureTransformAlgorithmFactory from niaaml.fitness import FitnessFactory from niaaml.preprocessing.encoding import EncoderFactory +from niaaml.preprocessing.imputation import ImputerFactory """ In this example, we show how to use all of the implemented factories to create new object instances using their class names. 
You may also @@ -15,6 +16,7 @@ fta_factory = FeatureTransformAlgorithmFactory() f_factory = FitnessFactory() e_factory = EncoderFactory() +i_factory = ImputerFactory() # get an instance of the MultiLayerPerceptron class mlp = classifier_factory.get_result('MultiLayerPerceptron') @@ -31,4 +33,7 @@ #get an instance of the OneHotEncoder class ohe = e_factory.get_result('OneHotEncoder') -# variables mlp, pso, normalizer, precision and ohe contain instances of the classes with the passed names \ No newline at end of file +#get an instance of the SimpleImputer class +imp = i_factory.get_result('SimpleImputer') + +# variables mlp, pso, normalizer, precision, ohe and imp contain instances of the classes with the passed names \ No newline at end of file diff --git a/examples/feature_imputing.py b/examples/feature_imputing.py new file mode 100644 index 0000000..e2c3e7a --- /dev/null +++ b/examples/feature_imputing.py @@ -0,0 +1,23 @@ +from niaaml.preprocessing.imputation import SimpleImputer, impute_features +import os +from niaaml.data import CSVDataReader + +""" +In this example, we show how to individually use an implemented missing features' imputer and its methods. In this case we use SimpleImputer for demonstration, but +you can use any of the implemented imputers in the same way. 
+""" + +# prepare data reader using csv file +data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True) + +# instantiate SimpleImputer +si = SimpleImputer() + +# fit, transform and print to output the feature in the dataset (index 6) +features = data_reader.get_x() +si.fit(features[[6]]) +f = si.transform(features[[6]]) +print(f) + +# if you wish to get array of imputers for all of the features with missing values in a dataset (and transformed DataFrame of features), you may use the utility method impute_features +transformed_features, imputers = impute_features(features, 'SimpleImputer') \ No newline at end of file diff --git a/examples/optimize_run_pipeline_missing_values.py b/examples/optimize_run_pipeline_missing_values.py new file mode 100644 index 0000000..97e241b --- /dev/null +++ b/examples/optimize_run_pipeline_missing_values.py @@ -0,0 +1,47 @@ +from niaaml import Pipeline +from niaaml.classifiers import MultiLayerPerceptron +from niaaml.preprocessing.feature_selection import VarianceThreshold +from niaaml.preprocessing.feature_transform import Normalizer +from niaaml.data import CSVDataReader +from niaaml.preprocessing.encoding import encode_categorical_features +from niaaml.preprocessing.imputation import impute_features +import os +import numpy +import pandas + +""" +In this example, we show how to individually use the Pipeline class. You may use this if you want to test out a specific classification pipeline. +We will use a dataset that contains categorical and numerical features with missing values. 
+""" + +# prepare data reader using csv file +data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True) + +features = data_reader.get_x() + +# we use the utility method impute_features to get imputers for the features with missing values, but you may instantiate and fit +# imputers separately and pass them as a dictionary (as long as they are implemented as this framework suggests), with keys as column names or indices (if there is no header in the csv) +# there should be as many imputers as the features with missing values +# this example uses Simple Imputer +features, imputers = impute_features(features, 'SimpleImputer') + +# exactly the same goes for encoders +_, encoders = encode_categorical_features(features, 'OneHotEncoder') + +# instantiate a Pipeline object +pipeline = Pipeline( + feature_selection_algorithm=VarianceThreshold(), + feature_transform_algorithm=Normalizer(), + classifier=MultiLayerPerceptron(), + categorical_features_encoders=encoders, + imputers=imputers +) + +# run pipeline optimization process (returns fitness value, but sets the best parameters for classifier, feature selection algorithm and feature transform algorithm during the process) +pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 10, 50, 'ParticleSwarmAlgorithm', 'Accuracy') + +# run the pipeline using dummy data +# you could run the pipeline before the optimization process, but get wrong predictions as nothing in the pipeline is fit for the given dataset +predicted = pipeline.run(pandas.DataFrame([[10.32440339, 3.195964543, 1.215275549, 3.741461311, 11.6736581, 6.435247906, 'a']])) + +# pipeline variable contains Pipeline object that can be used for further classification, exported as an object (that can be later loaded and used) or exported as text file \ No newline at end of file diff --git a/examples/run_pipeline_optimizer_csv_data_missing.py 
b/examples/run_pipeline_optimizer_csv_data_missing.py new file mode 100644 index 0000000..b5a7412 --- /dev/null +++ b/examples/run_pipeline_optimizer_csv_data_missing.py @@ -0,0 +1,32 @@ +import os +from niaaml import PipelineOptimizer, Pipeline +from niaaml.data import CSVDataReader + +""" +In this example, we show how to use the PipelineOptimizer class. This example is using an instance of CSVDataReader. +The instantiated PipelineOptimizer will try and assemble the best pipeline with the components that are specified in its constructor. +We use a dataset with 1 categorical feature and missing values to demonstrate a use of PipelineOptimizer instance with automatic feature encoding and imputation. +""" + +# prepare data reader using csv file +data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True) + +# instantiate PipelineOptimizer that chooses among specified classifiers, feature selection algorithms and feature transform algorithms +# OneHotEncoder is used for encoding categorical features in this example +# SimpleImputer is used for imputing missing values in this example +pipeline_optimizer = PipelineOptimizer( + data=data_reader, + classifiers=['AdaBoost', 'Bagging', 'MultiLayerPerceptron', 'RandomForest', 'ExtremelyRandomizedTrees', 'LinearSVC'], + feature_selection_algorithms=['SelectKBest', 'SelectPercentile', 'ParticleSwarmOptimization', 'VarianceThreshold'], + feature_transform_algorithms=['Normalizer', 'StandardScaler'], + categorical_features_encoder='OneHotEncoder', + imputer='SimpleImputer' +) + +# runs the optimization process +# one of the possible pipelines in this case is: SelectPercentile -> Normalizer -> RandomForest +# returns the best found pipeline +# the chosen fitness function and optimization algorithm are Accuracy and Particle Swarm Algorithm +pipeline = pipeline_optimizer.run('Accuracy', 20, 20, 400, 400, 
'ParticleSwarmAlgorithm', 'ParticleSwarmAlgorithm') + +# pipeline variable contains Pipeline object that can be used for further classification, exported as an object (that can be later loaded and used) or exported as text file \ No newline at end of file diff --git a/niaaml/__init__.py b/niaaml/__init__.py index 299f0e4..54b4aff 100644 --- a/niaaml/__init__.py +++ b/niaaml/__init__.py @@ -27,4 +27,4 @@ ] __project__ = 'niaaml' -__version__ = '1.0.0rc1' +__version__ = '1.0.0rc2' diff --git a/niaaml/pipeline.py b/niaaml/pipeline.py index c25e53c..e66b495 100644 --- a/niaaml/pipeline.py +++ b/niaaml/pipeline.py @@ -34,7 +34,8 @@ class Pipeline: __classifier (Classifier): Classifier implementation. __selected_features_mask (Iterable[bool]): Mask of selected features during the feature selection process. __best_stats (OptimizationStats): Statistics of the most successful setup of parameters. - __categorical_features_encoders (Iterable[FeatureEncoder]): Actual instances of FeatureEncoder for all categorical features. + __categorical_features_encoders (Dict[FeatureEncoder]): Instances of FeatureEncoder for all categorical features. + __imputers (Dict[Imputer]): Dictionary of instances of Imputer for all columns that contained missing values during optimization process. __niapy_algorithm_utility (AlgorithmUtility): Class used for getting an optimiziation algorithm using its name. 
""" @@ -47,22 +48,25 @@ def __init__(self, **kwargs): self.__selected_features_mask = None self.__best_stats = None self.__categorical_features_encoders = None + self.__imputers = None self.__niapy_algorithm_utility = AlgorithmUtility() self._set_parameters(**kwargs) - def _set_parameters(self, classifier, feature_selection_algorithm=None, feature_transform_algorithm=None, categorical_features_encoders = None, **kwargs): + def _set_parameters(self, classifier, feature_selection_algorithm=None, feature_transform_algorithm=None, categorical_features_encoders = None, imputers = None, **kwargs): r"""Set the parameters/arguments of the task. Arguments: feature_selection_algorithm (Optional[FeatureSelectionAlgorithm]): Feature selection algorithm implementation. feature_transform_algorithm (Optional[FeatureTransformAlgorithm]): Feature transform algorithm implementation. classifier (Classifier): Classifier implementation. - categorical_features_encoders (Iterable[FeatureEncoders]): Actual instances of FeatureEncoder for all categorical features. + categorical_features_encoders (Dict[FeatureEncoders]): Actual instances of FeatureEncoder for all categorical features. + imputers (Dict[Imputer]): Instances of Imputer for all features that contained missing values during optimization process. """ self.__feature_selection_algorithm = feature_selection_algorithm self.__feature_transform_algorithm = feature_transform_algorithm self.__classifier = classifier self.__categorical_features_encoders = categorical_features_encoders + self.__imputers = imputers def get_feature_selection_algorithm(self): r"""Get deep copy of the feature selection algorithm. @@ -122,9 +126,14 @@ def set_stats(self, value): self.__best_stats = value def set_categorical_features_encoders(self, value): - r"""Set selected features mask. + r"""Set categorical features' encoders. """ self.__categorical_features_encoders = value + + def set_imputers(self, value): + r"""Set imputers. 
+ """ + self.__imputers = value def optimize(self, x, y, population_size, number_of_evaluations, optimization_algorithm, fitness_function): r"""Optimize pipeline's hyperparameters. @@ -141,17 +150,19 @@ def optimize(self, x, y, population_size, number_of_evaluations, optimization_al float: Best fitness value found in optimization process. """ + if self.__imputers is not None: + for key in self.__imputers: + x.loc[:, key] = self.__imputers[key].transform(x[[key]]) + if self.__categorical_features_encoders is not None: - types = x.dtypes to_drop = [] enc_features = pd.DataFrame() - ind = 0 - for i in range(len(types)): - if types[i] != np.dtype('float64'): - tr = self.__categorical_features_encoders[ind].transform(x[[i]]) - to_drop.append(i) - enc_features = pd.concat([enc_features, tr], axis=1) - ind += 1 + cols = [col for col in x.columns if x[col].dtype != np.dtype('float64') and x[col].dtype != np.dtype('int64')] + for c in cols: + self.__categorical_features_encoders[c].fit(x[[c]]) + tr = self.__categorical_features_encoders[c].transform(x[[c]]) + to_drop.append(c) + enc_features = pd.concat([enc_features, tr], axis=1) x = x.drop(to_drop, axis=1) x = pd.concat([x, enc_features], axis=1) @@ -183,17 +194,18 @@ def run(self, x): Returns: pandas.core.series.Series: n predicted classes of the samples in the x array. 
""" + if self.__imputers is not None: + for key in self.__imputers: + x.loc[:, key] = self.__imputers[key].transform(x[[key]]) + if self.__categorical_features_encoders is not None: - types = x.dtypes to_drop = [] enc_features = pd.DataFrame() - ind = 0 - for i in range(len(types)): - if types[i] != np.dtype('float64'): - tr = self.__categorical_features_encoders[ind].transform(x[[i]]) - to_drop.append(i) - enc_features = pd.concat([enc_features, tr], axis=1) - ind += 1 + cols = [col for col in x.columns if x[col].dtype != np.dtype('float64') and x[col].dtype != np.dtype('int64')] + for c in cols: + tr = self.__categorical_features_encoders[c].transform(x[[c]]) + to_drop.append(c) + enc_features = pd.concat([enc_features, tr], axis=1) x = x.drop(to_drop, axis=1) x = pd.concat([x, enc_features], axis=1) @@ -266,14 +278,21 @@ def to_string(self): stats_string = '\t' + self.__best_stats.to_string().replace('\n', '\n\t') if self.__best_stats is not None else '\tStatistics is not available.' features_string = '\t' + str(self.__selected_features_mask) if self.__selected_features_mask is not None else '\tFeature selection result is not available.' 
+ imputers_string = '' + if self.__imputers is not None: + imputers_string += 'Missing features\' imputers (feature\'s name or index: imputer\'s name):\n' + for key in self.__imputers: + imputers_string += '\t* ' + str(key) + ': ' + self.__imputers[key].to_string() + '\n' + imputers_string += '\n' + encoders_string = '' if self.__categorical_features_encoders is not None: - encoders_string += 'Categorical features encoders (in order):\n' - for i in range(len(self.__categorical_features_encoders)): - encoders_string += '\t* ' + self.__categorical_features_encoders[i].to_string() + '\n' + encoders_string += 'Categorical features\' encoders (feature\'s name or index: encoder\'s name):\n' + for key in self.__categorical_features_encoders: + encoders_string += '\t* ' + str(key) + ': ' + self.__categorical_features_encoders[key].to_string() + '\n' encoders_string += '\n' - return 'Classifier:\n{classifier}\n\nFeature selection algorithm:\n{fsa}\n\nFeature transform algorithm:\n{fta}\n\nMask of selected features (True if selected, False if not):\n{feat}\n\n{enc}Statistics:\n{stats}'.format(classifier=classifier_string, fsa=feature_selection_algorithm_string, fta=feature_transform_algorithm_string, enc=encoders_string, feat=features_string, stats=stats_string) + return 'Classifier:\n{classifier}\n\nFeature selection algorithm:\n{fsa}\n\nFeature transform algorithm:\n{fta}\n\nMask of selected features (True if selected, False if not):\n{feat}\n\n{imp}{enc}Statistics:\n{stats}'.format(classifier=classifier_string, fsa=feature_selection_algorithm_string, fta=feature_transform_algorithm_string, imp=imputers_string, enc=encoders_string, feat=features_string, stats=stats_string) class _PipelineBenchmark(Benchmark): r"""NiaPy Benchmark class implementation. 
diff --git a/niaaml/pipeline_optimizer.py b/niaaml/pipeline_optimizer.py index 2a53a5f..0c9e021 100644 --- a/niaaml/pipeline_optimizer.py +++ b/niaaml/pipeline_optimizer.py @@ -8,6 +8,7 @@ from NiaPy.algorithms.utility import AlgorithmUtility from niaaml.utilities import get_bin_index from niaaml.preprocessing.encoding.utility import encode_categorical_features +from niaaml.preprocessing.imputation.utility import impute_features import pandas as pd __all__ = [ @@ -32,7 +33,9 @@ class PipelineOptimizer: __feature_transform_algorithms (Optional[Iterable[str]]): Array of names of possible feature transform algorithms. __classifiers (Iterable[Classifier]): Array of names of possible classifiers. __categorical_features_encoder (str): Name of the encoder used for categorical features. - __categorical_features_encoders (Iterable[FeatureEncoder]): Actual instances of FeatureEncoder for all categorical features. + __categorical_features_encoders (Dict[FeatureEncoder]): Actual instances of FeatureEncoder for all categorical features. + __imputer (str): Name of the imputer used for features that contain missing values. + __imputers (Dict[Imputer]): Actual instances of Imputer for all features that contain missing values. __niapy_algorithm_utility (AlgorithmUtility): Utility class used to get an optimization algorithm. 
""" @@ -46,11 +49,13 @@ def __init__(self, **kwargs): self.__classifiers = None self.__categorical_features_encoder = None self.__categorical_features_encoders = None + self.__imputer = None + self.__imputers = None self.__niapy_algorithm_utility = AlgorithmUtility() self._set_parameters(**kwargs) - def _set_parameters(self, data, classifiers, feature_selection_algorithms = None, feature_transform_algorithms = None, categorical_features_encoder = None, **kwargs): + def _set_parameters(self, data, classifiers, feature_selection_algorithms = None, feature_transform_algorithms = None, categorical_features_encoder = None, imputer = None, **kwargs): r"""Set the parameters/arguments of the task. Arguments: @@ -59,6 +64,7 @@ def _set_parameters(self, data, classifiers, feature_selection_algorithms = None feature_transform_algorithms (Optional[Iterable[str]]): Array of names of possible feature transform algorithms. classifiers (Iterable[Classifier]): Array of names of possible classifiers. categorical_features_encoder (Optional[str]): Name of the encoder used for categorical features. + imputer (Optional[str]): Name of the imputer used for features that contain missing values. """ self.__data = data @@ -72,6 +78,7 @@ def _set_parameters(self, data, classifiers, feature_selection_algorithms = None self.__classifiers = classifiers self.__feature_selection_algorithms = feature_selection_algorithms self.__categorical_features_encoder = categorical_features_encoder + self.__imputer = imputer def get_data(self): r"""Get data. 
@@ -124,10 +131,15 @@ def run(self, fitness_name, pipeline_population_size, inner_population_size, num algo = self.__niapy_algorithm_utility.get_algorithm(optimization_algorithm) algo.NP = pipeline_population_size + features = self.__data.get_x() + + if self.__imputer is not None: + features, self.__imputers = impute_features(features, self.__imputer) + if self.__categorical_features_encoder is not None: - features = self.__data.get_x() features, self.__categorical_features_encoders = encode_categorical_features(features, self.__categorical_features_encoder) - self.__data.set_x(features) + + self.__data.set_x(features) benchmark = _PipelineOptimizerBenchmark(self, fitness_name, inner_population_size, number_of_inner_evaluations, inner_optimization_algorithm if inner_optimization_algorithm is not None else optimization_algorithm) task = StoppingTask( @@ -140,6 +152,7 @@ def run(self, fitness_name, pipeline_population_size, inner_population_size, num pipeline = benchmark.get_pipeline() if pipeline is not None: pipeline.set_categorical_features_encoders(self.__categorical_features_encoders) + pipeline.set_imputers(self.__imputers) return pipeline diff --git a/niaaml/preprocessing/__init__.py b/niaaml/preprocessing/__init__.py index 2b5c38e..185fc2d 100644 --- a/niaaml/preprocessing/__init__.py +++ b/niaaml/preprocessing/__init__.py @@ -2,10 +2,12 @@ from niaaml.preprocessing import feature_selection from niaaml.preprocessing import feature_transform from niaaml.preprocessing import encoding +from niaaml.preprocessing import imputation __all__ = [ 'feature_selection', 'feature_transform', 'encoding', + 'imputation', 'PreprocessingAlgorithm' ] \ No newline at end of file diff --git a/niaaml/preprocessing/encoding/utility.py b/niaaml/preprocessing/encoding/utility.py index 643dac3..ed43f7d 100644 --- a/niaaml/preprocessing/encoding/utility.py +++ b/niaaml/preprocessing/encoding/utility.py @@ -13,27 +13,25 @@ def encode_categorical_features(features, encoder): Arguments: 
features (pandas.core.frame.DataFrame): DataFrame of features. - encoder (str): Number of bins on the interval [0.0, 1.0]. + encoder (str): Name of the encoder to use. Returns: - Iterable[FeatureEncoder]: Encoder for each categorical feature encoded. Tuple[pandas.core.frame.DataFrame, Iterable[FeatureEncoder]]: 1. Converted dataframe. - 2. List of encoders for all categorical features. + 2. Dictionary of encoders for all categorical features. """ enc = EncoderFactory().get_result(encoder) - encoders = [] - types = features.dtypes + encoders = {} to_drop = [] enc_features = pd.DataFrame() - for i in range(len(types)): - if types[i] != np.dtype('float64') and types[i] != np.dtype('int64'): - enc.fit(features[[i]]) - tr = enc.transform(features[[i]]) - to_drop.append(i) - enc_features = pd.concat([enc_features, tr], axis=1) - encoders.append(enc) + cols = [col for col in features.columns if features[col].dtype != np.dtype('float64') and features[col].dtype != np.dtype('int64')] + for c in cols: + enc.fit(features[[c]]) + tr = enc.transform(features[[c]]) + to_drop.append(c) + enc_features = pd.concat([enc_features, tr], axis=1) + encoders[c] = enc features = features.drop(to_drop, axis=1) features = pd.concat([features, enc_features], axis=1) return features, encoders if len(encoders) > 0 else None diff --git a/niaaml/preprocessing/imputation/__init__.py b/niaaml/preprocessing/imputation/__init__.py new file mode 100644 index 0000000..c679943 --- /dev/null +++ b/niaaml/preprocessing/imputation/__init__.py @@ -0,0 +1,11 @@ +from niaaml.preprocessing.imputation.imputer import Imputer +from niaaml.preprocessing.imputation.simple_imputer import SimpleImputer +from niaaml.preprocessing.imputation.utility import ImputerFactory +from niaaml.preprocessing.imputation.utility import impute_features + +__all__ = [ + 'Imputer', + 'SimpleImputer', + 'ImputerFactory', + 'impute_features' +] \ No newline at end of file diff --git a/niaaml/preprocessing/imputation/imputer.py 
b/niaaml/preprocessing/imputation/imputer.py new file mode 100644 index 0000000..3f55762 --- /dev/null +++ b/niaaml/preprocessing/imputation/imputer.py @@ -0,0 +1,52 @@ +__all__ = [ + 'Imputer' +] + +class Imputer(): + r"""Class for implementing imputers. + + Date: + 2020 + + Author: + Luka Pečnik + + License: + MIT + + Attributes: + Name (str): Name of the imputer. + """ + Name = None + + def __init__(self, **kwargs): + r"""Initialize imputer. + """ + return None + + def fit(self, feature): + r"""Fit imputer. + + Arguments: + feature (pandas.core.frame.DataFrame): A column from DataFrame of features. + """ + return None + + def transform(self, feature): + r"""Transform feature's values. + + Arguments: + feature (pandas.core.frame.DataFrame): A column from DataFrame of features. + + Returns: + pandas.core.frame.DataFrame: A transformed column. + """ + return None + + def to_string(self): + r"""User friendly representation of the object. + + Returns: + str: User friendly representation of the object. + """ + return '{name}' \ No newline at end of file diff --git a/niaaml/preprocessing/imputation/simple_imputer.py b/niaaml/preprocessing/imputation/simple_imputer.py new file mode 100644 index 0000000..12ce18c --- /dev/null +++ b/niaaml/preprocessing/imputation/simple_imputer.py @@ -0,0 +1,59 @@ +from sklearn.impute import SimpleImputer as SI +from niaaml.preprocessing.imputation.imputer import Imputer +import numpy as np + +__all__ = [ + 'SimpleImputer' +] + +class SimpleImputer(Imputer): + r"""Implementation of simple imputer. + + Date: + 2020 + + Author: + Luka Pečnik + + License: + MIT + """ + Name = 'Simple Imputer' + + def __init__(self, **kwargs): + r"""Initialize imputer. + """ + self.__simple_imputer = SI(missing_values=np.nan) + + + def fit(self, feature): + r"""Fit imputer. + + Arguments: + feature (pandas.core.frame.DataFrame): A column from DataFrame of features. 
+ """ + if feature.dtypes.iloc[0] != np.dtype('int64') or feature.dtypes.iloc[0] != np.dtype('float64'): + replacement_val = feature.mode().iloc[0, 0] + self.__simple_imputer.set_params(**{'fill_value': replacement_val, 'strategy': 'constant'}) + self.__simple_imputer.fit(feature) + else: + self.__simple_imputer.fit(feature) + + def transform(self, feature): + r"""Transform feature's values. + + Arguments: + feature (pandas.core.frame.DataFrame): A column from DataFrame of features. + + Returns: + pandas.core.frame.DataFrame: A transformed column. + """ + return self.__simple_imputer.transform(feature) + + def to_string(self): + r"""User friendly representation of the object. + + Returns: + str: User friendly representation of the object. + """ + return Imputer.to_string(self).format(name=self.Name) \ No newline at end of file diff --git a/niaaml/preprocessing/imputation/utility.py b/niaaml/preprocessing/imputation/utility.py new file mode 100644 index 0000000..1c2e333 --- /dev/null +++ b/niaaml/preprocessing/imputation/utility.py @@ -0,0 +1,47 @@ +import niaaml.preprocessing.imputation as imp +from niaaml.utilities import Factory + +__all__ = [ + 'ImputerFactory', + 'impute_features' +] + +def impute_features(features, imputer): + """Impute features with missing data. + + Arguments: + features (pandas.core.frame.DataFrame): DataFrame of features. + imputer (str): Name of the imputer to use. + + Returns: + Tuple[pandas.core.frame.DataFrame, Dict[Imputer]]: + 1. Converted dataframe. + 2. Dictionary of imputers for all features with missing data. + """ + imp = ImputerFactory().get_result(imputer) + + imputers = {} + cols = [col for col in features.columns if features[col].isnull().any()] + for c in cols: + imp.fit(features[[c]]) + features.loc[:, c] = imp.transform(features[[c]]) + imputers[c] = imp + + return features, imputers if len(imputers) > 0 else None + +class ImputerFactory(Factory): + r"""Class with string mappings to imputers. 
+ + Attributes: + _entities (Dict[str, Imputer]): Mapping from strings to imputers. + + See Also: + * :class:`niaaml.utilities.Factory` + """ + + def _set_parameters(self, **kwargs): + r"""Set the parameters/arguments of the factory. + """ + self._entities = { + 'SimpleImputer': imp.SimpleImputer + } diff --git a/niaaml/tests/test_imputer.py b/niaaml/tests/test_imputer.py new file mode 100644 index 0000000..ba1ac7e --- /dev/null +++ b/niaaml/tests/test_imputer.py @@ -0,0 +1,32 @@ +from unittest import TestCase +from niaaml.preprocessing.imputation import SimpleImputer, impute_features +from niaaml.data import BasicDataReader +import numpy +import pandas + +class ImputerTestCase(TestCase): + def setUp(self): + x = numpy.concatenate((numpy.random.uniform(low=0.0, high=15.0, size=(100, 6)), numpy.array([numpy.random.choice(['a', 'b'], size=(100,))]).T), axis=1) + x[50, 6] = numpy.nan + x[30, 2] = numpy.nan + y = numpy.random.choice(['Class 1', 'Class 2'], size=100) + self.__data_reader = BasicDataReader(x=x, y=y) + + def test_impute_works_fine(self): + features = self.__data_reader.get_x() + imputer1 = SimpleImputer() + imputer1.fit(features[[2]]) + f = pandas.DataFrame(imputer1.transform(features[[2]])) + self.assertFalse(f[0].isnull().any()) + + imputer2 = SimpleImputer() + imputer2.fit(features[[6]]) + f = pandas.DataFrame(imputer2.transform(features[[6]])) + self.assertFalse(f[0].isnull().any()) + + def test_utility_method_works_fine(self): + features = self.__data_reader.get_x().astype({0: 'float64', 1: 'float64', 2: 'float64', 3: 'float64', 4: 'float64', 5: 'float64'}) + features.iloc[50, 6] = numpy.nan + features, imputers = impute_features(features, 'SimpleImputer') + self.assertEqual(len(imputers), 2) + self.assertEqual(features.shape[1], 7) diff --git a/niaaml/tests/test_imputer_factory.py b/niaaml/tests/test_imputer_factory.py new file mode 100644 index 0000000..3aa178d --- /dev/null +++ b/niaaml/tests/test_imputer_factory.py @@ -0,0 +1,25 @@ +from unittest 
import TestCase +from niaaml.preprocessing.imputation import Imputer, ImputerFactory + +class ImputerFactoryTestCase(TestCase): + def setUp(self): + self.__factory = ImputerFactory() + + def test_get_result_works_fine(self): + for entry in self.__factory._entities: + instance = self.__factory.get_result(entry) + self.assertIsNotNone(instance) + self.assertIsInstance(instance, Imputer) + + with self.assertRaises(TypeError): + self.__factory.get_result('non_existent_name') + + def test_get_dictionary_works_fine(self): + d = self.__factory.get_name_to_classname_mapping() + d_keys = d.keys() + e_keys = self.__factory._entities.keys() + + self.assertEqual(len(e_keys), len(d_keys)) + + for k in d: + self.assertIsNotNone(d[k]) \ No newline at end of file diff --git a/niaaml/tests/test_pipeline.py b/niaaml/tests/test_pipeline.py index fc1da1f..40c127f 100644 --- a/niaaml/tests/test_pipeline.py +++ b/niaaml/tests/test_pipeline.py @@ -10,33 +10,39 @@ import pandas class PipelineTestCase(TestCase): - def setUp(self): - self.__pipeline = Pipeline( - feature_selection_algorithm=SelectKBest(), - feature_transform_algorithm=Normalizer(), - classifier=RandomForest() - ) - def test_pipeline_optimize_works_fine(self): + pipeline = Pipeline( + feature_selection_algorithm=SelectKBest(), + feature_transform_algorithm=Normalizer(), + classifier=RandomForest() + ) + data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/tests_files/dataset_header_classes.csv', has_header=True, contains_classes=True) - self.assertIsInstance(self.__pipeline.get_classifier(), RandomForest) - self.assertIsInstance(self.__pipeline.get_feature_selection_algorithm(), SelectKBest) - self.assertIsInstance(self.__pipeline.get_feature_transform_algorithm(), Normalizer) + self.assertIsInstance(pipeline.get_classifier(), RandomForest) + self.assertIsInstance(pipeline.get_feature_selection_algorithm(), SelectKBest) + self.assertIsInstance(pipeline.get_feature_transform_algorithm(), Normalizer) 
- accuracy = self.__pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 20, 40, 'ParticleSwarmAlgorithm', 'Accuracy') + accuracy = pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 20, 40, 'ParticleSwarmAlgorithm', 'Accuracy') - self.assertGreaterEqual(accuracy, -1.0) - self.assertLessEqual(accuracy, 0.0) + if accuracy != float('inf'): + self.assertGreaterEqual(accuracy, -1.0) + self.assertLessEqual(accuracy, 0.0) - self.assertIsInstance(self.__pipeline.get_classifier(), RandomForest) - self.assertIsInstance(self.__pipeline.get_feature_selection_algorithm(), SelectKBest) - self.assertIsInstance(self.__pipeline.get_feature_transform_algorithm(), Normalizer) + self.assertIsInstance(pipeline.get_classifier(), RandomForest) + self.assertIsInstance(pipeline.get_feature_selection_algorithm(), SelectKBest) + self.assertIsInstance(pipeline.get_feature_transform_algorithm(), Normalizer) def test_pipeline_run_works_fine(self): + pipeline = Pipeline( + feature_selection_algorithm=SelectKBest(), + feature_transform_algorithm=Normalizer(), + classifier=RandomForest() + ) + data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/tests_files/dataset_header_classes.csv', has_header=True, contains_classes=True) - self.__pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 20, 40, 'ParticleSwarmAlgorithm', 'Accuracy') - predicted = self.__pipeline.run(pandas.DataFrame(numpy.random.uniform(low=0.0, high=15.0, size=(30, data_reader.get_x().shape[1])))) + pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 20, 40, 'ParticleSwarmAlgorithm', 'Accuracy') + predicted = pipeline.run(pandas.DataFrame(numpy.random.uniform(low=0.0, high=15.0, size=(30, data_reader.get_x().shape[1])))) self.assertEqual(predicted.shape, (30, )) @@ -46,33 +52,51 @@ def test_pipeline_run_works_fine(self): self.assertTrue(len(s2) > 0 and len(s2) <= 2) def test_pipeline_export_works_fine(self): + pipeline = Pipeline( + feature_selection_algorithm=SelectKBest(), 
+ feature_transform_algorithm=Normalizer(), + classifier=RandomForest() + ) + with tempfile.TemporaryDirectory() as tmp: - self.__pipeline.export(os.path.join(tmp, 'pipeline')) + pipeline.export(os.path.join(tmp, 'pipeline')) self.assertTrue(os.path.exists(os.path.join(tmp, 'pipeline.ppln'))) self.assertEqual(1, len([name for name in os.listdir(tmp)])) - self.__pipeline.export(os.path.join(tmp, 'pipeline.ppln')) + pipeline.export(os.path.join(tmp, 'pipeline.ppln')) self.assertTrue(os.path.exists(os.path.join(tmp, 'pipeline.ppln'))) self.assertEqual(1, len([name for name in os.listdir(tmp)])) def test_pipeline_export_text_works_fine(self): + pipeline = Pipeline( + feature_selection_algorithm=SelectKBest(), + feature_transform_algorithm=Normalizer(), + classifier=RandomForest() + ) + with tempfile.TemporaryDirectory() as tmp: - self.__pipeline.export_text(os.path.join(tmp, 'pipeline')) + pipeline.export_text(os.path.join(tmp, 'pipeline')) self.assertTrue(os.path.exists(os.path.join(tmp, 'pipeline.txt'))) self.assertEqual(1, len([name for name in os.listdir(tmp)])) - self.__pipeline.export_text(os.path.join(tmp, 'pipeline.txt')) + pipeline.export_text(os.path.join(tmp, 'pipeline.txt')) self.assertTrue(os.path.exists(os.path.join(tmp, 'pipeline.txt'))) self.assertEqual(1, len([name for name in os.listdir(tmp)])) - self.assertIsNotNone(self.__pipeline.to_string()) - self.assertGreater(len(self.__pipeline.to_string()), 0) + self.assertIsNotNone(pipeline.to_string()) + self.assertGreater(len(pipeline.to_string()), 0) def test_pipeline_setters_work_fine(self): - self.__pipeline.set_classifier(AdaBoost()) - self.__pipeline.set_feature_selection_algorithm(SelectPercentile()) - self.__pipeline.set_feature_transform_algorithm(StandardScaler()) - self.__pipeline.set_selected_features_mask(numpy.ones([1, 1, 0, 0], dtype=bool)) + pipeline = Pipeline( + feature_selection_algorithm=SelectKBest(), + feature_transform_algorithm=Normalizer(), + classifier=RandomForest() + ) + + 
pipeline.set_classifier(AdaBoost()) + pipeline.set_feature_selection_algorithm(SelectPercentile()) + pipeline.set_feature_transform_algorithm(StandardScaler()) + pipeline.set_selected_features_mask(numpy.ones([1, 1, 0, 0], dtype=bool)) self.__y = numpy.array(['Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 1', 'Class 2', 'Class 2', 'Class 2', 'Class 2', 'Class 1', 'Class 1', 'Class 2', @@ -82,9 +106,9 @@ def test_pipeline_setters_work_fine(self): 'Class 1', 'Class 1', 'Class 1', 'Class 2', 'Class 1', 'Class 1', 'Class 2', 'Class 2', 'Class 1', 'Class 2', 'Class 1', 'Class 2', 'Class 2', 'Class 2']) - self.__pipeline.set_stats(OptimizationStats(self.__predicted, self.__y)) + pipeline.set_stats(OptimizationStats(self.__predicted, self.__y)) - self.assertIsInstance(self.__pipeline.get_classifier(), AdaBoost) - self.assertIsInstance(self.__pipeline.get_feature_selection_algorithm(), SelectPercentile) - self.assertIsInstance(self.__pipeline.get_feature_transform_algorithm(), StandardScaler) - self.assertIsInstance(self.__pipeline.get_stats(), OptimizationStats) \ No newline at end of file + self.assertIsInstance(pipeline.get_classifier(), AdaBoost) + self.assertIsInstance(pipeline.get_feature_selection_algorithm(), SelectPercentile) + self.assertIsInstance(pipeline.get_feature_transform_algorithm(), StandardScaler) + self.assertIsInstance(pipeline.get_stats(), OptimizationStats) \ No newline at end of file diff --git a/niaaml/tests/test_pipeline_optimizer.py b/niaaml/tests/test_pipeline_optimizer.py index ede92b1..27a13f3 100644 --- a/niaaml/tests/test_pipeline_optimizer.py +++ b/niaaml/tests/test_pipeline_optimizer.py @@ -34,4 +34,21 @@ def test_pipeline_optimizer_getters_work_fine(self): self.assertTrue((numpy.array(['AdaBoost', 'Bagging']) == numpy.array(classifiers)).all()) self.assertTrue((numpy.array(['SelectKBest', 'SelectPercentile']) == numpy.array(fsas)).all()) - self.assertTrue((numpy.array([None, 'Normalizer', 'StandardScaler'] == 
numpy.array(ftas))).all()) \ No newline at end of file + self.assertTrue((numpy.array([None, 'Normalizer', 'StandardScaler'] == numpy.array(ftas))).all()) + + def test_pipeline_optimizer_missing_values_categorical_attributes_run_works_fine(self): + data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/tests_files/dataset_header_classes_cat_miss.csv', has_header=True, contains_classes=True) + pipeline_optimizer = PipelineOptimizer( + data=data_reader, + feature_selection_algorithms=['SelectKBest', 'SelectPercentile'], + feature_transform_algorithms=['Normalizer', 'StandardScaler'], + classifiers=['AdaBoost', 'Bagging'], + categorical_features_encoder='OneHotEncoder', + imputer='SimpleImputer' + ) + + pipeline = pipeline_optimizer.run('Accuracy', 10, 10, 20, 20, 'ParticleSwarmAlgorithm') + self.assertIsInstance(pipeline, Pipeline) + self.assertTrue(isinstance(pipeline.get_classifier(), AdaBoost) or isinstance(pipeline.get_classifier(), Bagging)) + self.assertTrue(isinstance(pipeline.get_feature_selection_algorithm(), SelectKBest) or isinstance(pipeline.get_feature_selection_algorithm(), SelectPercentile)) + self.assertTrue(pipeline.get_feature_transform_algorithm() is None or isinstance(pipeline.get_feature_transform_algorithm(), Normalizer) or isinstance(pipeline.get_feature_transform_algorithm(), StandardScaler)) \ No newline at end of file diff --git a/niaaml/tests/tests_files/dataset_header_classes_cat_miss.csv b/niaaml/tests/tests_files/dataset_header_classes_cat_miss.csv new file mode 100644 index 0000000..885464c --- /dev/null +++ b/niaaml/tests/tests_files/dataset_header_classes_cat_miss.csv @@ -0,0 +1,101 @@ +h1,h2,h3,h4,h5,h6,,h7 +11.18628795,8.553093649,2.337681394,7.740221556,2.076186822,12.53763102,a,Class 1 +10.14583771,8.771100076,9.617580733,11.28898328,3.731375463,3.61496461,b,Class 1 +7.608378163,9.215208082,12.21721984,0.217171385,10.84786072,0.068879981,a,Class 1
+9.741381036,4.373282412,0.46661895,9.670759021,3.769897645,3.166427585,a,Class 1 +6.326460397,4.950181274,0.955970145,13.27044224,13.3803711,6.7042629,a,Class 2 +6.614221714,5.139908792,6.322918249,7.613060017,2.965112605,8.757576779,a,Class 1 +9.77079694,13.22809724,,4.486662894,1.637543898,3.145057057,c,Class 2 +5.986399993,3.575034439,4.436729038,10.27403804,6.085388637,9.638152072,c,Class 2 +9.306449023,0.419825519,5.207460405,3.352657815,8.955085451,13.74251068,b,Class 2 +12.85278467,0.554461277,13.25029775,7.70837554,12.81365721,8.448635991,,Class 2 +10.32440339,3.195964543,1.215275549,3.741461311,11.6736581,6.435247906,b,Class 1 +,6.754763706,3.568497268,6.787252535,4.528132031,11.24413956,c,Class 1 +4.92766004,8.894839385,1.599234587,6.491628957,1.003488457,8.990811618,c,Class 1 +14.62465143,8.299301507,12.02718728,3.868350191,5.604315633,13.4574281,a,Class 1 +5.548176135,6.507710991,0.798813746,13.28445746,7.037019737,13.71370375,a,Class 2 +4.414357119,9.250023962,8.716711231,7.733484723,3.661143919,14.63698956,a,Class 1 +7.794274494,0.554776276,6.717665128,3.422141362,12.80249591,3.744667173,a,Class 2 +10.46207608,14.78604433,11.14501886,13.28194261,13.35036026,8.342502238,b,Class 1 +11.38516345,11.33272181,,4.978216028,8.668419104,6.052792392,b,Class 2 +11.19640261,10.3436337,0.939034115,14.91069148,7.269366966,12.53406348,c,Class 2 +11.06549373,0.091051491,13.96718884,12.53348993,4.476687297,11.87992708,c,Class 1 +5.721763439,10.70136406,4.677923895,12.04602629,6.630499903,13.04574224,a,Class 2 +14.87203026,4.717515614,12.16090195,10.17484858,1.258457287,3.762734746,a,Class 1 +9.517250388,14.61073986,10.55186687,12.13409641,,14.9085867,a,Class 2 +5.490151571,11.07922707,2.912349404,11.26243041,6.909836863,12.93169762,a,Class 1 +3.597959325,7.3606205,1.89533481,8.407778067,12.94742999,9.956797585,a,Class 2 +13.99187099,6.16144391,4.430074749,10.48992388,6.724889945,11.63545045,b,Class 1 
+13.87167852,11.47473231,12.91040409,5.329482463,12.41092153,9.923540019,a,Class 2 +6.884021,0.536048784,13.77495679,6.51467553,4.70254023,8.780237509,a,Class 2 +2.208914531,12.70665676,13.62555578,9.598180651,7.438779306,7.81610053,,Class 1 +8.659531425,8.209053873,6.907242925,9.847209807,7.643627147,1.24454444,a,Class 1 +1.448257785,10.22497998,1.269324615,3.714269901,13.03906827,2.870250771,c,Class 1 +10.19241437,4.058700021,8.886001739,5.828695389,2.605134041,1.19188785,c,Class 1 +7.777359497,10.96783191,4.890083745,5.284618971,4.411218163,8.605757632,b,Class 2 +1.056011622,7.844004878,10.65020289,4.234763934,6.43943205,1.262495126,b,Class 2 +7.648844009,,9.539688734,13.66072313,0.330411845,5.610949661,b,Class 1 +4.321962749,5.604955856,7.525456962,2.795185293,0.557651224,9.096120183,c,Class 2 +7.580303996,14.13657189,2.208779404,12.65807527,8.616258995,14.2742891,c,Class 1 +6.617679318,12.17838447,6.219814209,9.278219597,2.627013838,10.26198055,a,Class 1 +10.42636852,11.37466476,1.605370071,13.38238859,13.4486372,0.658796404,a,Class 1 +10.52477845,5.275716432,11.83515271,0.617870822,0.921374579,8.348557261,a,Class 2 +12.42122246,0.012249697,7.74555252,12.02705019,3.442939685,7.110063876,a,Class 1 +2.721191099,14.56211777,12.2194075,8.457083772,1.843488398,8.775189039,b,Class 2 +11.93193151,7.265208519,0.45505228,4.217468632,13.6978792,11.24703349,b,Class 2 +0.493376888,0.414245824,5.492426678,3.926579473,5.14363276,9.3274729,c,Class 2 +11.99067679,2.224771613,14.51607498,13.038479,2.048398725,10.21056055,c,Class 2 +4.39009476,2.715892095,13.65208099,1.276459275,3.1947636,5.738578547,a,Class 2 +9.517850998,7.870570236,11.66133708,3.158457987,0.994101959,1.760078291,a,Class 2 +1.67226824,1.257990444,13.35645397,6.432977593,8.173353149,10.47964661,,Class 2 +11.40110217,3.755922456,14.78250639,9.12235283,5.463228968,8.004612121,a,Class 1 +0.51828403,7.467344863,0.403372329,1.324884922,6.204846153,0.397427501,a,Class 1 
+,3.700288257,14.23433924,1.836880065,0.168958671,1.260377664,b,Class 2 +0.927565538,0.256079044,8.244925899,10.78666638,13.47379713,0.009413535,a,Class 2 +11.12587642,0.512971929,10.37022985,9.927926894,8.924001776,5.446182158,a,Class 1 +8.296166587,3.881765358,12.50788295,6.751744679,3.270039419,13.16076438,a,Class 2 +5.342454277,14.06289262,6.052115238,4.60660751,14.92130785,9.251614117,a,Class 1 +10.76413906,3.108242418,7.407200789,0.124640979,6.315064545,6.974791847,c,Class 2 +11.60334067,7.869475964,14.06285885,3.010169778,0.21862987,10.7119562,c,Class 1 +0.675107878,10.62554075,9.16909931,1.51930099,9.054828927,8.018314854,b,Class 2 +0.136391516,12.15438651,13.10410369,4.54379884,1.467941336,3.708272962,b,Class 2 +11.20314646,7.917973833,7.205146518,14.47482833,5.385158132,3.962870806,b,Class 1 +12.96777011,7.276652989,12.46734536,8.774357457,14.49755617,1.021454967,c,Class 1 +7.259751863,5.37753719,8.753494011,1.105904802,7.423842186,7.060245922,c,Class 1 +5.550401633,10.28344926,8.849268232,10.35224505,11.42901447,2.015178403,a,Class 2 +8.724250626,5.144158413,8.881589983,5.654339781,3.348767179,7.567443724,a,Class 2 +1.505308287,8.327887318,5.967980754,5.861512631,1.942362782,12.08752455,a,Class 1 +11.95352828,13.83709019,1.484043207,14.9990425,0.358430191,0.936128377,a,Class 2 +14.1424292,5.653091086,14.75697191,6.534335531,14.59624216,4.217427045,b,Class 1 +2.096214566,13.41972927,5.026757888,10.15382225,10.69199037,8.119000359,b,Class 2 +4.658969577,8.152082829,10.69897004,3.807611391,9.432866697,7.469063458,c,Class 1 +9.880365248,6.857500577,9.50270486,6.185811124,6.801593649,1.426651215,c,Class 2 +11.38053935,5.64968146,13.16726558,3.969547861,3.409401613,6.754962952,a,Class 1 +2.549597194,3.81373774,3.381883424,12.54165021,7.238285696,5.014469506,a,Class 1 +2.149956112,14.18695148,4.495586504,1.193989236,0.629565843,10.71726557,a,Class 1 +0.633065458,10.57661883,3.911208047,4.737683148,10.67249983,11.44130896,a,Class 2 
+10.98958055,8.538690522,2.221702088,7.94460522,7.268542052,13.0506614,a,Class 2 +14.05448371,5.906069731,11.02070992,14.78464345,1.395098041,12.45034592,b,Class 1 +4.849203233,14.92593789,14.83374088,13.33589083,10.91265222,6.015994872,a,Class 2 +1.788538553,1.189933547,13.37927743,7.078881338,0.115268965,1.102757553,a,Class 1 +1.520260264,4.390949317,8.961363089,9.116191933,4.902286012,13.82917543,a,Class 2 +5.143515013,1.626830627,5.011771888,14.53607373,9.254769126,5.987742339,a,Class 2 +0.383485623,5.893120492,2.198084919,3.607295516,11.2909701,14.19259294,c,Class 1 +3.543625982,1.817300049,12.79701902,9.150819857,4.270171936,1.046802952,c,Class 2 +9.014121301,8.894615211,14.32697976,12.05396604,6.610724668,12.9453385,b,Class 1 +2.178293829,11.00240774,3.4661733,4.216419592,14.36522422,3.571201671,b,Class 1 +9.218901263,8.682081223,12.48795288,8.796277452,13.72799658,1.414017549,b,Class 2 +1.417376539,13.2588434,13.00750995,9.108292367,5.332117011,3.7214796,c,Class 2 +11.40541996,10.59274384,11.90631845,4.497592473,4.532009755,4.117336922,c,Class 1 +5.547732807,6.107428176,13.30160131,8.442144861,9.854871343,3.268384157,a,Class 2 +2.558435481,12.36056669,7.777967112,6.812644994,8.532351866,6.71817697,a,Class 2 +2.349328005,11.73919423,11.20515163,11.47196866,13.24600243,1.770939874,a,Class 1 +13.34706077,13.86142631,0.291296401,0.12119829,5.885044406,8.475403207,a,Class 2 +5.351503888,6.40942837,11.07531808,8.972571254,3.233818614,12.43439266,b,Class 2 +6.693621558,13.96686031,1.475546478,12.35803005,0.873347546,0.688133753,b,Class 2 +10.48922559,6.646089272,7.4076759,7.873827219,5.742578275,1.806450848,c,Class 2 +1.365010518,0.840426698,6.044826791,12.33437799,5.33827304,14.55706457,c,Class 2 +6.145883127,12.20161505,1.162956248,11.67002394,6.279495076,5.709716727,a,Class 1 +12.99028641,3.448828215,4.946279072,10.87002826,14.83427318,9.154544604,a,Class 2 +1.109266891,2.564645156,10.64938657,7.677215295,8.625541169,8.960849049,a,Class 1 
+6.891117595,13.9566784,0.952437927,6.585976751,13.16019122,7.78218351,a,Class 1 diff --git a/pyproject.toml b/pyproject.toml index 8cca2ee..26f34ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "NiaAML" -version = "1.0.0rc1" +version = "1.0.0rc2" description = "Python automated machine learning framework." license = "MIT" authors = ["Luka Pečnik ", "Iztok Fister Jr. "]