-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #30 from lukapecnik/feature-imputation-implemented
feature missing values imputation
- Loading branch information
Showing
20 changed files
with
684 additions
and
77 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
11.18628795,8.553093649,2.337681394,7.740221556,2.076186822,12.53763102,a,Class 1 | ||
10.14583771,8.771100076,9.617580733,11.28898328,3.731375463,3.61496461,a,Class 1 | ||
7.608378163,9.215208082,12.21721984,0.217171385,10.84786072,0.068879981,b,Class 1 | ||
9.741381036,4.373282412,0.46661895,9.670759021,3.769897645,3.166427585,b,Class 1 | ||
6.326460397,4.950181274,0.955970145,13.27044224,13.3803711,6.7042629,b,Class 2 | ||
6.614221714,5.139908792,6.322918249,7.613060017,2.965112605,8.757576779,c,Class 1 | ||
9.77079694,13.22809724,12.27910247,4.486662894,1.637543898,3.145057057,b,Class 2 | ||
5.986399993,3.575034439,4.436729038,10.27403804,6.085388637,9.638152072,a,Class 2 | ||
9.306449023,0.419825519,5.207460405,3.352657815,8.955085451,13.74251068,c,Class 2 | ||
12.85278467,0.554461277,13.25029775,7.70837554,12.81365721,8.448635991,c,Class 2 | ||
10.32440339,3.195964543,1.215275549,3.741461311,11.6736581,6.435247906,a,Class 1 | ||
12.45890387,6.754763706,3.568497268,6.787252535,4.528132031,11.24413956,,Class 1 | ||
4.92766004,8.894839385,1.599234587,6.491628957,1.003488457,8.990811618,a,Class 1 | ||
14.62465143,,12.02718728,3.868350191,5.604315633,13.4574281,a,Class 1 | ||
5.548176135,6.507710991,0.798813746,13.28445746,7.037019737,13.71370375,b,Class 2 | ||
4.414357119,9.250023962,8.716711231,7.733484723,3.661143919,14.63698956,b,Class 1 | ||
7.794274494,0.554776276,6.717665128,3.422141362,12.80249591,3.744667173,b,Class 2 | ||
10.46207608,14.78604433,11.14501886,13.28194261,13.35036026,8.342502238,c,Class 1 | ||
11.38516345,11.33272181,1.919660335,4.978216028,8.668419104,6.052792392,c,Class 2 | ||
11.19640261,10.3436337,0.939034115,14.91069148,7.269366966,12.53406348,c,Class 2 | ||
11.06549373,,13.96718884,12.53348993,4.476687297,11.87992708,a,Class 1 | ||
5.721763439,10.70136406,4.677923895,12.04602629,6.630499903,13.04574224,c,Class 2 | ||
14.87203026,4.717515614,12.16090195,10.17484858,1.258457287,3.762734746,c,Class 1 | ||
9.517250388,14.61073986,10.55186687,12.13409641,4.195938316,14.9085867,b,Class 2 | ||
5.490151571,11.07922707,2.912349404,11.26243041,6.909836863,12.93169762,b,Class 1 | ||
3.597959325,7.3606205,1.89533481,8.407778067,12.94742999,9.956797585,a,Class 2 | ||
13.99187099,6.16144391,4.430074749,10.48992388,6.724889945,11.63545045,a,Class 1 | ||
13.87167852,11.47473231,12.91040409,5.329482463,12.41092153,9.923540019,c,Class 2 | ||
6.884021,0.536048784,13.77495679,6.51467553,4.70254023,8.780237509,,Class 2 | ||
2.208914531,12.70665676,13.62555578,9.598180651,7.438779306,7.81610053,b,Class 1 | ||
8.659531425,8.209053873,6.907242925,9.847209807,7.643627147,1.24454444,b,Class 1 | ||
1.448257785,10.22497998,1.269324615,3.714269901,13.03906827,2.870250771,b,Class 1 | ||
10.19241437,,8.886001739,5.828695389,2.605134041,1.19188785,b,Class 1 | ||
7.777359497,10.96783191,4.890083745,5.284618971,4.411218163,8.605757632,b,Class 2 | ||
1.056011622,7.844004878,10.65020289,4.234763934,6.43943205,1.262495126,a,Class 2 | ||
7.648844009,10.14403542,9.539688734,13.66072313,0.330411845,5.610949661,a,Class 1 | ||
4.321962749,5.604955856,7.525456962,2.795185293,0.557651224,9.096120183,c,Class 2 | ||
7.580303996,14.13657189,2.208779404,12.65807527,8.616258995,14.2742891,c,Class 1 | ||
6.617679318,12.17838447,6.219814209,9.278219597,2.627013838,10.26198055,a,Class 1 | ||
10.42636852,11.37466476,1.605370071,13.38238859,13.4486372,0.658796404,a,Class 1 | ||
10.52477845,5.275716432,11.83515271,0.617870822,0.921374579,8.348557261,b,Class 2 | ||
12.42122246,0.012249697,7.74555252,12.02705019,3.442939685,7.110063876,b,Class 1 | ||
2.721191099,14.56211777,12.2194075,8.457083772,1.843488398,8.775189039,b,Class 2 | ||
11.93193151,7.265208519,0.45505228,4.217468632,13.6978792,11.24703349,c,Class 2 | ||
0.493376888,0.414245824,5.492426678,3.926579473,5.14363276,9.3274729,b,Class 2 | ||
11.99067679,2.224771613,14.51607498,13.038479,2.048398725,10.21056055,a,Class 2 | ||
4.39009476,2.715892095,13.65208099,1.276459275,3.1947636,5.738578547,c,Class 2 | ||
9.517850998,7.870570236,11.66133708,3.158457987,0.994101959,1.760078291,c,Class 2 | ||
1.67226824,1.257990444,13.35645397,6.432977593,8.173353149,10.47964661,a,Class 2 | ||
11.40110217,3.755922456,14.78250639,9.12235283,5.463228968,8.004612121,a,Class 1 | ||
0.51828403,7.467344863,0.403372329,1.324884922,6.204846153,0.397427501,a,Class 1 | ||
8.56890712,3.700288257,14.23433924,1.836880065,0.168958671,1.260377664,a,Class 2 | ||
0.927565538,0.256079044,8.244925899,10.78666638,13.47379713,0.009413535,b,Class 2 | ||
11.12587642,0.512971929,10.37022985,9.927926894,8.924001776,5.446182158,b,Class 1 | ||
8.296166587,3.881765358,12.50788295,6.751744679,3.270039419,13.16076438,b,Class 2 | ||
5.342454277,14.06289262,6.052115238,4.60660751,14.92130785,9.251614117,c,Class 1 | ||
10.76413906,3.108242418,7.407200789,0.124640979,6.315064545,6.974791847,c,Class 2 | ||
11.60334067,7.869475964,14.06285885,3.010169778,0.21862987,10.7119562,c,Class 1 | ||
0.675107878,10.62554075,9.16909931,1.51930099,9.054828927,8.018314854,a,Class 2 | ||
0.136391516,12.15438651,13.10410369,4.54379884,1.467941336,3.708272962,c,Class 2 | ||
11.20314646,7.917973833,7.205146518,14.47482833,5.385158132,3.962870806,c,Class 1 | ||
12.96777011,7.276652989,12.46734536,8.774357457,14.49755617,1.021454967,b,Class 1 | ||
7.259751863,5.37753719,8.753494011,1.105904802,7.423842186,7.060245922,b,Class 1 | ||
5.550401633,10.28344926,8.849268232,10.35224505,11.42901447,2.015178403,a,Class 2 | ||
8.724250626,5.144158413,8.881589983,5.654339781,3.348767179,7.567443724,a,Class 2 | ||
1.505308287,8.327887318,5.967980754,5.861512631,1.942362782,12.08752455,c,Class 1 | ||
11.95352828,13.83709019,1.484043207,14.9990425,0.358430191,0.936128377,b,Class 2 | ||
14.1424292,5.653091086,14.75697191,6.534335531,14.59624216,4.217427045,b,Class 1 | ||
2.096214566,13.41972927,5.026757888,10.15382225,10.69199037,8.119000359,b,Class 2 | ||
4.658969577,8.152082829,10.69897004,3.807611391,9.432866697,7.469063458,b,Class 1 | ||
9.880365248,6.857500577,9.50270486,6.185811124,6.801593649,1.426651215,b,Class 2 | ||
11.38053935,5.64968146,13.16726558,3.969547861,3.409401613,6.754962952,b,Class 1 | ||
2.549597194,3.81373774,3.381883424,12.54165021,7.238285696,5.014469506,a,Class 1 | ||
2.149956112,14.18695148,4.495586504,1.193989236,0.629565843,10.71726557,a,Class 1 | ||
0.633065458,10.57661883,3.911208047,4.737683148,10.67249983,11.44130896,c,Class 2 | ||
10.98958055,8.538690522,2.221702088,7.94460522,7.268542052,13.0506614,c,Class 2 | ||
14.05448371,5.906069731,11.02070992,14.78464345,1.395098041,12.45034592,a,Class 1 | ||
4.849203233,14.92593789,14.83374088,13.33589083,10.91265222,6.015994872,a,Class 2 | ||
1.788538553,1.189933547,13.37927743,7.078881338,0.115268965,1.102757553,b,Class 1 | ||
1.520260264,4.390949317,8.961363089,9.116191933,4.902286012,13.82917543,b,Class 2 | ||
5.143515013,1.626830627,5.011771888,14.53607373,9.254769126,5.987742339,b,Class 2 | ||
0.383485623,5.893120492,2.198084919,3.607295516,11.2909701,14.19259294,c,Class 1 | ||
3.543625982,1.817300049,12.79701902,9.150819857,4.270171936,1.046802952,b,Class 2 | ||
9.014121301,8.894615211,14.32697976,12.05396604,6.610724668,12.9453385,a,Class 1 | ||
2.178293829,11.00240774,3.4661733,4.216419592,14.36522422,3.571201671,c,Class 1 | ||
9.218901263,8.682081223,12.48795288,8.796277452,13.72799658,1.414017549,c,Class 2 | ||
1.417376539,13.2588434,13.00750995,9.108292367,5.332117011,3.7214796,a,Class 2 | ||
11.40541996,10.59274384,11.90631845,4.497592473,4.532009755,4.117336922,a,Class 1 | ||
5.547732807,6.107428176,13.30160131,8.442144861,9.854871343,3.268384157,a,Class 2 | ||
2.558435481,12.36056669,7.777967112,6.812644994,8.532351866,6.71817697,a,Class 2 | ||
2.349328005,11.73919423,11.20515163,11.47196866,13.24600243,1.770939874,b,Class 1 | ||
13.34706077,13.86142631,0.291296401,0.12119829,5.885044406,8.475403207,b,Class 2 | ||
5.351503888,6.40942837,11.07531808,8.972571254,3.233818614,12.43439266,b,Class 2 | ||
6.693621558,13.96686031,1.475546478,12.35803005,0.873347546,0.688133753,c,Class 2 | ||
10.48922559,6.646089272,7.4076759,7.873827219,5.742578275,1.806450848,c,Class 2 | ||
1.365010518,0.840426698,6.044826791,12.33437799,5.33827304,14.55706457,c,Class 2 | ||
6.145883127,12.20161505,1.162956248,11.67002394,6.279495076,5.709716727,a,Class 1 | ||
12.99028641,3.448828215,4.946279072,10.87002826,14.83427318,9.154544604,c,Class 2 | ||
1.109266891,2.564645156,10.64938657,7.677215295,8.625541169,8.960849049,c,Class 1 | ||
6.891117595,13.9566784,0.952437927,6.585976751,13.16019122,7.78218351,b,Class 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from niaaml.preprocessing.imputation import SimpleImputer, impute_features | ||
import os | ||
from niaaml.data import CSVDataReader | ||
|
||
""" | ||
In this example, we show how to use one of the implemented missing-value imputers and its methods on its own. We use SimpleImputer for demonstration, but | ||
you can use any of the other implemented imputers in the same way. | ||
""" | ||
|
||
# prepare data reader using csv file | ||
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True) | ||
|
||
# instantiate SimpleImputer | ||
si = SimpleImputer() | ||
|
||
# fit the imputer on the feature at column index 6 of the dataset, transform that feature and print the result | ||
features = data_reader.get_x() | ||
si.fit(features[[6]]) | ||
f = si.transform(features[[6]]) | ||
print(f) | ||
|
||
# if you want imputers for all of the features with missing values in a dataset (together with the transformed DataFrame of features), you can use the utility function impute_features | ||
transformed_features, imputers = impute_features(features, 'SimpleImputer') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from niaaml import Pipeline | ||
from niaaml.classifiers import MultiLayerPerceptron | ||
from niaaml.preprocessing.feature_selection import VarianceThreshold | ||
from niaaml.preprocessing.feature_transform import Normalizer | ||
from niaaml.data import CSVDataReader | ||
from niaaml.preprocessing.encoding import encode_categorical_features | ||
from niaaml.preprocessing.imputation import impute_features | ||
import os | ||
import numpy | ||
import pandas | ||
|
||
""" | ||
In this example, we show how to use the Pipeline class on its own. You may use this if you want to test out a specific classification pipeline. | ||
We will use a dataset that contains categorical and numerical features with missing values. | ||
""" | ||
|
||
# prepare data reader using csv file | ||
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True) | ||
|
||
features = data_reader.get_x() | ||
|
||
# we use the utility method impute_features to get imputers for the features with missing values, but you may also instantiate and fit | ||
# imputers separately and pass them as a dictionary (as long as they are implemented as this framework suggests), keyed by column name or by column index (if there is no header in the csv) | ||
# there should be one imputer for each feature with missing values | ||
# this example uses SimpleImputer | ||
features, imputers = impute_features(features, 'SimpleImputer') | ||
|
||
# exactly the same goes for encoders | ||
_, encoders = encode_categorical_features(features, 'OneHotEncoder') | ||
|
||
# instantiate a Pipeline object | ||
pipeline = Pipeline( | ||
feature_selection_algorithm=VarianceThreshold(), | ||
feature_transform_algorithm=Normalizer(), | ||
classifier=MultiLayerPerceptron(), | ||
categorical_features_encoders=encoders, | ||
imputers=imputers | ||
) | ||
|
||
# run the pipeline optimization process (returns the fitness value and, during the process, sets the best found parameters for the classifier, feature selection algorithm and feature transform algorithm) | ||
pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 10, 50, 'ParticleSwarmAlgorithm', 'Accuracy') | ||
|
||
# run the pipeline using dummy data | ||
# you could run the pipeline before the optimization process, but you would get wrong predictions because nothing in the pipeline has been fit to the given dataset yet | ||
predicted = pipeline.run(pandas.DataFrame([[10.32440339, 3.195964543, 1.215275549, 3.741461311, 11.6736581, 6.435247906, 'a']])) | ||
|
||
# the pipeline variable contains a Pipeline object that can be used for further classification, exported as an object (that can later be loaded and used) or exported as a text file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import os | ||
from niaaml import PipelineOptimizer, Pipeline | ||
from niaaml.data import CSVDataReader | ||
|
||
""" | ||
In this example, we show how to use the PipelineOptimizer class. This example is using an instance of CSVDataReader. | ||
The instantiated PipelineOptimizer will try to assemble the best pipeline from the components that are specified in its constructor. | ||
We use a dataset with 1 categorical feature and missing values to demonstrate the use of a PipelineOptimizer instance with automatic feature encoding and imputation. | ||
""" | ||
|
||
# prepare data reader using csv file | ||
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True) | ||
|
||
# instantiate PipelineOptimizer that chooses among specified classifiers, feature selection algorithms and feature transform algorithms | ||
# OneHotEncoder is used for encoding categorical features in this example | ||
# SimpleImputer is used for imputing missing values in this example | ||
pipeline_optimizer = PipelineOptimizer( | ||
data=data_reader, | ||
classifiers=['AdaBoost', 'Bagging', 'MultiLayerPerceptron', 'RandomForest', 'ExtremelyRandomizedTrees', 'LinearSVC'], | ||
feature_selection_algorithms=['SelectKBest', 'SelectPercentile', 'ParticleSwarmOptimization', 'VarianceThreshold'], | ||
feature_transform_algorithms=['Normalizer', 'StandardScaler'], | ||
categorical_features_encoder='OneHotEncoder', | ||
imputer='SimpleImputer' | ||
) | ||
|
||
# runs the optimization process | ||
# one of the possible pipelines in this case is: SelectPercentile -> Normalizer -> RandomForest | ||
# returns the best found pipeline | ||
# the chosen fitness function and optimization algorithm are Accuracy and Particle Swarm Algorithm | ||
pipeline = pipeline_optimizer.run('Accuracy', 20, 20, 400, 400, 'ParticleSwarmAlgorithm', 'ParticleSwarmAlgorithm') | ||
|
||
# the pipeline variable contains a Pipeline object that can be used for further classification, exported as an object (that can later be loaded and used) or exported as a text file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,4 +27,4 @@ | |
] | ||
|
||
__project__ = 'niaaml' | ||
__version__ = '1.0.0rc1' | ||
__version__ = '1.0.0rc2' |
Oops, something went wrong.