
Commit

Merge pull request #30 from lukapecnik/feature-imputation-implemented
feature missing values imputation
lukapecnik authored Dec 8, 2020
2 parents e4b269d + 2c40071 commit 9274f96
Showing 20 changed files with 684 additions and 77 deletions.
100 changes: 100 additions & 0 deletions examples/example_files/dataset_categorical_missing.csv
@@ -0,0 +1,100 @@
11.18628795,8.553093649,2.337681394,7.740221556,2.076186822,12.53763102,a,Class 1
10.14583771,8.771100076,9.617580733,11.28898328,3.731375463,3.61496461,a,Class 1
7.608378163,9.215208082,12.21721984,0.217171385,10.84786072,0.068879981,b,Class 1
9.741381036,4.373282412,0.46661895,9.670759021,3.769897645,3.166427585,b,Class 1
6.326460397,4.950181274,0.955970145,13.27044224,13.3803711,6.7042629,b,Class 2
6.614221714,5.139908792,6.322918249,7.613060017,2.965112605,8.757576779,c,Class 1
9.77079694,13.22809724,12.27910247,4.486662894,1.637543898,3.145057057,b,Class 2
5.986399993,3.575034439,4.436729038,10.27403804,6.085388637,9.638152072,a,Class 2
9.306449023,0.419825519,5.207460405,3.352657815,8.955085451,13.74251068,c,Class 2
12.85278467,0.554461277,13.25029775,7.70837554,12.81365721,8.448635991,c,Class 2
10.32440339,3.195964543,1.215275549,3.741461311,11.6736581,6.435247906,a,Class 1
12.45890387,6.754763706,3.568497268,6.787252535,4.528132031,11.24413956,,Class 1
4.92766004,8.894839385,1.599234587,6.491628957,1.003488457,8.990811618,a,Class 1
14.62465143,,12.02718728,3.868350191,5.604315633,13.4574281,a,Class 1
5.548176135,6.507710991,0.798813746,13.28445746,7.037019737,13.71370375,b,Class 2
4.414357119,9.250023962,8.716711231,7.733484723,3.661143919,14.63698956,b,Class 1
7.794274494,0.554776276,6.717665128,3.422141362,12.80249591,3.744667173,b,Class 2
10.46207608,14.78604433,11.14501886,13.28194261,13.35036026,8.342502238,c,Class 1
11.38516345,11.33272181,1.919660335,4.978216028,8.668419104,6.052792392,c,Class 2
11.19640261,10.3436337,0.939034115,14.91069148,7.269366966,12.53406348,c,Class 2
11.06549373,,13.96718884,12.53348993,4.476687297,11.87992708,a,Class 1
5.721763439,10.70136406,4.677923895,12.04602629,6.630499903,13.04574224,c,Class 2
14.87203026,4.717515614,12.16090195,10.17484858,1.258457287,3.762734746,c,Class 1
9.517250388,14.61073986,10.55186687,12.13409641,4.195938316,14.9085867,b,Class 2
5.490151571,11.07922707,2.912349404,11.26243041,6.909836863,12.93169762,b,Class 1
3.597959325,7.3606205,1.89533481,8.407778067,12.94742999,9.956797585,a,Class 2
13.99187099,6.16144391,4.430074749,10.48992388,6.724889945,11.63545045,a,Class 1
13.87167852,11.47473231,12.91040409,5.329482463,12.41092153,9.923540019,c,Class 2
6.884021,0.536048784,13.77495679,6.51467553,4.70254023,8.780237509,,Class 2
2.208914531,12.70665676,13.62555578,9.598180651,7.438779306,7.81610053,b,Class 1
8.659531425,8.209053873,6.907242925,9.847209807,7.643627147,1.24454444,b,Class 1
1.448257785,10.22497998,1.269324615,3.714269901,13.03906827,2.870250771,b,Class 1
10.19241437,,8.886001739,5.828695389,2.605134041,1.19188785,b,Class 1
7.777359497,10.96783191,4.890083745,5.284618971,4.411218163,8.605757632,b,Class 2
1.056011622,7.844004878,10.65020289,4.234763934,6.43943205,1.262495126,a,Class 2
7.648844009,10.14403542,9.539688734,13.66072313,0.330411845,5.610949661,a,Class 1
4.321962749,5.604955856,7.525456962,2.795185293,0.557651224,9.096120183,c,Class 2
7.580303996,14.13657189,2.208779404,12.65807527,8.616258995,14.2742891,c,Class 1
6.617679318,12.17838447,6.219814209,9.278219597,2.627013838,10.26198055,a,Class 1
10.42636852,11.37466476,1.605370071,13.38238859,13.4486372,0.658796404,a,Class 1
10.52477845,5.275716432,11.83515271,0.617870822,0.921374579,8.348557261,b,Class 2
12.42122246,0.012249697,7.74555252,12.02705019,3.442939685,7.110063876,b,Class 1
2.721191099,14.56211777,12.2194075,8.457083772,1.843488398,8.775189039,b,Class 2
11.93193151,7.265208519,0.45505228,4.217468632,13.6978792,11.24703349,c,Class 2
0.493376888,0.414245824,5.492426678,3.926579473,5.14363276,9.3274729,b,Class 2
11.99067679,2.224771613,14.51607498,13.038479,2.048398725,10.21056055,a,Class 2
4.39009476,2.715892095,13.65208099,1.276459275,3.1947636,5.738578547,c,Class 2
9.517850998,7.870570236,11.66133708,3.158457987,0.994101959,1.760078291,c,Class 2
1.67226824,1.257990444,13.35645397,6.432977593,8.173353149,10.47964661,a,Class 2
11.40110217,3.755922456,14.78250639,9.12235283,5.463228968,8.004612121,a,Class 1
0.51828403,7.467344863,0.403372329,1.324884922,6.204846153,0.397427501,a,Class 1
8.56890712,3.700288257,14.23433924,1.836880065,0.168958671,1.260377664,a,Class 2
0.927565538,0.256079044,8.244925899,10.78666638,13.47379713,0.009413535,b,Class 2
11.12587642,0.512971929,10.37022985,9.927926894,8.924001776,5.446182158,b,Class 1
8.296166587,3.881765358,12.50788295,6.751744679,3.270039419,13.16076438,b,Class 2
5.342454277,14.06289262,6.052115238,4.60660751,14.92130785,9.251614117,c,Class 1
10.76413906,3.108242418,7.407200789,0.124640979,6.315064545,6.974791847,c,Class 2
11.60334067,7.869475964,14.06285885,3.010169778,0.21862987,10.7119562,c,Class 1
0.675107878,10.62554075,9.16909931,1.51930099,9.054828927,8.018314854,a,Class 2
0.136391516,12.15438651,13.10410369,4.54379884,1.467941336,3.708272962,c,Class 2
11.20314646,7.917973833,7.205146518,14.47482833,5.385158132,3.962870806,c,Class 1
12.96777011,7.276652989,12.46734536,8.774357457,14.49755617,1.021454967,b,Class 1
7.259751863,5.37753719,8.753494011,1.105904802,7.423842186,7.060245922,b,Class 1
5.550401633,10.28344926,8.849268232,10.35224505,11.42901447,2.015178403,a,Class 2
8.724250626,5.144158413,8.881589983,5.654339781,3.348767179,7.567443724,a,Class 2
1.505308287,8.327887318,5.967980754,5.861512631,1.942362782,12.08752455,c,Class 1
11.95352828,13.83709019,1.484043207,14.9990425,0.358430191,0.936128377,b,Class 2
14.1424292,5.653091086,14.75697191,6.534335531,14.59624216,4.217427045,b,Class 1
2.096214566,13.41972927,5.026757888,10.15382225,10.69199037,8.119000359,b,Class 2
4.658969577,8.152082829,10.69897004,3.807611391,9.432866697,7.469063458,b,Class 1
9.880365248,6.857500577,9.50270486,6.185811124,6.801593649,1.426651215,b,Class 2
11.38053935,5.64968146,13.16726558,3.969547861,3.409401613,6.754962952,b,Class 1
2.549597194,3.81373774,3.381883424,12.54165021,7.238285696,5.014469506,a,Class 1
2.149956112,14.18695148,4.495586504,1.193989236,0.629565843,10.71726557,a,Class 1
0.633065458,10.57661883,3.911208047,4.737683148,10.67249983,11.44130896,c,Class 2
10.98958055,8.538690522,2.221702088,7.94460522,7.268542052,13.0506614,c,Class 2
14.05448371,5.906069731,11.02070992,14.78464345,1.395098041,12.45034592,a,Class 1
4.849203233,14.92593789,14.83374088,13.33589083,10.91265222,6.015994872,a,Class 2
1.788538553,1.189933547,13.37927743,7.078881338,0.115268965,1.102757553,b,Class 1
1.520260264,4.390949317,8.961363089,9.116191933,4.902286012,13.82917543,b,Class 2
5.143515013,1.626830627,5.011771888,14.53607373,9.254769126,5.987742339,b,Class 2
0.383485623,5.893120492,2.198084919,3.607295516,11.2909701,14.19259294,c,Class 1
3.543625982,1.817300049,12.79701902,9.150819857,4.270171936,1.046802952,b,Class 2
9.014121301,8.894615211,14.32697976,12.05396604,6.610724668,12.9453385,a,Class 1
2.178293829,11.00240774,3.4661733,4.216419592,14.36522422,3.571201671,c,Class 1
9.218901263,8.682081223,12.48795288,8.796277452,13.72799658,1.414017549,c,Class 2
1.417376539,13.2588434,13.00750995,9.108292367,5.332117011,3.7214796,a,Class 2
11.40541996,10.59274384,11.90631845,4.497592473,4.532009755,4.117336922,a,Class 1
5.547732807,6.107428176,13.30160131,8.442144861,9.854871343,3.268384157,a,Class 2
2.558435481,12.36056669,7.777967112,6.812644994,8.532351866,6.71817697,a,Class 2
2.349328005,11.73919423,11.20515163,11.47196866,13.24600243,1.770939874,b,Class 1
13.34706077,13.86142631,0.291296401,0.12119829,5.885044406,8.475403207,b,Class 2
5.351503888,6.40942837,11.07531808,8.972571254,3.233818614,12.43439266,b,Class 2
6.693621558,13.96686031,1.475546478,12.35803005,0.873347546,0.688133753,c,Class 2
10.48922559,6.646089272,7.4076759,7.873827219,5.742578275,1.806450848,c,Class 2
1.365010518,0.840426698,6.044826791,12.33437799,5.33827304,14.55706457,c,Class 2
6.145883127,12.20161505,1.162956248,11.67002394,6.279495076,5.709716727,a,Class 1
12.99028641,3.448828215,4.946279072,10.87002826,14.83427318,9.154544604,c,Class 2
1.109266891,2.564645156,10.64938657,7.677215295,8.625541169,8.960849049,c,Class 1
6.891117595,13.9566784,0.952437927,6.585976751,13.16019122,7.78218351,b,Class 1
7 changes: 6 additions & 1 deletion examples/factories.py
@@ -3,6 +3,7 @@
from niaaml.preprocessing.feature_transform import FeatureTransformAlgorithmFactory
from niaaml.fitness import FitnessFactory
from niaaml.preprocessing.encoding import EncoderFactory
from niaaml.preprocessing.imputation import ImputerFactory

"""
In this example, we show how to use all of the implemented factories to create new object instances using their class names. You may also
@@ -15,6 +16,7 @@
fta_factory = FeatureTransformAlgorithmFactory()
f_factory = FitnessFactory()
e_factory = EncoderFactory()
i_factory = ImputerFactory()

# get an instance of the MultiLayerPerceptron class
mlp = classifier_factory.get_result('MultiLayerPerceptron')
@@ -31,4 +33,7 @@
#get an instance of the OneHotEncoder class
ohe = e_factory.get_result('OneHotEncoder')

# variables mlp, pso, normalizer, precision and ohe contain instances of the classes with the passed names
#get an instance of the SimpleImputer class
imp = i_factory.get_result('SimpleImputer')

# variables mlp, pso, normalizer, precision, ohe and imp contain instances of the classes with the passed names
23 changes: 23 additions & 0 deletions examples/feature_imputing.py
@@ -0,0 +1,23 @@
from niaaml.preprocessing.imputation import SimpleImputer, impute_features
import os
from niaaml.data import CSVDataReader

"""
In this example, we show how to use an implemented missing-value imputer and its methods individually. We use SimpleImputer for demonstration, but
you can use any of the implemented imputers in the same way.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True)

# instantiate SimpleImputer
si = SimpleImputer()

# fit the imputer on the feature at index 6 in the dataset, transform it and print the result
features = data_reader.get_x()
si.fit(features[[6]])
f = si.transform(features[[6]])
print(f)

# if you wish to get imputers for all of the features with missing values in a dataset (along with the transformed DataFrame of features), you may use the utility method impute_features
transformed_features, imputers = impute_features(features, 'SimpleImputer')
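
A rough sketch (not part of the commit) of what can be done with the values this utility returns. It only assumes what the example above already shows — transformed_features is a pandas DataFrame and each imputer exposes the same transform method demonstrated for SimpleImputer — plus the assumption, taken from the comment in optimize_run_pipeline_missing_values.py, that the imputers are keyed by column index.

# sketch: verify that the transformed DataFrame no longer contains missing values
print(transformed_features.isnull().sum())  # expect zero missing values in every column

# sketch: reuse the imputer fitted on feature 6 on hypothetical new data,
# assuming the returned imputers are keyed by column index
import pandas
new_values = pandas.DataFrame([[None], ['c'], ['a']], columns=[6])
imputer_6 = imputers.get(6) if isinstance(imputers, dict) else None
if imputer_6 is not None:
    print(imputer_6.transform(new_values))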
47 changes: 47 additions & 0 deletions examples/optimize_run_pipeline_missing_values.py
@@ -0,0 +1,47 @@
from niaaml import Pipeline
from niaaml.classifiers import MultiLayerPerceptron
from niaaml.preprocessing.feature_selection import VarianceThreshold
from niaaml.preprocessing.feature_transform import Normalizer
from niaaml.data import CSVDataReader
from niaaml.preprocessing.encoding import encode_categorical_features
from niaaml.preprocessing.imputation import impute_features
import os
import numpy
import pandas

"""
In this example, we show how to use the Pipeline class individually. You may use this if you want to test out a specific classification pipeline.
We will use a dataset that contains categorical and numerical features with missing values.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True)

features = data_reader.get_x()

# we use the utility method impute_features to get imputers for the features with missing values, but you may also instantiate and fit
# imputers separately and pass them as a dictionary (as long as they are implemented as this framework suggests), with column names or indices (if there is no header in the CSV) as keys
# there should be as many imputers as there are features with missing values
# this example uses SimpleImputer
features, imputers = impute_features(features, 'SimpleImputer')

# exactly the same goes for encoders
_, encoders = encode_categorical_features(features, 'OneHotEncoder')

# instantiate a Pipeline object
pipeline = Pipeline(
feature_selection_algorithm=VarianceThreshold(),
feature_transform_algorithm=Normalizer(),
classifier=MultiLayerPerceptron(),
categorical_features_encoders=encoders,
imputers=imputers
)

# run the pipeline optimization process (returns a fitness value, but sets the best parameters for the classifier, feature selection algorithm and feature transform algorithm during the process)
pipeline.optimize(data_reader.get_x(), data_reader.get_y(), 10, 50, 'ParticleSwarmAlgorithm', 'Accuracy')

# run the pipeline using dummy data
# you could run the pipeline before the optimization process, but you would get wrong predictions, as nothing in the pipeline has been fitted to the given dataset
predicted = pipeline.run(pandas.DataFrame([[10.32440339, 3.195964543, 1.215275549, 3.741461311, 11.6736581, 6.435247906, 'a']]))

# the pipeline variable contains a Pipeline object that can be used for further classification, exported as an object (that can later be loaded and used) or exported as a text file
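
The comment in this file notes that imputers can also be instantiated and fitted manually and passed to the Pipeline as a dictionary. A minimal sketch of that path follows (not part of the commit); it assumes the dictionary keys are column indices (this CSV has no header) and that a SimpleImputer fitted on a single-column DataFrame, exactly as in examples/feature_imputing.py, is a valid dictionary value.

from niaaml.preprocessing.imputation import SimpleImputer

raw_features = data_reader.get_x()
manual_imputers = {}
# in this dataset, columns 1 and 6 are the ones containing missing values
for column in raw_features.columns[raw_features.isnull().any()]:
    imputer = SimpleImputer()
    imputer.fit(raw_features[[column]])
    manual_imputers[column] = imputer

# manual_imputers could then be passed to Pipeline(..., imputers=manual_imputers)
# in place of the dictionary returned by impute_features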
32 changes: 32 additions & 0 deletions examples/run_pipeline_optimizer_csv_data_missing.py
@@ -0,0 +1,32 @@
import os
from niaaml import PipelineOptimizer, Pipeline
from niaaml.data import CSVDataReader

"""
In this example, we show how to use the PipelineOptimizer class. This example is using an instance of CSVDataReader.
The instantiated PipelineOptimizer will try to assemble the best pipeline from the components that are specified in its constructor.
We use a dataset with one categorical feature and missing values to demonstrate the use of a PipelineOptimizer instance with automatic feature encoding and imputation.
"""

# prepare data reader using csv file
data_reader = CSVDataReader(src=os.path.dirname(os.path.abspath(__file__)) + '/example_files/dataset_categorical_missing.csv', has_header=False, contains_classes=True)

# instantiate PipelineOptimizer that chooses among specified classifiers, feature selection algorithms and feature transform algorithms
# OneHotEncoder is used for encoding categorical features in this example
# SimpleImputer is used for imputing missing values in this example
pipeline_optimizer = PipelineOptimizer(
data=data_reader,
classifiers=['AdaBoost', 'Bagging', 'MultiLayerPerceptron', 'RandomForest', 'ExtremelyRandomizedTrees', 'LinearSVC'],
feature_selection_algorithms=['SelectKBest', 'SelectPercentile', 'ParticleSwarmOptimization', 'VarianceThreshold'],
feature_transform_algorithms=['Normalizer', 'StandardScaler'],
categorical_features_encoder='OneHotEncoder',
imputer='SimpleImputer'
)

# run the optimization process
# one of the possible pipelines in this case is: SelectPercentile -> Normalizer -> RandomForest
# returns the best found pipeline
# the chosen fitness function and optimization algorithm are Accuracy and Particle Swarm Algorithm
pipeline = pipeline_optimizer.run('Accuracy', 20, 20, 400, 400, 'ParticleSwarmAlgorithm', 'ParticleSwarmAlgorithm')

# the pipeline variable contains a Pipeline object that can be used for further classification, exported as an object (that can later be loaded and used) or exported as a text file
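
The comment above mentions that the resulting Pipeline object can be exported and later loaded. A short hedged sketch of that workflow (not part of the commit), assuming the export, export_text and Pipeline.load methods described in NiaAML's other examples and documentation; the file names are arbitrary.

# sketch: persist the optimized pipeline and load it back later
pipeline.export('exported_pipeline.ppln')       # object export that can be loaded again
pipeline.export_text('exported_pipeline.txt')   # human-readable text export

loaded_pipeline = Pipeline.load('exported_pipeline.ppln')

# the loaded pipeline can be used for classification just like the original one
import pandas
predicted = loaded_pipeline.run(pandas.DataFrame([[10.32440339, 3.195964543, 1.215275549, 3.741461311, 11.6736581, 6.435247906, 'a']]))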
2 changes: 1 addition & 1 deletion niaaml/__init__.py
@@ -27,4 +27,4 @@
]

__project__ = 'niaaml'
__version__ = '1.0.0rc1'
__version__ = '1.0.0rc2'
