-
Notifications
You must be signed in to change notification settings - Fork 2
/
encoder_pipelines.py
70 lines (51 loc) · 2.04 KB
/
encoder_pipelines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
from os import path
import pandas as pd
import numpy as np
import json
from dsbox.datapreprocessing.cleaner import Encoder
from dsbox.datapreprocessing.cleaner.encoder import Params
from sklearn.preprocessing import Imputer
from sklearn.ensemble import BaggingClassifier
# ta1-pipeline-config.json structure:
# {
#     "problem_schema": "path/to/problem_schema.json",
#     "dataset_schema": "path/to/dataset_schema.json",
#     "data_root": "path/to/data/root/folder/",
#     "output_file": "path/to/output/file"
# }
# Load the JSON configuration file; it is read from the current working
# directory and supplies the paths used by the rest of this script.
with open("ta1-pipeline-config.json", 'r') as inputFile:
    jsonCall = json.load(inputFile)

# Load the JSON dataset description file named by the configuration.
# The `with` statement closes each file on exit, so no explicit close() call
# is needed.
with open(jsonCall['dataset_schema'], 'r') as inputFile:
    datasetSchema = json.load(inputFile)
# Read the three gzip-compressed CSV inputs from the folder given by the
# configuration's 'data_root' entry, in the order: training features,
# training targets, test features.
dataRoot = jsonCall['data_root']
trainData, trainTargets, testData = (
    pd.read_csv(path.join(dataRoot, csvName))
    for csvName in ('trainData.csv.gz', 'trainTargets.csv.gz', 'testData.csv.gz')
)

# Preview what was loaded: the first rows of each frame plus the raw
# 'Class' target values.
print(trainData.head())
print(trainTargets.head())
print(np.asarray(trainTargets['Class']))
print(testData.head())
# Initialize the DSBox Encoder and fit it on the training features.
enc = Encoder()
enc.set_training_data(inputs=trainData)
enc.fit()
print(type(enc.get_params()))
print(enc.get_params())

imputer = Imputer()
model = BaggingClassifier()

# Training path: encode -> impute (the imputer is fitted here, on training
# data only) -> fit the classifier on the 'Class' target column.
print(trainData.columns)
encodedTrainData = enc.produce(inputs=trainData)
processedTrainData = imputer.fit_transform(encodedTrainData)
trainedModel = model.fit(processedTrainData, np.asarray(trainTargets['Class']))
print(encodedTrainData.columns)

# Prediction path: the test data must go through the imputer that was fitted
# on the training data. Use transform(), not fit_transform(): refitting on the
# test set would fill missing values with test-set statistics (data leakage)
# and could yield a feature matrix that no longer matches what the model was
# trained on.
encodedTestData = enc.produce(inputs=testData)
processedTestData = imputer.transform(encodedTestData)
predictedTargets = trainedModel.predict(processedTestData)
print(predictedTargets)
# Write the predicted targets to the location specified in the JSON
# configuration file, as a two-column CSV: 'd3mIndex' and 'Class'.
# The handle is opened with newline='' because pandas writes its own line
# terminators; without it, platforms that translate newlines can emit
# doubled line endings. DataFrame.to_csv returns None when given a target,
# so its result is not kept.
# NOTE(review): the 'd3mIndex' column is the DataFrame's default positional
# index (0..n-1), not an index read from testData -- confirm that this is
# what the consumer of the output file expects.
with open(jsonCall['output_file'], 'w', newline='') as outputFile:
    pd.DataFrame(predictedTargets).to_csv(
        outputFile, index_label='d3mIndex', header=['Class']
    )