-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRF2.py
123 lines (98 loc) · 3.83 KB
/
RF2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import csv
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
# Number of consecutive CSV rows that describe a single gene.
ROWS_PER_GENE = 100


def load_table(path):
    """Load a numeric CSV, skipping the header row and dropping the Id column.

    The file is read with ``np.loadtxt`` using "," as delimiter. ``ndmin=2``
    keeps the result two-dimensional even when the file holds a single data
    row, so the column slice is always valid.
    """
    table = np.loadtxt(path, delimiter=",", skiprows=1, ndmin=2)
    return table[:, 1:]


def reshape_by_gene(rows, rows_per_gene=ROWS_PER_GENE):
    """Flatten each block of *rows_per_gene* consecutive rows into one vector.

    A (n_genes * rows_per_gene, n_cols) table becomes a
    (n_genes, rows_per_gene * n_cols) matrix; each output row is the
    row-major ravel of one block, i.e. the same values that splitting the
    table into equal blocks and raveling every block would produce.

    Raises:
        ValueError: if *rows* is not 2-D, *rows_per_gene* is not positive,
            or the row count is not a multiple of *rows_per_gene*.
    """
    rows = np.asarray(rows)
    if rows.ndim != 2:
        raise ValueError("expected a 2-D table, got %d dimension(s)" % rows.ndim)
    if rows_per_gene <= 0:
        raise ValueError("rows_per_gene must be positive")
    n_rows, n_cols = rows.shape
    if n_rows % rows_per_gene:
        raise ValueError(
            "%d rows cannot be divided into blocks of %d"
            % (n_rows, rows_per_gene)
        )
    # Integer division: the gene count is used as an array dimension.
    return rows.reshape(n_rows // rows_per_gene, rows_per_gene * n_cols)


def submission_lines(probabilities, first_id=1):
    """Yield the lines of the submission CSV, header first.

    Every prediction is paired with a sequential GeneId starting at
    *first_id* and rendered with ``str`` formatting.
    """
    yield "GeneId,Prediction\n"
    for gene_id, probability in enumerate(probabilities, start=first_id):
        yield "%d,%s\n" % (gene_id, probability)


def main(data_path="Data", output_path="rf.csv"):
    """Train a random forest on the gene data and write the predictions.

    Reads ``x_train.csv``, ``x_test.csv`` and ``y_train.csv`` from
    *data_path*, fits a default ``RandomForestClassifier`` and writes the
    second ``predict_proba`` column for every test gene to *output_path*.
    """
    print("Loading data...")
    x_train = load_table(os.path.join(data_path, "x_train.csv"))
    x_test = load_table(os.path.join(data_path, "x_test.csv"))
    y_train = load_table(os.path.join(data_path, "y_train.csv"))
    print ("All files loaded. Preprocessing...")

    # Every ROWS_PER_GENE rows correspond to one gene; flatten each block
    # into a single feature vector.
    x_train = reshape_by_gene(x_train)
    x_test = reshape_by_gene(x_test)
    print("Train / test data has %d / %d genes." %
          (x_train.shape[0], x_test.shape[0]))

    # The labels arrive as a single-column table; the classifier wants 1-D.
    y_train = np.ravel(y_train)

    # Per the original notes, x_train should be 15485 x 500, x_test
    # 3871 x 500 and y_train a 15485-long vector.
    print("x_train shape is %s" % str(x_train.shape))
    print("y_train shape is %s" % str(y_train.shape))
    print("x_test shape is %s" % str(x_test.shape))

    print('Data preprocessing done...')
    print("Next steps FOR YOU:")
    print("-" * 30)
    print("1. Define a classifier using sklearn")
    print("2. Assess its accuracy using cross-validation (optional)")
    print("3. Fine tune the parameters and return to 2 until happy (optional)")
    print("4. Create submission file. Should be similar to y_train.csv.")
    print("5. Submit at kaggle.com and sit back.")

    # NOTE: to estimate performance locally, hold out part of the training
    # set with train_test_split and score the held-out probabilities with
    # roc_auc_score. train_test_split lives in sklearn.model_selection in
    # current scikit-learn releases.
    model = RandomForestClassifier()
    model.fit(x_train, y_train)
    y_pred_proba = model.predict_proba(x_test)
    print(y_pred_proba.shape)

    # Column 1 is submitted as the prediction (presumably the positive
    # class; assumes the labels are 0/1 - confirm against model.classes_).
    # The context manager guarantees the file is flushed and closed.
    with open(output_path, "w") as csv_file:
        csv_file.writelines(submission_lines(y_pred_proba[:, 1]))


if __name__ == '__main__':
    main()