# main.py
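"""Extract production-flow statistics from the chunked train_*.csv files.

The input layout is assumed to match the Kaggle "Bosch Production Line
Performance" dataset: three row-aligned CSVs (numeric, categorical, date)
keyed by part Id, with columns named Line_Station_Feature (e.g. L0_S0_F0
for a measurement, L0_S0_D1 for its timestamp) and a binary Response column
marking defective parts. The script aggregates per-feature, per-edge and
per-path defect counts and writes them out as JSON for a d3.js front end.
"""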
import hashlib
import json
import warnings

import pandas

warnings.filterwarnings('ignore')  # silence all warnings, e.g. pandas DtypeWarning on the mixed-type CSVs
chunksize = 10 ** 4  # chunk size for pandas, keeps each data load within memory constraints
path_data = {}
edge_data = {}
feature_data = {}
page = 0  # chunk counter
reader_numeric = pandas.read_csv('data/train/train_numeric.csv', chunksize=chunksize)
reader_categorical = pandas.read_csv('data/train/train_categorical.csv', chunksize=chunksize)
reader_date = pandas.read_csv('data/train/train_date.csv', chunksize=chunksize)
reader = zip(reader_numeric, reader_categorical, reader_date)  # combine the 3 datasets
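# NOTE: zipping the three chunk readers assumes all three CSVs list the same
# parts in the same row order, so chunk N of each file describes the same
# parts (true of the Bosch training files, where rows are aligned by Id).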
for numeric, categorical, date in reader:
    print("page " + str(page))
    page = page + 1
    numeric_chunk = pandas.DataFrame(numeric)
    categorical_chunk = pandas.DataFrame(categorical)
    date_chunk = pandas.DataFrame(date)
    cat_columns = list(categorical_chunk.columns.values)  # store lists of categorical and numeric feature labels
    num_columns = list(numeric_chunk.columns.values)
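    # Date columns are named like L0_S0_D1 (line L0, station S0, date index 1);
    # the measurement taken at that timestamp lives in the numeric or categorical
    # file under the neighbouring F-indices (e.g. L0_S0_F0), which the lookups
    # below rely on.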
    for index, part in date_chunk.iterrows():
        part_id = int(part['Id'])
        old_timestamp_indx = 0
        path = []
        defective = int(numeric_chunk['Response'][index])
        # iterate over every part in the date file, using the date features to look up
        # the associated numeric and categorical features belonging to each timestamp
        for feature, timestamp in part.items():
            if feature != "Unnamed: 0" and feature != "Id":
                line = feature.split("_")[0]
                station = feature.split("_")[1]
                if pandas.notnull(timestamp):
                    split_feature = feature.split("D")
                    timestamp_indx = int(split_feature[1])  # index of the timestamp feature, used as the lookup key for cat and num features
                    # cat and num features are timestamped by the date feature index immediately
                    # following the cat or num feature index, so iterate backwards from the current
                    # timestamp and look up cat and numeric features until both are null or until
                    # the previous timestamp is reached
                    for i in reversed(range(old_timestamp_indx + 1, timestamp_indx)):
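                        # e.g. for date column "L0_S0_D3" with old_timestamp_indx == 0 this
                        # checks L0_S0_F2 and then L0_S0_F1 (hypothetical column names, shown
                        # only to illustrate the naming scheme)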
                        feature_indx = split_feature[0] + "F" + str(i)
                        if feature_indx in cat_columns:
                            feature_val = categorical_chunk[feature_indx][index]
                            feature_type = "categorical"
                        elif feature_indx in num_columns:
                            feature_val = numeric_chunk[feature_indx][index]
                            feature_type = "numeric"
                        else:
                            feature_val = None
                        if pandas.notnull(feature_val):
                            if feature_indx in feature_data:
                                feature_data[feature_indx]["total_count"] += 1
                                feature_data[feature_indx]["defective_count"] += defective
                                feature_data[feature_indx]["defective_rate"] = feature_data[feature_indx]["defective_count"] / feature_data[feature_indx]["total_count"]
                                feature_data[feature_indx]["values"].append([timestamp, feature_val, defective])
                            else:
                                feature_data[feature_indx] = {"total_count": 1, "defective_count": defective, "defective_rate": defective,
                                                              "values": [[timestamp, feature_val, defective]],  # list of lists, matching the append above
                                                              "feature_type": feature_type, "station": station, "line": line,
                                                              "feature": feature_indx, "example_val": feature_val}
                            path.append({"feature": feature_indx, "timestamp": timestamp, "station": station, "value": feature_val,
                                         "feature_type": feature_type, "defective": defective})
                        else:
                            old_timestamp_indx = timestamp_indx
                            break
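        # the features visited by this part, ordered by timestamp, form its path
        # through the production line; identical paths are grouped under an md5
        # hash of the concatenated feature names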
        sorted_path = sorted(path, key=lambda k: k['timestamp'])
        if sorted_path:
            path_string = "".join([step["feature"] for step in sorted_path])
            path_hash = hashlib.md5(path_string.encode()).hexdigest()
            if path_hash in path_data:
                path_data[path_hash]["total_count"] += 1
                path_data[path_hash]["defective_count"] += sorted_path[0]["defective"]
                path_data[path_hash]["defective_rate"] = path_data[path_hash]["defective_count"] / path_data[path_hash]["total_count"]
            else:
                path_data[path_hash] = {"total_count": 1, "defective_count": sorted_path[0]["defective"],
                                        "defective_rate": sorted_path[0]["defective"],
                                        "path": [step["feature"] for step in sorted_path]}
            for i in range(len(sorted_path) - 1):
                edge = sorted_path[i]["feature"] + "-" + sorted_path[i + 1]["feature"]
                if edge in edge_data:
                    edge_data[edge]["total_count"] += 1
                    edge_data[edge]["defective_count"] += sorted_path[i]["defective"]
                    edge_data[edge]["defective_rate"] = edge_data[edge]["defective_count"] / edge_data[edge]["total_count"]
                else:
                    edge_data[edge] = {"total_count": 1, "defective_count": sorted_path[i]["defective"],
                                       "defective_rate": sorted_path[i]["defective"],
                                       "start_feature": sorted_path[i]["feature"], "end_feature": sorted_path[i + 1]["feature"]}
edge_list = list(edge_data.values())  # convert to lists of dicts for d3.js consumption
path_list = sorted(path_data.values(), key=lambda k: k['total_count'], reverse=True)
# drop the raw per-part "values" lists to reduce the dataset for visualization
feature_list = sorted([{k: v for k, v in data.items() if k != 'values'} for data in feature_data.values()],
                      key=lambda k: k['feature'])
with open('edge_data.json', 'w') as outfile:
    json.dump(edge_list, outfile)
with open('path_data.json', 'w') as outfile:
    json.dump(path_list, outfile)
with open('feature_data.json', 'w') as outfile:
    json.dump(feature_list, outfile)
with open('data/all_feature_data.json', 'w') as outfile:
    json.dump(feature_data, outfile)
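# The first three output files are flat JSON lists for the d3.js front end;
# an edge record, for example, looks like (counts hypothetical):
#   {"total_count": 42, "defective_count": 3, "defective_rate": 0.0714,
#    "start_feature": "L0_S0_F0", "end_feature": "L0_S0_F2"}
# all_feature_data.json keeps the full feature_data dict, including the raw
# per-part "values" lists, for offline analysis.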