-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path7_calculate_cycles.py
186 lines (153 loc) · 6.87 KB
/
7_calculate_cycles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import csv
import sys
import time
import datetime
from collections import defaultdict
from struct import *
import bisect
from utilwebisadb import set_csv_field_size
def load_dicts_from_csv(csv_file):
node_to_node_id = {}
node_id_to_succ_ids_and_edge_info = defaultdict(list)
i = 0
with open(csv_file) as f:
reader = csv.reader((line.replace('\0','') for line in f))
for row in reader:
instance = (row[6], row[7], row[8])
clazz = (row[9], row[10], row[11])
if instance == clazz: # remove all reflexive edges because these cycles are always true i.e. city is a city, action is a action, answer is a answer
continue
instance_id = node_to_node_id.get(instance)
if instance_id == None:
instance_id = i
node_to_node_id[instance] = i
i += 1
clazz_id = node_to_node_id.get(clazz)
if clazz_id == None:
clazz_id = i
node_to_node_id[clazz] = i
i += 1
node_id_to_succ_ids_and_edge_info[instance_id].append(pack('LLL', clazz_id, int(row[0]), int(float(row[3])))) # neighbour id, edge id, edge/relation frequency
return node_to_node_id.values(), node_id_to_succ_ids_and_edge_info
def move(element, first_set, second_set):
first_set.remove(element)
second_set.add(element)
#http://stackoverflow.com/questions/26840413/insert-a-custom-object-in-a-sorted-list
#http://stackoverflow.com/questions/27672494/using-bisect-insort-with-key
class MyElement:
def __init__(self, id, freq, index):
self.id = id
self.freq = freq
self.index = index
def __lt__(self, other):
return self.freq < other.freq
def __repr__(self):
return 'MyElement({}, {}, {})'.format(self.id, self.freq, self.index)
class Min_Max_of_Sliced_List:
def __init__(self):
self.list = []
self.sortedList = []
def push(self, id, freq):
if id == None:
return
self.list.append((id, freq))
bisect.insort_left(self.sortedList, MyElement(id, freq, len(self.list)))
def pop(self):
if len(self.list) == 0:
return
pop_id, pop_freq = self.list.pop()
#delete in sorted list
i = bisect.bisect_left(self.sortedList, MyElement(0, pop_freq, 0))
while True:
if self.sortedList[i].id == pop_id:
del self.sortedList[i]
break
i += 1
def find_min_max_from_index(self, index):
min, max = 0,0
for item in self.sortedList:
if item.index >= index:
min = item
break
for item in reversed(self.sortedList):
if item.index >= index:
max = item
break
return min.id, min.freq, max.id, max.freq
def dfs_iterative(nodes, succ, out_file, log): #G):
white = set() # unvisited
gray = set() # on the stack
black = set() # = finished (we backtracked from it, seen everywhere we can reach from it)
total_length = len(nodes)
for node in nodes:
white.add(node)
log_counter = 0
with open(out_file, "w") as out:
while len(white) > 0:
start_node = next(iter(white))
# true means enter function , false means returning from the function/recusrion see
# http://stackoverflow.com/questions/21952770/how-to-convert-this-non-tail-recursion-function-to-a-loop-or-a-tail-recursion-ve
stack = [(False, start_node, (None, None)), (True, start_node, (None, None))]
call_stack = []
call_stack_dict = {}
sorted_stack = Min_Max_of_Sliced_List()
while stack:
(enter_or_return, current, (current_edge_id, current_edge_freq)) = stack.pop()
if current in black:#already observed
continue
if enter_or_return: #entering the function
move(current, white, gray)
call_stack.append(current)
call_stack_dict[current] = len(call_stack) - 1
sorted_stack.push(current_edge_id, current_edge_freq)
for packed in succ[current]:
neighbor, edge_id, freq = unpack('LLL',packed)
if neighbor in white:
stack.append((False, neighbor, (None, None)))
stack.append((True, neighbor, (edge_id, freq)))
elif neighbor in gray:
#path = call_stack[call_stack_dict[neighbor]:]
#path.append(neighbor)
sorted_stack.push(edge_id,freq)
min_id, min_freq,max_id,max_freq = sorted_stack.find_min_max_from_index(call_stack_dict[neighbor] + 1)
sorted_stack.pop()
out.write('no,{}\nyes,{}\n'.format(min_id, max_id))
else: #returning back from the function
move(current, gray, black)
del call_stack_dict[call_stack.pop()]
sorted_stack.pop()
log_counter += 1
log.write("{} - out {}/{}\n".format(datetime.datetime.now(), log_counter, total_length))
def calc_cycles(log_file, out_file, dict_file):
with open(log_file, "w") as log:
startTime = time.time()
nodes, succ= load_dicts_from_csv(dict_file)
log.write('finished loading graph in {} s\n'.format(time.time() - startTime))
startTime = time.time()
dfs_iterative(nodes, succ, out_file, log)
log.write('finished processing graph in {} s\n'.format(time.time() - startTime))
def post_process_cycles(full_file, cycle_file, out_file):
yes_set = set()
no_set = set()
with open(cycle_file) as f:
for row in csv.reader(f):
if row[0] == 'no':
no_set.add(int(row[1]))
elif row[0] == 'yes':
yes_set.add(int(row[1]))
#remove all ids which occur in both sets
only_yes_set = yes_set.difference(no_set)
only_no_set = no_set.difference(yes_set)
with open(full_file) as full, open(out_file, "w", newline='') as out:
full_reader = csv.reader(full)
out_writer = csv.writer(out)
for row in full_reader:
if int(row[0]) in only_yes_set:
out_writer.writerow(row + ['yes'])
elif int(row[0]) in only_no_set:
out_writer.writerow(row + ['no'])
if __name__ == "__main__":
set_csv_field_size()
for i in [1]: # 20,10,5,3,2,1,0
calc_cycles('webisa_{}_cycles_log.txt'.format(i), 'webisa_{}_cycles_raw.csv'.format(i), 'webisa_{}.csv'.format(i))
post_process_cycles('webisa_{}.csv'.format(i), 'webisa_{}_cycles_raw.csv'.format(i), 'webisa_{}_cycles_results.csv'.format(i))