-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcal_distance.py
181 lines (164 loc) · 5.98 KB
/
cal_distance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#-*- encoding: utf-8 -*-
import os
import pdb
import sys
from scipy.spatial import distance
# Python 2 only: sys is reloaded so that sys.setdefaultencoding is available
# again (the attribute is removed during interpreter startup), then the
# default encoding used for implicit str/unicode conversions is set to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
# When the section is "claims", the script passes firstline=True so that only the first vector line is read.
def read_sentence_vectors(filename, firstline=False, dim=100):
    """Read sentence vectors from a text file and average them.

    Every non-blank line is split on single spaces and its last ``dim``
    tokens are parsed as floats; tokens before those are ignored. Lines
    containing "-nan" are skipped entirely.

    Args:
        filename: path of the UTF-8 encoded vector file.
        firstline: when True, stop after the first usable vector line.
        dim: positive number of trailing tokens taken as vector components
            (defaults to 100, the value that used to be hard-coded).

    Returns:
        A ``(vectors, av_vector)`` tuple: the list of parsed vectors (lists
        of floats, in file order) and their component-wise mean. Both are
        empty lists when the file holds no usable line.

    Raises:
        IOError: if the file cannot be opened.
        ValueError: if one of the selected tokens is not a valid float.
        IndexError: if a later line has fewer components than the first.
    """
    import io  # local import: io.open decodes UTF-8 on Python 2 and 3 alike

    vectors = []
    # The context manager closes the file even when float() raises.
    with io.open(filename, "r", encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line or "-nan" in line:
                continue
            vectors.append([float(token) for token in line.split(" ")[-dim:]])
            if firstline:
                break

    if not vectors:
        return vectors, []

    # Accumulate into a COPY of the first vector. Accumulating into the first
    # vector itself would overwrite vectors[0] with the average.
    av_vector = list(vectors[0])
    for vector in vectors[1:]:
        for idx in range(len(av_vector)):
            av_vector[idx] += vector[idx]
    num_vectors = len(vectors)
    av_vector = [total / num_vectors for total in av_vector]
    return vectors, av_vector
# sum of min distances
def cal_cosine_wmd(vecs1, vecs2):
    """Sum the nearest cosine distances from vecs1 to vecs2.

    For every vector of ``vecs1`` the smallest cosine distance to any vector
    of ``vecs2`` is taken, and those minima are added up.

    Args:
        vecs1: sequence of vectors whose nearest distances are summed.
        vecs2: sequence of candidate vectors.

    Returns:
        The sum of the per-vector minima. Each minimum starts at 100, so a
        vector of ``vecs1`` with no candidate contributes 100; the result is
        0 when ``vecs1`` is empty.
    """
    total = 0
    for query in vecs1:
        # 100 is placed first so it plays the role of the initial minimum.
        candidates = [100]
        candidates.extend(distance.cosine(query, other) for other in vecs2)
        total += min(candidates)
    return total
# min of min distances
def cal_cosine_mwmd(vecs1, vecs2):
    """Find the closest pair (by cosine distance) between vecs1 and vecs2.

    Args:
        vecs1: first sequence of vectors.
        vecs2: second sequence of vectors.

    Returns:
        A ``(distance, idx1, idx2)`` tuple where ``idx1`` and ``idx2`` are the
        1-based positions of the closest pair in ``vecs1`` and ``vecs2``. On
        ties the first pair in scan order (vecs1 outer, vecs2 inner) wins.
        ``(100, -1, -1)`` is returned when no pair has a distance below 100,
        e.g. when either input is empty.
    """
    best = (100, -1, -1)
    for pos1, left in enumerate(vecs1, 1):
        for pos2, right in enumerate(vecs2, 1):
            current = distance.cosine(left, right)
            # Strict comparison keeps the earliest pair on equal distances.
            if current < best[0]:
                best = (current, pos1, pos2)
    return best
# sum of min distances
def cal_euclidean_wmd(vecs1, vecs2):
    """Sum the nearest Euclidean distances from vecs1 to vecs2.

    For every vector of ``vecs1`` the smallest Euclidean distance to any
    vector of ``vecs2`` is taken, and those minima are added up.

    Args:
        vecs1: sequence of vectors whose nearest distances are summed.
        vecs2: sequence of candidate vectors.

    Returns:
        The sum of the per-vector minima; 0 when ``vecs1`` is empty and
        infinity when ``vecs1`` is non-empty but ``vecs2`` is empty.
    """
    # Euclidean distance has no upper bound, so the running minimum starts at
    # infinity; a finite starting value would cap large distances.
    unreachable = float("inf")
    total_distance = 0
    for vec1 in vecs1:
        candidates = [unreachable]
        candidates.extend(distance.euclidean(vec1, vec2) for vec2 in vecs2)
        total_distance += min(candidates)
    return total_distance
# min of min distances
def cal_euclidean_mwmd(vecs1, vecs2):
    """Return the smallest Euclidean distance between any pair of vectors.

    Args:
        vecs1: first sequence of vectors.
        vecs2: second sequence of vectors.

    Returns:
        The minimum of ``distance.euclidean(v1, v2)`` over all pairs with
        ``v1`` from ``vecs1`` and ``v2`` from ``vecs2``; infinity when there
        is no pair (either input empty).
    """
    # Start from infinity: Euclidean distance is unbounded, so any finite
    # starting value would cap the reported minimum.
    candidates = [float("inf")]
    for vec1 in vecs1:
        candidates.extend(distance.euclidean(vec1, vec2) for vec2 in vecs2)
    return min(candidates)
if __name__ == "__main__":
    # Two modes, selected by the number of command-line arguments:
    #   2 files -> print average-vector and sum-of-min distances between them
    #   1 file  -> rank every "*vec" file of the same <subdir>/<section>/
    #              directory by its min-of-min cosine distance to that file
    # print(...) is always called with a single string so the output is the
    # same under Python 2 and Python 3.
    if len(sys.argv) == 3:
        if not os.path.exists(sys.argv[1]) or not os.path.exists(sys.argv[2]):
            print("cannot find vector files")
            sys.exit()
        vectors1, vector1 = read_sentence_vectors(sys.argv[1])
        vectors2, vector2 = read_sentence_vectors(sys.argv[2])

        # Distances between the two averaged vectors.
        cos_distance_ave = distance.cosine(vector1, vector2)
        print("cosine distance of average " + str(cos_distance_ave))
        euc_distance_ave = distance.euclidean(vector1, vector2)
        print("euclidean distance of average " + str(euc_distance_ave))

        # Sum-of-min distances work on the full vector lists, not on the
        # averaged vectors.
        cos_distance_wmd = cal_cosine_wmd(vectors1, vectors2)
        print("cosine distance of wmd for " + str(len(vectors1)) + " vectors is " + str(cos_distance_wmd))
        euc_distance_wmd = cal_euclidean_wmd(vectors1, vectors2)
        print("euclidean distance of wmd for " + str(len(vectors1)) + " vectors is " + str(euc_distance_wmd))
    elif len(sys.argv) == 2:
        query_path = sys.argv[1]
        if not os.path.exists(query_path):
            print("cannot find vector file")
            sys.exit()

        # The path is expected to look like <subdir>/<section>/.../<id>_<rest>.
        tokens = query_path.split("/")
        if len(tokens) < 2:
            print("wrong patent vector path: " + query_path)
            sys.exit()
        subdir = tokens[0]
        section = tokens[1]
        patent_id = tokens[-1].split("_")[0]
        vecs_dir = subdir + "/" + section + "/"

        files = os.listdir(vecs_dir)
        if not files:
            print("wrong patent vector directory: " + vecs_dir)
            sys.exit()

        if section == "claims":
            # only the first line is used for the claims section
            vectors1, vector1 = read_sentence_vectors(query_path, True)
        else:
            # every line is used for the other sections
            vectors1, vector1 = read_sentence_vectors(query_path)

        # Maps candidate file name -> [min-of-min cosine distance, idx1, idx2].
        cos_wmd_rank = {}
        count = 0
        for candidate in files:
            if candidate.endswith("vec"):
                print(vecs_dir + candidate)
                vectors2, vector2 = read_sentence_vectors(vecs_dir + candidate)
                cos_distance_wmd, idx1, idx2 = cal_cosine_mwmd(vectors1, vectors2)
                cos_wmd_rank[candidate] = [cos_distance_wmd, idx1, idx2]
                count += 1
                # Safety cap on the number of candidate files processed.
                if count > 100000:
                    break

        # One line per candidate, ordered by its [distance, idx1, idx2] entry.
        out_name = patent_id + "_" + section + "_cos_wmd_rank.txt"
        with open(out_name, "w") as fp:
            for name in sorted(cos_wmd_rank, key=cos_wmd_rank.get):
                fp.write(name + " " + str(cos_wmd_rank[name]) + "\n")
    else:
        print("run with 2 arguments for a distance calculation (2 sentence vector files)")
        print("or run with 1 argument to find nearest neighbor (1 sentence vector file)")
        sys.exit()