-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimilar-to.py
53 lines (42 loc) · 1.3 KB
/
similar-to.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from docopt import docopt
from collections import OrderedDict
from math import sqrt
def get_doc_topics(f):
    """Parse a document-topics file into ``(name, topics)`` pairs.

    Every non-blank line is split on whitespace. The first field is ignored
    (presumably a document id -- confirm against the file producer). The
    second field is treated as a path: its last ``/``-separated component,
    with every ``.html.txt`` occurrence removed, becomes the name. All
    remaining fields are converted to ``float``.

    Args:
        f: Path of the file to read.

    Returns:
        A list of ``(name, topics)`` tuples in file order, where ``topics``
        is a list of floats.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If a topic field cannot be converted to ``float``.
    """
    results = []
    with open(f, "r") as docs:
        for line in docs:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline at
                # the end of the file): there is nothing to parse.
                continue
            name = fields[1].rsplit("/", 1)[-1].replace(".html.txt", "")
            topics = [float(value) for value in fields[2:]]
            results.append((name, topics))
    return results
def get_band_vector(band, doc_topics):
    """Look up the topic vector stored for ``band``.

    Args:
        band: Name to search for.
        doc_topics: Iterable of ``(name, vector)`` pairs.

    Returns:
        The vector of the first pair whose name equals ``band``.

    Raises:
        Exception: If no pair carries that name.
    """
    # A private sentinel distinguishes "no match" from any stored vector.
    not_found = object()
    vector = next(
        (vec for name, vec in doc_topics if band == name),
        not_found,
    )
    if vector is not_found:
        raise Exception("Band not in corpus")
    return vector
def similarity(v1, v2):
    """Return the cosine similarity of two numeric vectors.

    The dot product is taken over the positions both vectors share (``zip``
    stops at the shorter one) and is divided by each vector's Euclidean norm
    in turn.

    Args:
        v1: First sequence of numbers.
        v2: Second sequence of numbers.

    Returns:
        The cosine of the angle between the vectors, or ``0.0`` when either
        vector has zero norm (all zeros or empty), since the angle is
        undefined in that case.
    """
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = sqrt(sum(a ** 2 for a in v1))
    norm2 = sqrt(sum(b ** 2 for b in v2))
    if not norm1 or not norm2:
        # Avoid ZeroDivisionError for degenerate vectors.
        return 0.0
    return dot / norm1 / norm2
def main():
    """Print the bands whose topic vectors are most similar to a given band.

    Command-line arguments are parsed with docopt: ``<doc-topics>`` is the
    file handed to ``get_doc_topics``, ``<band>`` is the reference band, and
    ``--bands`` caps how many results are printed. Every other band is scored
    against the reference with ``similarity`` and the results are printed as
    ``name score`` lines, highest score first.

    Raises:
        Exception: Propagated from ``get_band_vector`` when ``<band>`` is not
            present in the parsed file.
    """
    # docopt needs the usage pattern indented under "Usage:", a blank line
    # before "Options:", and at least two spaces between an option and its
    # description; otherwise the "[default: 10]" is not picked up.
    args = docopt("""
    Usage:
        similar-to.py [options] <doc-topics> <band>

    Options:
        --bands NUM  Number of close bands to show [default: 10]
    """)
    doc_topics = get_doc_topics(args["<doc-topics>"])
    band = args["<band>"]
    limit = int(args["--bands"])
    vector = get_band_vector(band, doc_topics)

    # Score every entry except those carrying the reference band's own name.
    band_sim = [
        (name, similarity(vector, topics))
        for name, topics in doc_topics
        if name != band
    ]
    band_sim.sort(key=lambda pair: pair[1], reverse=True)
    for name, score in band_sim[:limit]:
        print(name, score)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()