# Commented-out notebook snippet: load per-word IDF values from a whitespace-
# delimited file (`file`, with `num_lines` rows, both defined elsewhere) into
# dict_idf. The inner tqdm wrapper was redundant with the manual pbar updates
# and has been dropped; the bare except is narrowed to the parsing errors.
# from itertools import islice
# from re import sub
# from tqdm.notebook import tqdm
#
# dict_idf = {}
# with tqdm(total=num_lines) as pbar:
#     for i, line in islice(enumerate(file), 1, None):  # skip the header row
#         try:
#             cells = line.split()
#             idf = float(sub(r"[^0-9.]", "", cells[3]))
#             dict_idf[cells[0]] = idf
#         except (IndexError, ValueError):
#             print("Error on: " + line)
#         finally:
#             pbar.update(1)
# Commented-out notebook snippet: term counts from CountVectorizer, log-scaled
# into TF, then reweighted column-by-column with the external IDF values. The
# stray pbar.update(1) inside the loop referenced an already-closed progress
# bar and has been removed.
# from sklearn.feature_extraction.text import CountVectorizer
# from numpy import array, log
#
# vectorizer = CountVectorizer()
# tf = vectorizer.fit_transform([x.lower() for x in array_text])
# tf = log(tf.toarray() + 1)
# tfidf = tf.copy()
# words = array(vectorizer.get_feature_names_out())  # get_feature_names() on sklearn < 1.0
# for k in tqdm(dict_idf.keys()):
#     if k in words:
#         tfidf[:, words == k] *= dict_idf[k]
# for j in range(tfidf.shape[0]):
#     # top five keywords per article by TF-IDF weight
#     print("Keywords of article", str(j + 1), words[tfidf[j, :].argsort()[-5:][::-1]])
class TfIdf:
    """Small in-memory TF-IDF index: add documents, then score queries against them."""

    def __init__(self):
        self.weighted = False
        self.documents = []
        self.corpus_dict = {}

    def add_document(self, doc_name, list_of_words):
        # build the document's term-count dictionary, and track corpus-wide
        # counts used as the IDF-style denominator in similarities()
        doc_dict = {}
        for w in list_of_words:
            doc_dict[w] = doc_dict.get(w, 0.0) + 1.0
            self.corpus_dict[w] = self.corpus_dict.get(w, 0.0) + 1.0

        # normalize counts to frequencies by document length
        length = float(len(list_of_words))
        for k in doc_dict:
            doc_dict[k] = doc_dict[k] / length

        # add the normalized document to the corpus
        self.documents.append([doc_name, doc_dict])
    def similarities(self, list_of_words):
        """
        Returns a list of all the [doc_name, similarity_score] pairs relative
        to a list of query words.
        """
        # building the query dictionary
        query_dict = {}
        for w in list_of_words:
            query_dict[w] = query_dict.get(w, 0.0) + 1.0

        # normalizing the query
        length = float(len(list_of_words))
        for k in query_dict:
            query_dict[k] = query_dict[k] / length

        # computing the list of similarities
        sims = []
        for doc in self.documents:
            score = 0.0
            doc_dict = doc[1]
            for k in query_dict:
                if k in doc_dict:
                    score += (query_dict[k] / self.corpus_dict[k]) + (
                        doc_dict[k] / self.corpus_dict[k])
            sims.append([doc[0], score])
        return sims
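
# Illustrative usage sketch (not part of the original file): the corpus and
# query below are made-up examples showing the add-then-score workflow.
if __name__ == "__main__":
    index = TfIdf()
    index.add_document("doc1", ["the", "cat", "sat", "on", "the", "mat"])
    index.add_document("doc2", ["the", "dog", "chased", "the", "cat"])

    # scores are unnormalized; higher means a closer match to the query
    for doc_name, score in index.similarities(["cat", "mat"]):
        print(doc_name, round(score, 3))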