-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfdf.py
87 lines (74 loc) · 2.88 KB
/
tfdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import math
import time
from collections import Counter

from nltk.corpus import reuters, inaugural, gutenberg
class Search:
    """TF-IDF ranked search over an NLTK-style corpus.

    On construction the whole corpus is scanned and per-document term
    frequencies, document frequencies, TF-IDF weights and document vector
    lengths are precomputed.  All of those tables are keyed by lowercased
    words, so scoring is case-insensitive.

    The corpus object only needs two methods: ``fileids()`` returning the
    document identifiers and ``words(fileid)`` returning an iterable of the
    word strings of one document.

    Attributes:
        corpus: Corpus being searched (class default: ``gutenberg``).
        df_dict: ``{word: number of documents containing the word}``.
        tf_dict: ``{doc_id: {word: occurrences of the word in the doc}}``.
        tf_idf_dict: ``{doc_id: {word: tf * idf}}``.
        doc_size_dict: ``{doc_id: Euclidean length of the doc's tf vector}``.
    """

    # Corpus used when the constructor is not given one explicitly.
    corpus = gutenberg

    def __init__(self, corpus=None):
        """Build every lookup table for the corpus.

        Args:
            corpus: Optional corpus to search instead of the class-level
                default.  ``None`` keeps the default.
        """
        if corpus is not None:
            self.corpus = corpus
        # The inverted index is not needed for scoring, so it is not built
        # eagerly; call inverse_index() on demand.
        # Order matters: tf_idf() reads df_dict and tf_dict, and doc_size()
        # reads tf_dict.
        self.df_dict = self.df()
        self.tf_dict = self.tf()
        self.tf_idf_dict = self.tf_idf()
        self.doc_size_dict = self.doc_size()

    def inverse_index(self):
        """Return ``{word: set of doc ids containing the word}``.

        Unlike the other tables, the keys here keep the word's original
        casing.
        """
        index = {}
        for doc_id in self.corpus.fileids():
            for word in self.corpus.words(doc_id):
                index.setdefault(word, set()).add(doc_id)
        return index

    def tf(self):
        """Return ``{doc_id: {lowercased word: count within that doc}}``."""
        return {
            doc_id: dict(
                Counter(word.lower() for word in self.corpus.words(doc_id))
            )
            for doc_id in self.corpus.fileids()
        }

    def df(self):
        """Return ``{lowercased word: number of docs containing it}``.

        Prints ``df finished`` once the scan is complete.
        """
        doc_counts = Counter()
        for doc_id in self.corpus.fileids():
            # A set, so each document contributes at most 1 per word.
            doc_counts.update(
                {word.lower() for word in self.corpus.words(doc_id)}
            )
        print('df finished')
        return dict(doc_counts)

    def idf(self, term):
        """Return the inverse document frequency ``log(N / df(term))``.

        Args:
            term: Word to look up; it is lowercased before the lookup.

        Raises:
            KeyError: If the term occurs in no document of the corpus.
        """
        num_docs = len(self.corpus.fileids())
        return math.log(num_docs / self.df_dict[term.lower()])

    def tf_idf(self):
        """Return ``{doc_id: {word: tf * idf}}`` built from the tf table.

        Prints ``tf-idf finished`` when done.
        """
        # A term's idf does not depend on the document, so compute it once
        # per term instead of once per (document, term) pair.
        idf_cache = {}
        tf_idf_dict = {}
        for doc_id, counts in self.tf_dict.items():
            weights = {}
            for term, freq in counts.items():
                if term not in idf_cache:
                    idf_cache[term] = self.idf(term)
                weights[term] = freq * idf_cache[term]
            tf_idf_dict[doc_id] = weights
        print('tf-idf finished')
        return tf_idf_dict

    def doc_size(self):
        """Return ``{doc_id: Euclidean norm of the doc's tf vector}``.

        The norm is the square root of the sum of squared term frequencies.
        """
        return {
            doc_id: math.sqrt(
                sum(freq ** 2 for freq in self.tf_dict[doc_id].values())
            )
            for doc_id in self.corpus.fileids()
        }

    @staticmethod
    def _query_terms(query):
        """Return the query as a list of lowercased terms.

        A plain string is split on whitespace so it is not scored character
        by character; any other iterable is materialized so it can be
        traversed once per document.
        """
        if isinstance(query, str):
            query = query.split()
        return [term.lower() for term in query]

    def doc_score(self, query, doc_id):
        """Score one document against a query.

        Each query term found in the document adds
        ``(tf / document length) * idf`` to the score; terms absent from the
        document add nothing.  A repeated query term is counted each time it
        appears in the query.

        Args:
            query: Iterable of search terms, or a whitespace-separated
                string of terms.
            doc_id: Identifier of the document to score.

        Returns:
            The accumulated score; ``0`` when no query term is in the
            document.
        """
        counts = self.tf_dict[doc_id]
        length = self.doc_size_dict[doc_id]
        score = 0
        for term in self._query_terms(query):
            if term in counts:
                score += (counts[term] / length) * self.idf(term)
        return score

    def query_result(self, query):
        """Rank every document in the corpus against a query.

        Args:
            query: Iterable of search terms, or a whitespace-separated
                string of terms.

        Returns:
            A list of ``(doc_id, score)`` tuples sorted by descending score.
        """
        # Normalize once up front so a one-shot iterable is not exhausted
        # after the first document.
        terms = self._query_terms(query)
        scored = [
            (doc_id, self.doc_score(terms, doc_id))
            for doc_id in self.corpus.fileids()
        ]
        return sorted(scored, key=lambda pair: pair[1], reverse=True)