-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearch.py
121 lines (106 loc) · 4.92 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from tools import *
import math
import index
from index import *
import operator
class Search:
    """Query an inverted index in boolean or weighted mode.

    The inverted index is accessed as
    ``indexInverse[word]['poids'][article][measure]`` where ``measure`` is
    ``'tf'``, ``'tf-idf'`` or ``'proba'``.

    Typical use: ``setType`` + ``setQuery`` (+ ``setLimit``), then
    ``executeSearch``.  The code only uses constructs valid on both
    Python 2 and Python 3.
    """

    possibleTypes = ['bool', 'tf', 'tf-idf', 'proba']

    # Default used when setLimit() was never called: slicing with None keeps
    # every result instead of failing on a missing attribute.
    resultsLimit = None

    def setType(self, type):
        """Select the search mode.

        Raises ValueError when ``type`` is not listed in ``possibleTypes``.
        """
        if type not in self.possibleTypes:
            raise ValueError('unknown search type')
        self.type = type

    def setQuery(self, query):
        """Store the raw query; always returns 0."""
        self.query = query
        return 0

    def setLimit(self, limit):
        """Set the maximum number of results returned by weighted searches."""
        self.resultsLimit = limit

    def getPostingFromIndexInverse(self, index, word):
        """Return the list of articles indexed under ``word``.

        A word that is absent from the index has an empty posting list.  The
        membership test also avoids inserting the word into auto-creating
        mappings such as defaultdict.
        """
        if word not in index:
            return []
        return list(index[word]['poids'].keys())

    def andPosting(self, a, b):
        """Return the articles present in both posting lists (unordered)."""
        return list(set(a) & set(b))

    def orPosting(self, a, b):
        """Return the articles present in either posting list (unordered)."""
        return list(set(a) | set(b))

    def notPosting(self, a, b):
        """Return the articles of ``a`` that are not in ``b`` (unordered)."""
        return list(set(a) - set(b))

    def booleanSearch(self, indexInverse):
        """Tokenise the stored query and evaluate it as a boolean expression.

        Returns the list of matching articles.
        """
        return self._evaluateBoolean(tokenisation(self.query), indexInverse)

    def _evaluateBoolean(self, tokens, indexInverse):
        """Evaluate ``term (AND|OR|NOT term)*`` strictly left to right.

        ``tokens`` is expected to be a list of strings.  The first token
        seeds the result; each AND / OR / NOT token then combines the running
        result with the posting list of the token that follows it.  Any other
        token in operator position is ignored, as is a trailing operator with
        no operand.  An empty token list yields an empty result.
        """
        if not tokens:
            return []
        operators = {
            'AND': self.andPosting,
            'OR': self.orPosting,
            'NOT': self.notPosting,
        }
        posts = self.getPostingFromIndexInverse(indexInverse, tokens[0])
        # Pairing each token with its successor guarantees an operand exists.
        for token, operand in zip(tokens, tokens[1:]):
            combine = operators.get(token)
            if combine is not None:
                posts = combine(
                    posts,
                    self.getPostingFromIndexInverse(indexInverse, operand))
        return posts

    def otherSearch(self, indexInverse):
        """Run a weighted search ('tf', 'tf-idf' or 'proba') for the query.

        The query is indexed with ``index.Index`` using the CACM corpus and
        common-words paths below, then every article sharing a term with the
        query is scored.  Returns ``(article, score)`` pairs sorted by
        decreasing score and truncated to ``resultsLimit``.
        """
        # Raw strings keep the exact same characters as the historical
        # literals while avoiding the invalid '\c' escape warning.
        source = r'CACM\cacm.all'
        commonwords = r'CACM\common_words'
        indexedQueryObject = index.Index(source, commonwords)
        indexedQuery = indexedQueryObject.indexText(self.query)
        return self._rankArticles(indexedQuery, indexInverse)

    def _rankArticles(self, indexedQuery, indexInverse):
        """Accumulate one score per article over all query terms.

        ``indexedQuery`` maps each query term to a dict holding a 'weight'.
        For 'tf' and 'tf-idf' a term contributes ``weight * measure``; for
        'proba' it contributes ``log10((1 - p) / p)`` and entries with
        ``p == 0`` are skipped.  Raises ValueError for any other type.
        """
        if self.type not in ('tf', 'tf-idf', 'proba'):
            raise ValueError(
                'unsupported search type for weighted search: %r'
                % (self.type,))
        scores = {}
        for mot in indexedQuery:
            if mot not in indexInverse:
                continue
            for article, poids in indexInverse[mot]['poids'].items():
                if self.type == 'proba':
                    if poids['proba'] == 0:
                        continue
                    contribution = self._probaScore(poids['proba'])
                else:
                    contribution = indexedQuery[mot]['weight'] * poids[self.type]
                scores[article] = scores.get(article, 0) + contribution
        ranked = sorted(
            scores.items(), key=operator.itemgetter(1), reverse=True)
        return ranked[:self.resultsLimit]

    def _probaScore(self, proba):
        """Return ``log10((1 - proba) / proba)`` with ``proba`` kept below 1.

        The clamp is applied on every call and on a local copy, so a
        probability of 1 never reaches ``log10(0)`` and the index itself is
        left unmodified.
        """
        if proba >= 1:
            proba = 0.999999999
        return math.log10((1 - proba) / proba)

    def executeSearch(self, indexInverse):
        """Dispatch to the weighted or boolean search for the current type."""
        if self.type in ['tf-idf', 'tf', 'proba']:
            return self.otherSearch(indexInverse)
        return self.booleanSearch(indexInverse)

    def presentResults(self, results):
        """Print the 30 best results and return all of them, best first.

        ``results`` may be a mapping ``article -> score`` or an iterable of
        ``(article, score)`` pairs such as the output of ``otherSearch``.
        """
        pairs = results.items() if hasattr(results, 'items') else results
        sorted_results = sorted(
            pairs, key=operator.itemgetter(1), reverse=True)
        for rank, result in enumerate(sorted_results[:30], 1):
            # %s formatting accepts non-string article identifiers; the
            # single-argument call form prints the same on Python 2 and 3.
            print("%s : %s with a score of %s" % (rank, result[0], result[1]))
        return sorted_results

    def calculCosinus(self, vectorB, vectorA=None):
        """Return the cosine similarity of two sparse vectors.

        Both vectors are mappings ``term -> numeric weight``.  The result is
        ``dot(A, B) / (|A| * |B|)``, or 0 when either vector has zero norm.

        ``vectorA`` was historically read from an undefined name; it is now
        an explicit argument placed after ``vectorB`` so existing positional
        calls keep binding ``vectorB`` first.  Raises ValueError when it is
        omitted.
        """
        if vectorA is None:
            raise ValueError(
                'calculCosinus needs two vectors: pass vectorA explicitly')
        produitcroise = sum(
            poids * vectorB[mot]
            for mot, poids in vectorA.items() if mot in vectorB)
        sommeCarreA = sum(poids * poids for poids in vectorA.values())
        sommeCarreB = sum(poids * poids for poids in vectorB.values())
        if sommeCarreA == 0 or sommeCarreB == 0:
            return 0
        return produitcroise / (
            math.sqrt(sommeCarreA) * math.sqrt(sommeCarreB))