dolphy.py
# Dolphy - high-performance, pure-Python text indexer
# Tom Dyson, April 2006
# TODO: Phrase searching
# TODO: Boolean queries
# TODO: Paging support
# TODO: Tests
import re
from nltk_lite import tokenize
import time
import dbhash
import marshal
from sets import Set
STOPWORDS = '/Users/tomdyson/Documents/code/python/dolphy/data/stopwords.txt'

class Index:
    """
    A persisted hash of terms and documents
    T_foo = {doc_1: ([position_1, position_2], doc_1_length),
             doc_2: ([position_1], doc_2_length)}
    D_doc_1 = {uri: URI, title: Title, author: Author,
               modified: Modification date, str_modified: Readable date,
               raw_content: Raw content}
    """

    def __init__(self, filename):
        self.filename = filename
        self.db = dbhash.open(self.filename, flag='c')
        self.max_id = int(self.db.get('M_max_id', 0))
        self.sort_by = self.sortByPositionAndFrequency

    def add(self, document, defer_recalculate=True):
        # Retrieve (if known URI) or assign document ID
        known_ID = self.db.get('U_' + document.uri)
        if known_ID:
            document.id = int(known_ID)
        else:
            self.max_id += 1
            self.db['M_max_id'] = str(self.max_id)
            document.id = self.max_id
            self.db["U_" + document.uri] = str(document.id)
        # Add an entry for each document's metadata
        doc_details = {}
        doc_details['uri'] = document.uri
        doc_details['title'] = document.title
        doc_details['author'] = document.author
        doc_details['modified'] = document.modified
        modified = time.localtime(document.modified)
        doc_details['str_modified'] = time.strftime('%d %B %Y', modified)
        try:
            # TODO: better name for raw_content
            # TODO: check attribute exists, don't just try to read it
            doc_details['raw_content'] = document.raw_data
        except AttributeError:
            pass
        self.db["D_%s" % document.id] = marshal.dumps(doc_details)
        # Add/update the entry for each term in the document
        for term in document.tokens:
            if self.db.has_key('T_' + term):
                term_data = marshal.loads(self.db['T_' + term])
                term_data[document.id] = (document.tokens[term], document.length)
            else:
                term_data = {}
                term_data[document.id] = (document.tokens[term], document.length)
            self.db['T_' + term] = marshal.dumps(term_data)
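    # Illustrative sketch of the stored entries (values are hypothetical):
    #   db['U_file:///docs/whales.txt'] = '3'
    #   db['D_3'] = marshal.dumps({'uri': ..., 'title': ..., 'author': ...,
    #                              'modified': ..., 'str_modified': ..., 'raw_content': ...})
    #   db['T_whale'] = marshal.dumps({3: ([1, 17], 120)})
    # i.e. document 3 contains 'whale' at positions 1 and 17 and is 120 tokens long.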

    def remove(self, document_URI, defer_recalculate=True):
        pass

    def update(self, document, defer_recalculate=True):
        pass

    def recalculate(self):
        pass

    def sortByPositionAndFrequency(self, documents):
        """Rank documents by term frequency and position"""
        results = []
        for doc in documents:
            score = 0
            positions, length = documents[doc]
            length = float(length)
            # helen's ranking: position 1 (of x) has multiplier 2,
            # position x has multiplier 1
            for position in positions:
                score += (2 * (length - 1)) / (position + length - 2)
            results.append((score, doc))
        results.sort()
        results.reverse()
        return results
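    # Worked example (hypothetical numbers): for a document whose stored length
    # is 10, a term at position 1 scores (2 * 9) / (1 + 10 - 2) = 2.0, while one
    # at position 10 scores (2 * 9) / (10 + 10 - 2) = 1.0, so early occurrences
    # count roughly twice as much as late ones.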

    def sortByDate(self, documents):
        """Rank documents by modification date, newest first"""
        results = []
        for doc in documents:
            doc_details = marshal.loads(self.db['D_' + str(doc)])
            results.append((doc_details['modified'], doc))
        results.sort()
        results.reverse()
        return results

    def mergeMatches(self, doc_groups, merge_type="intersection"):
        """ Combine sets of matching documents, e.g. for each term
        in a multi-term query. Supports intersections (for AND
        queries) and unions (for ORs). """
        # TODO: less frequent terms (across all documents) should
        # be weighted - perhaps this is where to return the weighting
        # (or just the frequency)
        combined_set = {}
        doc_groups_copy = list(doc_groups)
        merged = Set(doc_groups_copy.pop().keys())
        if merge_type == "intersection":
            for doc_group in doc_groups_copy:
                merged = merged.intersection(Set(doc_group.keys()))
        elif merge_type == "union":
            for doc_group in doc_groups_copy:
                merged = merged.union(Set(doc_group.keys()))
        for doc in merged:
            positions = []
            doc_length = 0
            for doc_group in doc_groups:
                if doc in doc_group:
                    positions.extend(doc_group[doc][0])
                    doc_length = doc_group[doc][1]  # should only have to get this once
            combined_set[doc] = (positions, doc_length)
        # print combined_set  # debug output
        return combined_set
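    # Illustrative sketch (hypothetical values): merging the groups for two
    # query terms with merge_type="intersection",
    #   [{1: ([2], 50), 2: ([7], 80)}, {2: ([3, 9], 80)}]
    # keeps only document 2 and pools its positions: {2: ([7, 3, 9], 80)}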

    def search(self, query, summarise='simple', page_start=1, page_size=10, operator="AND"):
        """Retrieve and sort documents containing the specified term(s)"""
        # TODO: look terms up by their stemmed form, to match the stemmed index
        query_terms = query.lower().strip().split(' ')
        t = Text()
        porter = tokenize.PorterStemmer()
        documents = {}
        if len(query_terms) > 1:
            matching_document_groups = []
            for query_term in query_terms:
                stemmed_query = porter.stem(query_term)
                matching_documents = self.db.get('T_' + query_term)
                if matching_documents:
                    matching_document_groups.append(marshal.loads(matching_documents))
            # copy the list of matching document groups for sets
            if operator == "AND": join_type = "intersection"
            elif operator == "OR": join_type = "union"
            if matching_document_groups:
                documents = self.mergeMatches(matching_document_groups, join_type)
        else:
            query_term = query_terms[0]
            stemmed_query = porter.stem(query_term)
            matching_documents = self.db.get('T_' + query_term)
            if matching_documents:
                documents = marshal.loads(matching_documents)
        ret = {}
        ret['query'] = query
        if documents:
            results = self.sort_by(documents)
            ret['count'] = len(results)
            ranked_documents = []
            page_from = page_start - 1
            page_to = page_from + page_size
            for result in results[page_from:page_to]:
                doc = marshal.loads(self.db['D_%s' % result[1]])
                doc['score'] = result[0]
                if doc.get('raw_content'):
                    # TODO: dispatch summarisers better
                    if summarise == 'highlight':
                        doc['summary'] = t.summarise(doc['raw_content'], query)
                    else:
                        doc['summary'] = t.simpleSummarise(doc['raw_content'])
                else:
                    doc['summary'] = ''
                ranked_documents.append(doc)
            ret['hits'] = ranked_documents
        return ret
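    # Illustrative shape of the return value (field values are hypothetical):
    #   {'query': 'whale', 'count': 2,
    #    'hits': [{'uri': ..., 'title': ..., 'score': 1.8, 'summary': ...}, ...]}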

    def close(self):
        self.db.close()

class Document:
    """
    Dolphy documents have six properties:
    URI - typically file location or URL
    Title - typically extracted from source metadata
    Author
    Modification date - seconds since the epoch
    Tokens - a dict of term:positions ({'foo':[3,5],})
    Raw content for summaries
    """

    def __init__(self, uri, title="", author="", modified=0):
        self.uri = uri
        self.title = title
        self.author = author
        # modified should be seconds since the epoch, defaults to now
        if int(modified) == modified:
            if modified == 0:
                modified = int(time.time())
            self.modified = modified
        else:
            raise ValueError("'modified' should be seconds since the epoch")

class Text:

    def __init__(self):
        # TODO: consider localisation
        self.stopwords = [line.strip() for line in open(STOPWORDS)]
        self.whitespace_splitter = re.compile(r'\w+', re.U)
        self.space_or_break = re.compile(r'\s|\n')
        self.strip_comments = re.compile(r'<!--.*?-->')
        self.strip_tags = re.compile(r'<.*?>')
        self.encased_word = re.compile(r'^\W*(\w+)\W*s?\W?$')
        # TODO: I don't like defining the caching stemmer here
        porter = tokenize.PorterStemmer()
        self.cachingStemmer = Memoize(porter.stem)

    def tokenizeAndStem(self, string):
        """Yield a stream of downcased, stopword-filtered, stemmed tokens."""
        # crude HTML comment and tag removal
        string = self.strip_comments.sub('', string)
        string = self.strip_tags.sub('', string)
        tokenstream = tokenize.regexp(string, self.whitespace_splitter)
        for token in tokenstream:
            token = token.lower()
            # ignore words with fewer than three letters,
            # stem words with more than three letters
            if len(token) > 2 and token not in self.stopwords:
                if len(token) == 3:
                    yield token
                else:
                    stemmed_token = self.cachingStemmer(token)
                    yield stemmed_token
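    # Example (hypothetical input): list(t.tokenizeAndStem('Whales are <b>singing</b>'))
    # should yield something like ['whale', 'sing'], assuming 'are' is in the
    # stopword list.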

    def calculateTokens(self, tokens):
        """Format a tokenised stream into a dict of term:positions,
        plus the next free position (e.g. {'foo': [3, 5]}, 10)"""
        dtokens = {}
        position = 1
        for token in tokens:
            if dtokens.get(token):
                dtokens[token].append(position)
            else:
                dtokens[token] = [position]
            position += 1
        return dtokens, position
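    # Example (hypothetical input): calculateTokens(['whale', 'song', 'whale'])
    # returns ({'whale': [1, 3], 'song': [2]}, 4) - the second value is the
    # position after the final token, not the token count.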

    def summarise(self, text, term, margin=5, max_phrases=3):
        """Naive contextual highlighting"""
        tokens = self.space_or_break.split(text)
        position = 0
        phrases = []
        for token in tokens:
            clean_token = token.lower()
            if term in clean_token:
                # v2. try to speed up by using a compiled expression
                matching_word = self.encased_word.search(clean_token)
                if matching_word and matching_word.group(1) == term:
                    # v1. if re.search('^\W*(' + term + ')\W*s?\W?$', clean_token):
                    before = ' '.join(tokens[max(0, position - margin):position])
                    after = ' '.join(tokens[position + 1:position + margin])
                    before = before.split('.')[-1].strip()
                    after = after.split('.')[0].strip()
                    phrase = '%s <strong>%s</strong> %s' % (before, token, after)
                    phrases.append(phrase)
                    if len(phrases) == max_phrases:
                        break
            position += 1
        return ' ... '.join(phrases)
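    # Example (hypothetical input):
    #   summarise('In the cold northern seas the blue whale sings to others', 'whale')
    # should return something like
    #   'cold northern seas the blue <strong>whale</strong> sings to others'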

    def simpleSummarise(self, text, minimum_characters=80):
        """Return at least minimum_characters, extended to the end of the current word"""
        cut = text.find(' ', minimum_characters)
        if cut == -1:
            cut = len(text)
        summary = text[0:cut]
        if not summary.endswith('.'):
            summary = summary + '...'
        return summary.strip()

class Memoize:
    # TODO: include support for pre-loading cached values, like Cache().load()

    def __init__(self, fn):
        self.fn = fn
        self.memo = {}

    def __call__(self, *args):
        if args not in self.memo:
            self.memo[args] = self.fn(*args)
        return self.memo[args]
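
# Minimal usage sketch (not part of the original module). The path, sample text
# and metadata below are made up; the caller is assumed to attach .tokens,
# .length and .raw_data to the Document before indexing, since Index.add expects
# them, and the STOPWORDS file is assumed to exist at the configured path.
if __name__ == '__main__':
    text_tools = Text()
    doc = Document('file:///tmp/whales.txt', title='Whales', author='anon',
                   modified=int(time.time()))
    doc.raw_data = 'The blue whale is the largest animal known to have lived.'
    doc.tokens, doc.length = text_tools.calculateTokens(
        text_tools.tokenizeAndStem(doc.raw_data))
    index = Index('/tmp/dolphy_example.db')
    index.add(doc)
    results = index.search('whale')
    print results['count'], 'hit(s) for', results['query']
    index.close()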