#!/usr/bin/python3
import re
import nltk
import sys
import getopt
import os
import pickle
import linecache
from collections import defaultdict
from math import log, sqrt

# constants
STEMMER = nltk.stem.porter.PorterStemmer()

def usage():
    print("usage: " + sys.argv[0] + " -i directory-of-documents -d dictionary-file -p postings-file")

def build_index(in_dir, out_dict, out_postings):
    """This function builds the dictionary and postings files from the directory.

    Arguments:
        in_dir {string} -- path to the directory containing the text for indexing
        out_dict {string} -- path to where the dictionary file will be written to
        out_postings {string} -- path to where the postings file will be written to
    """
    print('indexing...')
    # initialize the dictionary & postings containers
    dictionary = myDict(out_dict)
    postings = myPostings()
    # lengths will be a dictionary mapping document IDs to their vector lengths
    lengths = dict()
    # collect the documents and process them in ascending numeric order
    docList = [int(txtFile) for txtFile in os.listdir(in_dir)]
    docList.sort()
    # parse each document in the list
    for docName in docList:
        parse_document(docName, dictionary, postings, in_dir, lengths)
    postings.savePostingsToFile(lengths, out_postings, dictionary)
    # the IDF for each token can be calculated once during indexing
    calculateIDF(dictionary.getDictionary(), len(docList))
    dictionary.save()

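# Note: each document vector uses sublinear tf with no idf and is length-normalized
# via the stored lengths, while idf sits in the dictionary for query-time weighting;
# this appears to follow the document side of the standard lnc.ltc scheme.
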
# parses a document into the dictionary and postings lists
def parse_document(docId, dictionary, postings, in_dir, lengths):
    """This function parses a single document, adding the resulting vector to the dictionary and postings lists.

    Arguments:
        docId {int} -- the ID of the document to be parsed (assuming all document names are integers)
        dictionary {myDict} -- the myDict object containing the main dictionary
        postings {myPostings} -- the myPostings object containing the postings
        in_dir {string} -- path to the directory containing the text for indexing
        lengths {dict} -- the dictionary mapping document IDs to their lengths
    """
    fileName = str(docId)
    docVector = defaultdict(int)
    lineNum = 1
    line = linecache.getline(os.path.join(in_dir, fileName), 1)
    while line != '':
        # parse the line for tokens and add them to the document's vector
        parseLine(line, docVector)
        lineNum += 1
        line = linecache.getline(os.path.join(in_dir, fileName), lineNum)
    # replace each raw term frequency with its weighted (sublinear) tf, before normalization
    calculateTermFrequency(docVector)
    # obtain the length of the vector and add it to the lengths dictionary
    lengths[docId] = calculateVectorLength(docVector)
    # record the data in the dictionary and postings
    recordData(docId, dictionary, postings, docVector)

def parseLine(line, docVector):
    """This function parses a line into tokens and adds their frequencies to the document vector.

    Arguments:
        line {string} -- the line to be tokenized
        docVector {defaultdict} -- the document vector represented as a dictionary
    """
    sentences = nltk.sent_tokenize(line)
    for sentence in sentences:
        # use the nltk word tokenizer
        words = nltk.word_tokenize(sentence)
        # lowercase and stem each word
        for word in words:
            word = word.lower()
            word = STEMMER.stem(word)
            # add it to the document vector (defaultdict handles new entries automatically)
            docVector[word] += 1

def calculateTermFrequency(docVector):
    """Calculates and replaces the raw term frequency in the document vector with the weighted value

    Arguments:
        docVector {defaultdict} -- the document vector represented as a dictionary
    """
    for key, value in docVector.items():
        if value != 0:
            docVector[key] = 1 + log(value, 10)
        else:
            # protecting against invalid log values
            docVector[key] = 0

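# With base-10 sublinear scaling, a raw count of 1 maps to a weight of 1.0,
# 10 maps to 2.0 and 100 maps to 3.0, so repeated terms gain influence only
# logarithmically rather than linearly.
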
def calculateVectorLength(docVector):
    """Calculates and returns the length of a document vector for cosine normalization

    Arguments:
        docVector {dictionary} -- the document vector represented as a dictionary

    Returns:
        float -- the normalization factor for the vector
    """
    total = 0
    for key, value in docVector.items():
        total += value**2
    return sqrt(total)

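# Euclidean length: a vector with weights 3.0 and 4.0, for example, has
# length sqrt(3.0**2 + 4.0**2) = 5.0.
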
def calculateIDF(dictionary, N):
    """Calculates and replaces the document frequency with the weighted inverse document frequency

    Arguments:
        dictionary {dict} -- the dictionary containing the document frequencies
        N {int} -- the total number of documents
    """
    for token, information in dictionary.items():
        information[myDict.DOCUMENT_FREQUENCY] = log(N / information[myDict.DOCUMENT_FREQUENCY], 10)

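# idf = log10(N / df): with N = 1000 documents, a token found in 10 of them
# gets idf = log10(100) = 2.0, while a token found in every document gets 0.
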
def recordData(docId, dictionary, postings, docVector):
    """Records the data from a document vector into the dictionary and postings lists

    Arguments:
        docId {int} -- the document id
        dictionary {myDict} -- the main dictionary
        postings {myPostings} -- the main postings list
        docVector {dict} -- a document vector represented by a dictionary
    """
    for token, weightedTermFreq in docVector.items():
        tokenId = dictionary.addToken(token)
        postings.addTuple(docId, tokenId, weightedTermFreq)

# additional data structures
class myDict():
    TOKEN_ID = 0
    DOCUMENT_FREQUENCY = 1
    POSTING_START = 2
    ID_START = 1

    def __init__(self, fileName):
        self.dictionary = dict()
        self.idDictionary = dict()
        self.fileName = fileName
        self.id = self.ID_START

    def getIdDictionary(self):
        return self.idDictionary

    def getDictionary(self):
        return self.dictionary

    # updates the location of a token's postings given the tokenId
    def updateLocation(self, tokenId, location):
        token = self.idDictionary[tokenId]
        self.dictionary[token][self.POSTING_START] = location

    # adds a vector entry (token) to the dictionary, returning its ID in the dictionary
    def addToken(self, token):
        """Adds a token to the dictionary, returning its ID in the dictionary

        Arguments:
            token {string} -- the token to be added to the dictionary

        Returns:
            int -- the ID of the token in the dictionary
        """
        return_id = 0
        if token in self.dictionary:
            # the token exists, so bump its document frequency
            return_id = self.dictionary[token][self.TOKEN_ID]
            self.dictionary[token][self.DOCUMENT_FREQUENCY] += 1
        else:
            # the token is new, so we assign it a new id
            self.dictionary[token] = [self.id, 1, 0]
            self.idDictionary[self.id] = token
            return_id = self.id
            self.id += 1
        return return_id

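    # Each entry is a list [tokenId, documentFrequency, postingOffset], indexed by
    # the TOKEN_ID, DOCUMENT_FREQUENCY and POSTING_START constants above.
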
    # saves the dictionary to file
    def save(self):
        """This function saves the dictionary to file

        We use pickling to save the dictionary.
        """
        # opening in 'wb' mode truncates any previous contents
        with open(self.fileName, 'wb') as f:
            pickle.dump(self.dictionary, f)

class myPostings():
    # postings are structured as a two-layer dictionary: tokenId to postings list,
    # where each postings list maps a document name to its weighted term frequency
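    # e.g. {3: {10: 1.0, 12: 1.3}} stores documents 10 and 12 under tokenId 3
    # (the numbers here are illustrative, not taken from real data)
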
    def __init__(self):
        self.postings = dict()

    def addTuple(self, docName, tokenId, weightedTokenFreq):
        """Adds the document name and weighted tokenFrequency tuple to the postings list of the respective tokenId

        Arguments:
            docName {int} -- the name of the document
            tokenId {int} -- the id of the token to be added (this represents the token)
            weightedTokenFreq {float} -- the calculated weighted token frequency
        """
        if tokenId in self.postings:
            # the tokenId already exists, so we just add the key and value to it
            posting = self.postings[tokenId]
            posting[docName] = weightedTokenFreq
        else:
            # we initialize a new dictionary for the postings list of that tokenId
            self.postings[tokenId] = dict()
            self.postings[tokenId][docName] = weightedTokenFreq

    def savePostingsToFile(self, lengths, out_postings, dictionary):
        """Saves the postings list to file. We use pickling to store the postings list.

        We store the lengths along with the postings as well.

        Arguments:
            lengths {dict} -- the length of each document represented by a dictionary
            out_postings {string} -- the path to write the postings to
            dictionary {myDict} -- we need the dictionary to update the stored location of the postings
        """
        # opening in 'wb' mode truncates any previous contents
        with open(out_postings, 'wb') as out:
            pickle.dump(lengths, out)
            for tokenId in dictionary.getIdDictionary():
                posting = self.postings[tokenId]
                # record the byte offset of this posting before writing it
                dictionary.updateLocation(tokenId, out.tell())
                pickle.dump(posting, out)

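# A search script could read the file back by unpickling the lengths first, then
# seeking to a token's stored offset and unpickling just that postings list; a
# minimal sketch, assuming the pickled dictionary has already been loaded as d:
#   with open(out_postings, 'rb') as f:
#       lengths = pickle.load(f)
#       f.seek(d[token][myDict.POSTING_START])
#       posting = pickle.load(f)
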
# main method
input_directory = output_file_dictionary = output_file_postings = None

try:
    opts, args = getopt.getopt(sys.argv[1:], 'i:d:p:')
except getopt.GetoptError:
    usage()
    sys.exit(2)

for o, a in opts:
    if o == '-i':  # input directory
        if not a.endswith('/'):
            # bulletproofing: make sure the directory path ends with a separator
            a = a + '/'
        input_directory = a
    elif o == '-d':  # dictionary file
        output_file_dictionary = a
    elif o == '-p':  # postings file
        output_file_postings = a
    else:
        assert False, "unhandled option"

if input_directory is None or output_file_postings is None or output_file_dictionary is None:
    usage()
    sys.exit(2)

build_index(input_directory, output_file_dictionary, output_file_postings)
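
# Example invocation (the file names here are illustrative):
#   python3 index.py -i /path/to/corpus -d dictionary.txt -p postings.txt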