# readers_11.py

## Rebuilds the dictionary from dict_file.
## Returns a dict called pointers:
##   for token t, pointers[t] -> [pointerAddress, df, bytesToRead]
## The dictionary file opens with metadata lines (see metaData below);
## every subsequent line is (token, df, pointer, bytesToRead).
## The postings file starts with the docID -> [docName, score] mapping;
## after that, each token's entries are stored as doc_num<space>tf<?>.
def pointers_to_post(dict_file):
    ## pointers maps each token to [pointer, df, bytesToRead]
    pointers = {}
    with open(dict_file, "rb") as binary_file:
        file_content = binary_file.read()
    lines = file_content.split(b'\n')
    # Token entries start after the metadata header.
    for line in lines[3:]:
        words = line.decode().split()
        if len(words) == 4:
            # words = [token, df, pointer, bytesToRead]
            pointers[words[0]] = [int(words[2]), int(words[1]), int(words[3])]
    return pointers
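
# A minimal sketch of the expected dictionary rows (values hypothetical):
# a line such as b"apple 3 1024 57" yields
#   pointers["apple"] == [1024, 3, 57]
# i.e. the postings for "apple" start at byte 1024 of the postings file,
# span 57 bytes, and "apple" occurs in 3 documents.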

## Returns the metadata saved in the first two lines of the dictionary:
##   the pointer to the start of the docID -> filename mapping in the
##   postings list, the number of documents, the size of each docID
##   identifier, the size of tf_string in postings,
##   compression (0 = none, 1 = yes) and tokeniser (0 = standard, 1 = BPE).
def metaData(dict_file):
    with open(dict_file, "rb") as binary_file:
        # Read the first header line byte by byte, stopping at newline or EOF.
        lineA = b""
        while True:
            char = binary_file.read(1)
            if char == b'\n' or not char:
                break
            lineA += char
        words1 = lineA.decode().split()
        # Read the second header line the same way.
        lineB = b""
        while True:
            char = binary_file.read(1)
            if char == b'\n' or not char:
                break
            lineB += char
        words2 = lineB.decode().split()
        # Numeric values follow their text labels on each line.
        return [int(words1[1]), int(words1[3]),
                int(words2[1]), int(words2[3]), int(words2[5]), int(words2[7])]
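
# A sketch of the two header lines this parser expects; the label words are
# assumptions, only the numeric fields at the odd indexes matter:
#   mapPointer 2048 numDocs 500
#   sizeID 8 tfSize 4 compression 1 tokeniser 0
# -> [2048, 500, 8, 4, 1, 0]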

## Thin wrapper around metaData; gives
## [mapPointerStart, numDocs, sizeID, tfSize, compression, tokeniser].
def getVals(dict_file):
    return metaData(dict_file)

## Generates the map from docID -> [docName, docScore].
def getMapping(postings_file, dict_file):
    map_docID_to_doc = {}
    [_, num_docs, _, _, _, _] = getVals(dict_file)
    with open(postings_file, 'rb') as pf:
        # The first num_docs lines of the postings file hold the mapping.
        for i in range(num_docs):
            lineB = b""
            while True:
                char = pf.read(1)
                if char == b'\n' or not char:  # guard against EOF
                    break
                lineB += char
            words = lineB.decode().split()
            map_docID_to_doc[int(words[0])] = [words[1], float(words[2])]
    return map_docID_to_doc
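
# Example usage (file names taken from the commented test calls below;
# the returned values are hypothetical):
#   mapping = getMapping("postings_list.txt", "dictionary.txt")
#   mapping[42] -> ["doc42.txt", 0.87]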

## Returns the docIDs and term frequencies from term's postings list.
def getDocs(term, pointers, postings_file):
    if term in pointers:
        [pointer_start, _, to_read] = pointers[term]
    else:
        # Unknown term: nothing to read.
        return [], []
    with open(postings_file, 'rb') as pf:
        pf.seek(pointer_start)
        line = pf.read(to_read)
    doc_numbers = []
    doc_tfs = []
    # Entries are '?'-separated "doc_num tf" pairs.
    for l in line.split(b'?'):
        ls = l.split()
        if len(ls) == 2:
            doc_numbers.append(int(ls[0]))
            doc_tfs.append(int(ls[1]))
    return doc_numbers, doc_tfs
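
# A minimal end-to-end sketch (file names from the commented test calls
# below; output shape only, values hypothetical):
#   pointers = pointers_to_post("dictionary_vbe.txt")
#   doc_numbers, doc_tfs = getDocs("site", pointers, "ps_vbe.txt")
# doc_numbers[i] is a docID and doc_tfs[i] is its term frequency for "site".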

## Ad-hoc test calls, left commented out:
#print(metaData("dictionary_vbe.txt"))
#pointers = pointers_to_post("dictionary_vbe.txt")
#for key in pointers.keys():
#    print("key = " + key + " maps to " + str(pointers[key]))
#print(getDocs("site", pointers, "ps_vbe.txt"))
#getVals("dictionary.txt")
#getMapping("postings_list.txt", "dictionary.txt")

## Reads back the BPE merge list (default ifile: merges_compressed_bpe.txt).
def get_merges(ifile):
    merges = []
    with open(ifile, 'rb') as ifi:
        # Skip the first two header lines, stopping at newline or EOF.
        for _ in range(2):
            while True:
                char = ifi.read(1)
                if char == b'\n' or not char:
                    break
        # The third line holds the '?'-separated merge pairs.
        lineB = b""
        while True:
            char = ifi.read(1)
            if char == b'\n' or not char:
                break
            lineB += char
        words = lineB.decode().split("?")
        # split leaves an empty string after the trailing '?'; drop it.
        for word in words[:-1]:
            wordi = word.split()
            merges.append((wordi[0], wordi[1]))
    return merges
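
# A sketch of the merges file this reader expects (contents hypothetical;
# the first two lines are skipped, the third holds '?'-separated pairs):
#   <header line 1>
#   <header line 2>
#   t h?th e?in g?
# -> [("t", "h"), ("th", "e"), ("in", "g")]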