forked from Limour-dev/HelloGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
h_corpus.py
94 lines (81 loc) · 3.44 KB
/
h_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import gzip
def get_all_files_in_directory(directory):
    """Return the paths of all files below *directory*, relative to it.

    The tree is walked recursively with ``os.walk``. Sub-directories and
    file names are visited in sorted order so the result is identical on
    every run; callers that remember a position in the returned list (for
    example to resume reading a corpus) need that order to be stable.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A list of file paths relative to *directory*. Files located directly
        in *directory* appear as bare file names.
    """
    all_files = []
    for root, dirs, files in os.walk(directory):
        # Sorting ``dirs`` in place controls the order in which os.walk
        # descends into the sub-directories.
        dirs.sort()
        # os.walk prefixes every root with ``directory``; cut that prefix
        # and the single separator that may follow it.
        relative_root = root[len(directory):]
        if relative_root.startswith(('\\', '/')):
            relative_root = relative_root[1:]
        all_files.extend(
            os.path.join(relative_root, name) for name in sorted(files)
        )
    return all_files
class Fileset(list):
    """List of file paths below a root directory, with optional reading.

    The list itself stores paths relative to ``root``. Indexing with an
    integer returns the path joined with ``root`` or, when a ``_read``
    callable was supplied, the result of calling it on that joined path.
    Indexing with a slice returns a new ``Fileset`` sharing the same
    ``root`` and ``_read``. Plain iteration uses the built-in list iterator
    and therefore yields the stored relative paths unchanged.
    """

    def __init__(self, path, ext='', _read=None):
        """Collect the files below *path* whose names end with *ext*.

        Args:
            path: Root directory as ``str`` or ``os.PathLike``. Any other
                value (e.g. ``None``) creates an empty set with ``root``
                set to ``None``; this is used internally when slicing.
            ext: Suffix a relative path must end with to be included.
            _read: Optional callable applied to the full path on integer
                indexing.
        """
        super().__init__()
        self.root = None
        self._read = _read
        if isinstance(path, (str, os.PathLike)):
            self.root = os.fspath(path)
            self.extend(
                f for f in get_all_files_in_directory(self.root)
                if f.endswith(ext)
            )

    def __getitem__(self, index):
        """Return one item (integer index) or a sub-``Fileset`` (slice)."""
        if isinstance(index, slice):
            subset = Fileset(None)
            subset.root = self.root
            subset._read = self._read
            subset.extend(super().__getitem__(index))
            return subset
        # Everything that is not a slice is handed to list indexing, so any
        # object implementing ``__index__`` works like a plain int.
        full_path = os.path.join(self.root, super().__getitem__(index))
        if self._read:
            return self._read(full_path)
        return full_path
from tokenizer import tokenizer
# Token id that readOne appends to the list of token ids it builds.
# NOTE(review): presumably the tokenizer's end-of-sequence id -- confirm
# against the tokenizer vocabulary.
token_eos = 2
def readOne(filePath):
    """Encode one UTF-8 text file into a flat list of token ids.

    Paths ending in ``.gz`` are opened through ``gzip`` in text mode, all
    other paths with the built-in ``open``. Each line is encoded separately
    with the module-level ``tokenizer`` and the ids are concatenated;
    ``token_eos`` is appended once after the last line.

    Args:
        filePath: Path of the (optionally gzip-compressed) text file.

    Returns:
        The list of token ids followed by ``token_eos``.
    """
    if filePath.endswith('.gz'):
        stream = gzip.open(filePath, 'rt', encoding='utf-8')
    else:
        stream = open(filePath, encoding='utf-8')
    token_ids = []
    with stream as handle:
        for text_line in handle:
            token_ids.extend(tokenizer.encode(text_line).ids)
    token_ids.append(token_eos)
    return token_ids
class Hcorpus():
    """Endless stream of token ids read file by file from a corpus directory.

    Calling the instance returns the next ``size`` token ids. Files are
    loaded through ``Fileset`` with ``readOne`` as reader; after the last
    file the stream wraps around to the first one. ``repr()`` yields a
    constructor call that re-opens the corpus at the current position.

    Attributes kept on the instance:
        fileset_idx: index of the NEXT file to load.
        fileset_sub_idx: offset of the read position inside the most
            recently loaded file.
        cache / cache_idx: buffered token ids and the read position in them.
    """

    def __init__(self, path, ext='txt', fileset_idx=0, fileset_sub_idx=0):
        """Open the corpus below *path* and position the stream.

        Args:
            path: Root directory of the corpus.
            ext: Suffix a file name must end with to be part of the corpus.
            fileset_idx: Index of the file to start with; values past the
                end fall back to 0.
            fileset_sub_idx: Token offset inside that file; negative values
                are replaced by 0.

        Raises:
            IndexError: If no file below *path* ends with *ext*.
        """
        self.ext = ext  # remembered so that __repr__ can reproduce it
        self.fileset = Fileset(path, ext, readOne)
        if not self.fileset:
            raise IndexError(
                f"no files ending with {ext!r} found under {path!r}"
            )
        self.fileset_idx = fileset_idx
        self.fileset_sub_idx = fileset_sub_idx
        if self.fileset_sub_idx < 0:
            # Re-reading the previous file would be too complicated,
            # so simply start at the beginning of this one.
            self.fileset_sub_idx = 0
        if self.fileset_idx >= len(self.fileset):
            self.fileset_idx = 0
        self.cache = self.fileset[self.fileset_idx]
        self.fileset_idx += 1
        self.cache_idx = self.fileset_sub_idx

    def __call__(self, size=512):
        """Return the next *size* token ids, loading more files as needed."""
        while len(self.cache) < self.cache_idx + size:
            if self.fileset_idx >= len(self.fileset):
                self.fileset_idx = 0  # wrap around to the first file
            # Negative count of the tokens still unread from earlier files;
            # adding ``size`` below turns it into the offset inside the file
            # loaded last.
            self.fileset_sub_idx = self.cache_idx - len(self.cache)
            self.cache = self.cache[self.cache_idx:] + self.fileset[self.fileset_idx]
            self.cache_idx = 0
            self.fileset_idx += 1
        retn = self.cache[self.cache_idx:self.cache_idx + size]
        self.cache_idx += size
        self.fileset_sub_idx += size
        return retn

    def __repr__(self):
        """Return a constructor call that resumes at the current position.

        ``ext`` is only included when it differs from the constructor
        default, so the output for default corpora is unchanged.
        """
        ext_part = '' if self.ext == 'txt' else f", ext={self.ext!r}"
        return f"Hcorpus(r'{self.fileset.root}'{ext_part}, fileset_idx={self.fileset_idx-1}, fileset_sub_idx={self.fileset_sub_idx})"
if __name__ == '__main__':
    # Manual smoke test: decode and print the first chunk of the corpus.
    #
    # Earlier experiments, kept for reference:
    #   filePath = r'D:\datasets\h-corpus'
    #   tmp = Hcorpus(filePath)
    #   for i in range(10):
    #       tmp2 = tmp()
    #       tmp3 = tokenizer.decode(tmp2)
    #       print(tmp2, '\n', tmp3, '\n', len(tmp3)/len(tmp2))
    #   print(tmp)
    #   print(tokenizer.decode(tmp()))
    #   tmp = Hcorpus(r'D:\datasets\h-corpus', fileset_idx=1, fileset_sub_idx=132)
    #   print(tokenizer.decode(tmp()))
    #   print(tmp)
    #
    # The '.txt.gz' suffix selects files compressed with `pigz *`; PowerShell
    # one-liner noted for compressing the sub-directories:
    #   2..20 | ForEach-Object {cd "D:\datasets\h-corpus\h-corpus-s$($_.ToString('D2'))"; pigz *.txt}
    corpus_dir = r'D:\datasets\h-corpus'
    corpus = Hcorpus(corpus_dir, ext='.txt.gz')
    first_chunk = corpus()
    print(tokenizer.decode(first_chunk))