Skip to content

Commit

Permalink
fix(data_process): Reduce memory usage
Browse files (browse the repository at this point in the history)
  • Loading branch information
noahc1510 committed Mar 30, 2024
1 parent 9bfa16b commit fdf03ce
Showing 1 changed file with 31 additions and 26 deletions.
57 changes: 31 additions & 26 deletions data_process.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -245,9 +245,9 @@ def _internal_wudao_process(idx, slice_data):
# process_wiki_clean()
# process_medical('./data/shibing624_medical/pretrain/medical_book_zh.json','book')
# process_medical('./data/shibing624_medical/pretrain/train_encyclopedia.json','encyclopedia')
process_baidu()
process_c4()
max_idx = process_wudao(slice_size=10)
# process_baidu()
# process_c4()
# max_idx = process_wudao(slice_size=10)

# print('data processing finished!')

Expand All @@ -270,31 +270,36 @@ def _internal_wudao_process(idx, slice_data):
'./data/c4_zh_6.bin',
'./data/c4_zh_7.bin',
'./data/c4_zh_8.bin',
# './data/wudaocorpus_zh_0.bin',
# './data/wudaocorpus_zh_1.bin',
# './data/wudaocorpus_zh_2.bin',
# './data/wudaocorpus_zh_3.bin',
# './data/wudaocorpus_zh_4.bin',
# './data/wudaocorpus_zh_5.bin',
# './data/wudaocorpus_zh_6.bin',
# './data/wudaocorpus_zh_7.bin',
# './data/wudaocorpus_zh_8.bin',
# './data/wudaocorpus_zh_9.bin',
# './data/wudaocorpus_zh_10.bin',
# './data/wudaocorpus_zh_11.bin',
# './data/wudaocorpus_zh_12.bin',
# './data/wudaocorpus_zh_13.bin',
# './data/wudaocorpus_zh_14.bin',
# './data/wudaocorpus_zh_15.bin',
# './data/wudaocorpus_zh_16.bin',
].extend([f'./data/wudaocorpus_zh_{i}.bin' for i in range(max_idx)])
'./data/wudaocorpus_zh_0.bin',
'./data/wudaocorpus_zh_1.bin',
'./data/wudaocorpus_zh_2.bin',
'./data/wudaocorpus_zh_3.bin',
'./data/wudaocorpus_zh_4.bin',
'./data/wudaocorpus_zh_5.bin',
'./data/wudaocorpus_zh_6.bin',
'./data/wudaocorpus_zh_7.bin',
'./data/wudaocorpus_zh_8.bin',
'./data/wudaocorpus_zh_9.bin',
'./data/wudaocorpus_zh_10.bin',
'./data/wudaocorpus_zh_11.bin',
'./data/wudaocorpus_zh_12.bin',
'./data/wudaocorpus_zh_13.bin',
'./data/wudaocorpus_zh_14.bin',
'./data/wudaocorpus_zh_15.bin',
'./data/wudaocorpus_zh_16.bin',
] \
# .extend([f'./data/wudaocorpus_zh_{i}.bin' for i in range(max_idx)])

if os.path.exists('./data/pretrain_data.bin'):
print("Warning: The pretrain data is existed, "
"your operation will be added at the end of the file.")

data_lst = []
for data_path in tqdm(data_path_list):
data_lst = []
with open(data_path, 'rb') as f:
data = np.fromfile(f, dtype=np.uint16)
data_lst.append(data)
arr = np.concatenate(data_lst)
print(arr.shape)
with open('./data/pretrain_data.bin', 'wb') as f:
f.write(arr.tobytes())
arr = np.concatenate(data_lst)
# print(arr.shape)
with open('./data/pretrain_data.bin', 'ab') as f:
f.write(arr.tobytes())

0 comments on commit fdf03ce

Please sign in to comment.