From ca37e1fe1ade171f6530a4b463d77e58a9450790 Mon Sep 17 00:00:00 2001
From: Noah
Date: Sat, 30 Mar 2024 09:38:02 +0800
Subject: [PATCH 1/3] fix: Fixed AttributeError: 'ChatGLMTokenizer' object has no attribute 'tokenizer'. Tested on transformers==4.38.1 (#45, #63)

---
 chatglm_tokenizer/tokenization_chatglm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chatglm_tokenizer/tokenization_chatglm.py b/chatglm_tokenizer/tokenization_chatglm.py
index d4ce416..96aa0bd 100644
--- a/chatglm_tokenizer/tokenization_chatglm.py
+++ b/chatglm_tokenizer/tokenization_chatglm.py
@@ -66,11 +66,11 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
     model_input_names = ["input_ids", "attention_mask", "position_ids"]

     def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs):
+        self.tokenizer = SPTokenizer(vocab_file)
         super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)
         self.name = "GLMTokenizer"

         self.vocab_file = vocab_file
-        self.tokenizer = SPTokenizer(vocab_file)
         self.special_tokens = {
             "<bos>": self.tokenizer.bos_id,
             "<eos>": self.tokenizer.eos_id,

From 9bfa16b7d580b8a334081102fd56fd8459203106 Mon Sep 17 00:00:00 2001
From: Noah
Date: Sat, 30 Mar 2024 10:00:35 +0800
Subject: [PATCH 2/3] fix: ambiguous process_wudao

---
 data_process.py | 366 +++++++++++++++++++++++++-----------------------
 1 file changed, 194 insertions(+), 172 deletions(-)

diff --git a/data_process.py b/data_process.py
index 80dca8b..9cde477 100644
--- a/data_process.py
+++ b/data_process.py
@@ -4,40 +4,44 @@ from tqdm import tqdm
 from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer
 import pandas as pd
-#from zhconv import convert
+
+
+# from zhconv import convert
 def process_wiki_clean():
-    with open('./data/wikipedia_cn_20230720/wikipedia-cn-20230720-filtered.json','r',encoding='utf-8') as f:
-        data=json.load(f)
-    doc_ids=[]
+    with open('./data/wikipedia_cn_20230720/wikipedia-cn-20230720-filtered.json', 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    doc_ids = []
     for line in tqdm(data):
-        text=line['completion']
-        text_id=tokenizer.encode(text,add_special_tokens=False)
+        text = line['completion']
+        text_id = tokenizer.encode(text, add_special_tokens=False)
         text_id.append(tokenizer.special_tokens['<eos>'])
-        if len(text_id)>5:
-            doc_ids+=text_id
-    arr = np.array(doc_ids,dtype=np.uint16)
-    with open('./data/wiki.bin','wb') as f:
+        if len(text_id) > 5:
+            doc_ids += text_id
+    arr = np.array(doc_ids, dtype=np.uint16)
+    with open('./data/wiki.bin', 'wb') as f:
         f.write(arr.tobytes())

-def process_medical(data_path,name):
-    f=open(data_path,'r',encoding='utf-8')
-    doc_ids=[]
+
+def process_medical(data_path, name):
+    f = open(data_path, 'r', encoding='utf-8')
+    doc_ids = []
     while True:
-        line=f.readline()
+        line = f.readline()
         if not line:
             break
-        line=json.loads(line)
-        text=line['text']
-        text_id=tokenizer.encode(text,add_special_tokens=False)
+        line = json.loads(line)
+        text = line['text']
+        text_id = tokenizer.encode(text, add_special_tokens=False)
         text_id.append(tokenizer.special_tokens['<eos>'])
-        if len(text_id)>5:
-            doc_ids+=text_id
-    arr = np.array(doc_ids,dtype=np.uint16)
-    with open('./data/medical_{}.bin'.format(name),'wb') as f:
-        f.write(arr.tobytes())
+        if len(text_id) > 5:
+            doc_ids += text_id
+    arr = np.array(doc_ids, dtype=np.uint16)
+    with open('./data/medical_{}.bin'.format(name), 'wb') as f:
+        f.write(arr.tobytes())
+

 def sft_to_pretrain():
-    doc_ids=[]
+    doc_ids = []
     '''
    df=pd.read_csv('./data/medical_qa_144w.csv')
@@ -53,185 +57,202 @@ def sft_to_pretrain():
             doc_ids+=text_id
     '''
-    with open('./data/shibing624_medical/finetune/train_en_1.json','r',encoding='utf-8') as f:
+    with open('./data/shibing624_medical/finetune/train_en_1.json', 'r', encoding='utf-8') as f:
         for row in f:
-            line=json.loads(row)
-            q=line['input']
-            a=line['output']
-            q_id=tokenizer.encode(q,add_special_tokens=False)
-            a_id=tokenizer.encode(a,add_special_tokens=False)
-            text_id=q_id+a_id+[tokenizer.special_tokens['<eos>']]
-            if len(text_id)>5:
-                doc_ids+=text_id
-    with open('./data/shibing624_medical/finetune/test_en_1.json','r',encoding='utf-8') as f:
+            line = json.loads(row)
+            q = line['input']
+            a = line['output']
+            q_id = tokenizer.encode(q, add_special_tokens=False)
+            a_id = tokenizer.encode(a, add_special_tokens=False)
+            text_id = q_id + a_id + [tokenizer.special_tokens['<eos>']]
+            if len(text_id) > 5:
+                doc_ids += text_id
+    with open('./data/shibing624_medical/finetune/test_en_1.json', 'r', encoding='utf-8') as f:
         for row in f:
-            line=json.loads(row)
-            q=line['input']
-            a=line['output']
-            q_id=tokenizer.encode(q,add_special_tokens=False)
-            a_id=tokenizer.encode(a,add_special_tokens=False)
-            text_id=q_id+a_id+[tokenizer.special_tokens['<eos>']]
-            if len(text_id)>5:
-                doc_ids+=text_id
-    with open('./data/shibing624_medical/finetune/valid_en_1.json','r',encoding='utf-8') as f:
+            line = json.loads(row)
+            q = line['input']
+            a = line['output']
+            q_id = tokenizer.encode(q, add_special_tokens=False)
+            a_id = tokenizer.encode(a, add_special_tokens=False)
+            text_id = q_id + a_id + [tokenizer.special_tokens['<eos>']]
+            if len(text_id) > 5:
+                doc_ids += text_id
+    with open('./data/shibing624_medical/finetune/valid_en_1.json', 'r', encoding='utf-8') as f:
         for row in f:
-            line=json.loads(row)
-            q=line['input']
-            a=line['output']
-            q_id=tokenizer.encode(q,add_special_tokens=False)
-            a_id=tokenizer.encode(a,add_special_tokens=False)
-            text_id=q_id+a_id+[tokenizer.special_tokens['<eos>']]
-            if len(text_id)>5:
-                doc_ids+=text_id
+            line = json.loads(row)
+            q = line['input']
+            a = line['output']
+            q_id = tokenizer.encode(q, add_special_tokens=False)
+            a_id = tokenizer.encode(a, add_special_tokens=False)
+            text_id = q_id + a_id + [tokenizer.special_tokens['<eos>']]
+            if len(text_id) > 5:
+                doc_ids += text_id

-    with open('./data/shibing624_medical/finetune/train_zh_0.json','r',encoding='utf-8') as f:
+    with open('./data/shibing624_medical/finetune/train_zh_0.json', 'r', encoding='utf-8') as f:
         for row in f:
-            line=json.loads(row)
-            q=line['instruction']+line['input']
-            a=line['output']
-            q_id=tokenizer.encode(q,add_special_tokens=False)
-            a_id=tokenizer.encode(a,add_special_tokens=False)
-            text_id=q_id+a_id+[tokenizer.special_tokens['<eos>']]
-            if len(text_id)>5:
-                doc_ids+=text_id
-    with open('./data/shibing624_medical/finetune/test_zh_0.json','r',encoding='utf-8') as f:
+            line = json.loads(row)
+            q = line['instruction'] + line['input']
+            a = line['output']
+            q_id = tokenizer.encode(q, add_special_tokens=False)
+            a_id = tokenizer.encode(a, add_special_tokens=False)
+            text_id = q_id + a_id + [tokenizer.special_tokens['<eos>']]
+            if len(text_id) > 5:
+                doc_ids += text_id
+    with open('./data/shibing624_medical/finetune/test_zh_0.json', 'r', encoding='utf-8') as f:
         for row in f:
-            line=json.loads(row)
-            q=line['instruction']+line['input']
-            a=line['output']
-            q_id=tokenizer.encode(q,add_special_tokens=False)
-            a_id=tokenizer.encode(a,add_special_tokens=False)
-            text_id=q_id+a_id+[tokenizer.special_tokens['<eos>']]
-            if len(text_id)>5:
-                doc_ids+=text_id
-    with open('./data/shibing624_medical/finetune/valid_zh_0.json','r',encoding='utf-8') as f:
+            line = json.loads(row)
+            q = line['instruction'] + line['input']
+            a = line['output']
+            q_id = tokenizer.encode(q, add_special_tokens=False)
+            a_id = tokenizer.encode(a, add_special_tokens=False)
+            text_id = q_id + a_id + [tokenizer.special_tokens['<eos>']]
+            if len(text_id) > 5:
+                doc_ids += text_id
+    with open('./data/shibing624_medical/finetune/valid_zh_0.json', 'r', encoding='utf-8') as f:
         for row in f:
-            line=json.loads(row)
-            q=line['instruction']+line['input']
-            a=line['output']
-            q_id=tokenizer.encode(q,add_special_tokens=False)
-            a_id=tokenizer.encode(a,add_special_tokens=False)
-            text_id=q_id+a_id+[tokenizer.special_tokens['<eos>']]
-            if len(text_id)>5:
-                doc_ids+=text_id
+            line = json.loads(row)
+            q = line['instruction'] + line['input']
+            a = line['output']
+            q_id = tokenizer.encode(q, add_special_tokens=False)
+            a_id = tokenizer.encode(a, add_special_tokens=False)
+            text_id = q_id + a_id + [tokenizer.special_tokens['<eos>']]
+            if len(text_id) > 5:
+                doc_ids += text_id

-    arr = np.array(doc_ids,dtype=np.uint16)
+    arr = np.array(doc_ids, dtype=np.uint16)
     print(arr.shape)
-    with open('./data/medical_qa.bin','wb') as f:
+    with open('./data/medical_qa.bin', 'wb') as f:
         f.write(arr.tobytes())
+

 def process_baidu():
     BATCH_SIZE = 1000000

-    cnt=0
-    batch_cnt=0
-    token=0
-    doc_ids=[]
+    cnt = 0
+    batch_cnt = 0
+    token = 0
+    doc_ids = []
+
+    f1 = open('./data/563w_baidubaike/563w_baidubaike.json', 'r', encoding='utf-8')

-    f1=open('./data/563w_baidubaike/563w_baidubaike.json','r',encoding='utf-8')
-
     while True:
         line = f1.readline()
         if not line:
             break
-        line=json.loads(line)
-        text=''
+        line = json.loads(line)
+        text = ''
         try:
-            text+=line['title']+':'+line['summary']
+            text += line['title'] + ':' + line['summary']
         except:
             pass
         for per in line['sections']:
-            text+=per['title']+':'+per['content']+'。'
-        text_id=tokenizer.encode(text,add_special_tokens=False)
+            text += per['title'] + ':' + per['content'] + '。'
+        text_id = tokenizer.encode(text, add_special_tokens=False)
         text_id.append(tokenizer.special_tokens['<eos>'])
-        if len(text_id)>5:
-            doc_ids+=text_id
-            cnt+=1
-            if cnt%BATCH_SIZE==0:
-                batch_cnt+=1
-                arr = np.array(doc_ids,dtype=np.uint16)
-                doc_ids=[]
-                print('cnt:',cnt,'arr_shape:',arr.shape)
-                with open('./data/baidubaike_563w_{}.bin'.format(batch_cnt),'wb') as f2:
+        if len(text_id) > 5:
+            doc_ids += text_id
+            cnt += 1
+            if cnt % BATCH_SIZE == 0:
+                batch_cnt += 1
+                arr = np.array(doc_ids, dtype=np.uint16)
+                doc_ids = []
+                print('cnt:', cnt, 'arr_shape:', arr.shape)
+                with open('./data/baidubaike_563w_{}.bin'.format(batch_cnt), 'wb') as f2:
                     f2.write(arr.tobytes())
                 del arr

     if not doc_ids:
-        batch_cnt+=1
-        arr = np.array(doc_ids,dtype=np.uint16)
-        print('cnt:',cnt,'arr_shape:',arr.shape)
-        with open('./data/baidubaike_563w_{}.bin'.format(batch_cnt),'wb') as f:
+        batch_cnt += 1
+        arr = np.array(doc_ids, dtype=np.uint16)
+        print('cnt:', cnt, 'arr_shape:', arr.shape)
+        with open('./data/baidubaike_563w_{}.bin'.format(batch_cnt), 'wb') as f:
             f.write(arr.tobytes())
-
+
+
 def process_c4():
     c4_zh_paths = glob.glob('./data/c4_zh/*')
-    c4_zh_paths=sorted(c4_zh_paths)
+    c4_zh_paths = sorted(c4_zh_paths)
     print(len(c4_zh_paths))
-    cnt=0
-    token=0
-    doc_ids=[]
+    cnt = 0
+    token = 0
+    doc_ids = []
     for per in tqdm(c4_zh_paths):
-        with open(per,'r') as f:
+        with open(per, 'r') as f:
             for line in f:
                 text = json.loads(line)
                 text = text['text']
-                text_id=tokenizer.encode(text,add_special_tokens=False)
+                text_id = tokenizer.encode(text, add_special_tokens=False)
                 text_id.append(tokenizer.special_tokens['<eos>'])
-                if len(text_id)>5:
-                    doc_ids+=text_id
-                    cnt+=1
+                if len(text_id) > 5:
+                    doc_ids += text_id
+                    cnt += 1

-    arr = np.array(doc_ids,dtype=np.uint16)
-    with open('./data/c4_zh.bin','wb') as f:
+    arr = np.array(doc_ids, dtype=np.uint16)
+    with open('./data/c4_zh.bin', 'wb') as f:
         f.write(arr.tobytes())
     print(arr.shape)

-def process_wudao():
+
+def process_wudao(slice_size):
     wudao_zh_paths = glob.glob('./data/WuDaoCorpus2.0_base_200G/*')
-    wudao_zh_paths=sorted(wudao_zh_paths)
-    print(len(wudao_zh_paths))#there are many shard files
-    cnt=0
-    token=0
-    doc_ids=[]
-    for per in tqdm(wudao_zh_paths[320:]):#wudao_zh_paths[i:j]: slice manually and process piece by piece; the corpus is too large to handle in one pass
-        with open(per,'r') as f:
-            data=json.load(f)
-            for text in data:
-                text = text['title'] + text['content']
-                text_id=tokenizer.encode(text,add_special_tokens=False)
-                text_id.append(tokenizer.special_tokens['<eos>'])
-                if len(text_id)>5:
-                    doc_ids+=text_id
-                    #
-                    # if cnt%10000==0:
-                    #     print(cnt)
-                    cnt+=1
-        #token+=len(text_id)
-        #break
-    #
-    # arr = np.array(doc_ids,dtype=np.uint16)
-    # with open('./data/c4-zh/{}.bin'.format(per.split('/')[-1].split('.')[0]),'wb') as f:
-    #     f.write(arr.tobytes())
-    # print(arr.shape)
-    arr = np.array(doc_ids,dtype=np.uint16)
-    with open('./data/wudaocorpus_zh_16.bin','wb') as f:
-        f.write(arr.tobytes())
-    print(arr.shape)
+    wudao_zh_paths = sorted(wudao_zh_paths)
+    print(len(wudao_zh_paths))  # there are many shard files

-if __name__=="__main__":
+    def _internal_wudao_process(idx, slice_data):
+        cnt = 0
+        token = 0
+        doc_ids = []
+        for per in tqdm(slice_data):  # wudao_zh_paths[i:j]: slice manually and process piece by piece; the corpus is too large to handle in one pass
+            with open(per, 'r') as f:
+                data = json.load(f)
+                for text in data:
+                    text = text['title'] + text['content']
+                    text_id = tokenizer.encode(text, add_special_tokens=False)
+                    text_id.append(tokenizer.special_tokens['<eos>'])
+                    if len(text_id) > 5:
+                        doc_ids += text_id
+                        #
+                        # if cnt%10000==0:
+                        #     print(cnt)
+                        cnt += 1
+            # token+=len(text_id)
+            # break
+        #
+        # arr = np.array(doc_ids,dtype=np.uint16)
+        # with open('./data/c4-zh/{}.bin'.format(per.split('/')[-1].split('.')[0]),'wb') as f:
+        #     f.write(arr.tobytes())
+        # print(arr.shape)
+        arr = np.array(doc_ids, dtype=np.uint16)
+        with open(f'./data/wudaocorpus_zh_{idx}.bin', 'wb') as f:
+            f.write(arr.tobytes())
+        print(arr.shape)
+
+    max_idx = 0
+    for idx, i in enumerate(range(len(wudao_zh_paths))[::slice_size]):
+        if i + slice_size < len(wudao_zh_paths):
+            slice_data = wudao_zh_paths[i:i + slice_size]
+        else:
+            slice_data = wudao_zh_paths[i:]
+        _internal_wudao_process(idx, slice_data)
+        max_idx = idx
+
+    return max_idx
+
+
+if __name__ == "__main__":
     tokenizer = ChatGLMTokenizer(vocab_file='./chatglm_tokenizer/tokenizer.model')

     # Data preprocessing - the following calls can be skipped if you downloaded the already tokenized data
     # process_wiki_clean()
     # process_medical('./data/shibing624_medical/pretrain/medical_book_zh.json','book')
     # process_medical('./data/shibing624_medical/pretrain/train_encyclopedia.json','encyclopedia')
-    # process_baidu()
-    # process_c4()
-    # process_wudao()
+    process_baidu()
+    process_c4()
+    max_idx = process_wudao(slice_size=10)

     # print('data processing finished!')

     # List of tokenized .bin files
-    data_path_list=[
+    data_path_list = [
         './data/baidubaike_563w_1.bin',
         './data/baidubaike_563w_2.bin',
         './data/baidubaike_563w_3.bin',
@@ -249,30 +270,31 @@ def process_wudao():
         './data/c4_zh_6.bin',
         './data/c4_zh_7.bin',
         './data/c4_zh_8.bin',
-        './data/wudaocorpus_zh_0.bin',
-        './data/wudaocorpus_zh_1.bin',
-        './data/wudaocorpus_zh_2.bin',
-        './data/wudaocorpus_zh_3.bin',
-        './data/wudaocorpus_zh_4.bin',
-        './data/wudaocorpus_zh_5.bin',
-        './data/wudaocorpus_zh_6.bin',
-        './data/wudaocorpus_zh_7.bin',
-        './data/wudaocorpus_zh_8.bin',
-        './data/wudaocorpus_zh_9.bin',
-        './data/wudaocorpus_zh_10.bin',
-        './data/wudaocorpus_zh_11.bin',
-        './data/wudaocorpus_zh_12.bin',
-        './data/wudaocorpus_zh_13.bin',
-        './data/wudaocorpus_zh_14.bin',
-        './data/wudaocorpus_zh_15.bin',
-        './data/wudaocorpus_zh_16.bin',
-    ]
-    data_lst=[]
+        # './data/wudaocorpus_zh_0.bin',
+        # './data/wudaocorpus_zh_1.bin',
+        # './data/wudaocorpus_zh_2.bin',
+        # './data/wudaocorpus_zh_3.bin',
+        # './data/wudaocorpus_zh_4.bin',
+        # './data/wudaocorpus_zh_5.bin',
+        # './data/wudaocorpus_zh_6.bin',
+        # './data/wudaocorpus_zh_7.bin',
+        # './data/wudaocorpus_zh_8.bin',
+        # './data/wudaocorpus_zh_9.bin',
+        # './data/wudaocorpus_zh_10.bin',
+        # './data/wudaocorpus_zh_11.bin',
+        # './data/wudaocorpus_zh_12.bin',
+        # './data/wudaocorpus_zh_13.bin',
+        # './data/wudaocorpus_zh_14.bin',
+        # './data/wudaocorpus_zh_15.bin',
+        # './data/wudaocorpus_zh_16.bin',
+    ].extend([f'./data/wudaocorpus_zh_{i}.bin' for i in range(max_idx)])
+
+    data_lst = []
     for data_path in tqdm(data_path_list):
-        with open(data_path,'rb') as f:
-            data=np.fromfile(f,dtype=np.uint16)
+        with open(data_path, 'rb') as f:
+            data = np.fromfile(f, dtype=np.uint16)
             data_lst.append(data)
     arr = np.concatenate(data_lst)
     print(arr.shape)
-    with open('./data/pretrain_data.bin','wb') as f:
+    with open('./data/pretrain_data.bin', 'wb') as f:
         f.write(arr.tobytes())

From 8111eb2111bad63272646d2389913158b4e652f5 Mon Sep 17 00:00:00 2001
From: Noah
Date: Sat, 30 Mar 2024 14:41:10 +0800
Subject: [PATCH 3/3] fix(data_process): Reduce memory usage

---
 data_process.py | 58 +++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/data_process.py b/data_process.py
index 9cde477..6d81389 100644
--- a/data_process.py
+++ b/data_process.py
@@ -4,6 +4,7 @@ from tqdm import tqdm
 from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer
 import pandas as pd
+import os

 # from zhconv import convert
@@ -245,9 +246,9 @@ def _internal_wudao_process(idx, slice_data):
     # process_wiki_clean()
     # process_medical('./data/shibing624_medical/pretrain/medical_book_zh.json','book')
     # process_medical('./data/shibing624_medical/pretrain/train_encyclopedia.json','encyclopedia')
-    process_baidu()
-    process_c4()
-    max_idx = process_wudao(slice_size=10)
+    # process_baidu()
+    # process_c4()
+    # max_idx = process_wudao(slice_size=10)

     # print('data processing finished!')

@@ -270,31 +271,36 @@ def _internal_wudao_process(idx, slice_data):
         './data/c4_zh_6.bin',
         './data/c4_zh_7.bin',
         './data/c4_zh_8.bin',
-        # './data/wudaocorpus_zh_0.bin',
-        # './data/wudaocorpus_zh_1.bin',
-        # './data/wudaocorpus_zh_2.bin',
-        # './data/wudaocorpus_zh_3.bin',
-        # './data/wudaocorpus_zh_4.bin',
-        # './data/wudaocorpus_zh_5.bin',
-        # './data/wudaocorpus_zh_6.bin',
-        # './data/wudaocorpus_zh_7.bin',
-        # './data/wudaocorpus_zh_8.bin',
-        # './data/wudaocorpus_zh_9.bin',
-        # './data/wudaocorpus_zh_10.bin',
-        # './data/wudaocorpus_zh_11.bin',
-        # './data/wudaocorpus_zh_12.bin',
-        # './data/wudaocorpus_zh_13.bin',
-        # './data/wudaocorpus_zh_14.bin',
-        # './data/wudaocorpus_zh_15.bin',
-        # './data/wudaocorpus_zh_16.bin',
-    ].extend([f'./data/wudaocorpus_zh_{i}.bin' for i in range(max_idx)])
+        './data/wudaocorpus_zh_0.bin',
+        './data/wudaocorpus_zh_1.bin',
+        './data/wudaocorpus_zh_2.bin',
+        './data/wudaocorpus_zh_3.bin',
+        './data/wudaocorpus_zh_4.bin',
+        './data/wudaocorpus_zh_5.bin',
+        './data/wudaocorpus_zh_6.bin',
+        './data/wudaocorpus_zh_7.bin',
+        './data/wudaocorpus_zh_8.bin',
+        './data/wudaocorpus_zh_9.bin',
+        './data/wudaocorpus_zh_10.bin',
+        './data/wudaocorpus_zh_11.bin',
+        './data/wudaocorpus_zh_12.bin',
+        './data/wudaocorpus_zh_13.bin',
+        './data/wudaocorpus_zh_14.bin',
+        './data/wudaocorpus_zh_15.bin',
+        './data/wudaocorpus_zh_16.bin',
+    ]
+    # .extend([f'./data/wudaocorpus_zh_{i}.bin' for i in range(max_idx)])
+
+    if os.path.exists('./data/pretrain_data.bin'):
+        print("Warning: ./data/pretrain_data.bin already exists; "
+              "new data will be appended to the end of the file.")

-    data_lst = []
     for data_path in tqdm(data_path_list):
+        data_lst = []
         with open(data_path, 'rb') as f:
             data = np.fromfile(f, dtype=np.uint16)
             data_lst.append(data)
-    arr = np.concatenate(data_lst)
-    print(arr.shape)
-    with open('./data/pretrain_data.bin', 'wb') as f:
-        f.write(arr.tobytes())
+        arr = np.concatenate(data_lst)
+        # print(arr.shape)
+        with open('./data/pretrain_data.bin', 'ab') as f:
+            f.write(arr.tobytes())
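
Note on verifying PATCH 1/3: the snippet below is a minimal smoke test, not part of the patches. It assumes the vocab file shipped at ./chatglm_tokenizer/tokenizer.model and transformers==4.38.1, as stated in the commit message. The usual cause of the reported AttributeError is that newer PreTrainedTokenizer.__init__ implementations touch vocabulary helpers (get_vocab / vocab_size) during construction, and in ChatGLMTokenizer those delegate to self.tokenizer, so the SPTokenizer must be assigned before super().__init__() runs — which is exactly what the reordering in the patch does.

    # Minimal smoke test (hypothetical helper script, e.g. check_tokenizer.py).
    from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer

    # Before PATCH 1/3 this constructor raised:
    #   AttributeError: 'ChatGLMTokenizer' object has no attribute 'tokenizer'
    tokenizer = ChatGLMTokenizer(vocab_file='./chatglm_tokenizer/tokenizer.model')

    # Same encoding convention used throughout data_process.py:
    # encode raw text, then terminate the document with the <eos> id.
    text_id = tokenizer.encode('测试文本', add_special_tokens=False)
    text_id.append(tokenizer.special_tokens['<eos>'])
    print(len(text_id), tokenizer.special_tokens['<eos>'])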