diff --git a/scripts/minicpm_tokenizer.py b/scripts/minicpm_tokenizer.py
new file mode 100644
index 0000000..a63e70e
--- /dev/null
+++ b/scripts/minicpm_tokenizer.py
@@ -0,0 +1,132 @@
+from transformers import AutoTokenizer
+from http.server import HTTPServer, BaseHTTPRequestHandler
+import json
+import argparse
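+
+# Minimal HTTP service that exposes the MiniCPM tokenizer to non-Python
+# clients. Endpoints (response shapes follow from the handlers below;
+# concrete ids depend on the tokenizer files shipped in this diff):
+#
+#   GET  /bos_id                         -> {"bos_id": <int>}   (-1 if undefined)
+#   GET  /eos_id                         -> {"eos_id": <int>}   (-1 if undefined)
+#   POST /encode  {"text": "..."}        -> {"token_ids": [...]}
+#   POST /decode  {"token_ids": [...]}   -> {"text": "..."}
+#
+# Run with:  python minicpm_tokenizer.py --host localhost --port 8080
+# (assumes the 'minicpm_tokenizer' directory added in this diff is in the
+# working directory, i.e. run from scripts/).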
+
+
+class TokenizerGLM3_Http:
+    # Name kept from an earlier ChatGLM3-based version of this script;
+    # it now serves the local MiniCPM tokenizer.
+
+    def __init__(self):
+        # Load the tokenizer files shipped alongside this script
+        # (scripts/minicpm_tokenizer/, added in this diff).
+        path = 'minicpm_tokenizer'
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+
+    def encode(self, prompt):
+        # Wrap the raw prompt as a single user turn, render it through the
+        # chat template, then tokenize the rendered string.
+        history = [{"role": "user", "content": prompt}]
+        history_str = self.tokenizer.apply_chat_template(
+            history, tokenize=False, add_generation_prompt=False)
+        print(history_str)
+        token_ids = self.tokenizer.encode(history_str)
+        return token_ids
+
+ def decode(self, token_ids):
+ return self.tokenizer.decode(token_ids, clean_up_tokenization_spaces=False)
+
+ @property
+ def bos_id(self):
+ return self.tokenizer.bos_token_id
+
+ @property
+ def eos_id(self):
+ return self.tokenizer.eos_token_id
+
+ @property
+ def bos_token(self):
+ return self.tokenizer.bos_token
+
+ @property
+ def eos_token(self):
+ return self.tokenizer.eos_token
+
+
+# Instantiate once at module load; the prints below are a quick smoke test.
+tokenizer = TokenizerGLM3_Http()
+
+print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id, tokenizer.eos_token)
+print(tokenizer.encode("hello world"))
+
+
+class Request(BaseHTTPRequestHandler):
+    # Handler subclass implementing the tokenizer HTTP API.
+    timeout = 5
+    server_version = 'Apache'
+
+    def do_GET(self):
+        print(self.path)
+        # Runs whenever a client sends a GET request to this server.
+        self.send_response(200)
+        self.send_header("type", "get")  # optional response header; several may be set
+        self.end_headers()
+
+        if self.path == '/bos_id':
+            bos_id = tokenizer.bos_id
+            # Fall back to -1 when the tokenizer defines no BOS token.
+            if bos_id is None:
+                msg = json.dumps({'bos_id': -1})
+            else:
+                msg = json.dumps({'bos_id': bos_id})
+        elif self.path == '/eos_id':
+            eos_id = tokenizer.eos_id
+            if eos_id is None:
+                msg = json.dumps({'eos_id': -1})
+            else:
+                msg = json.dumps({'eos_id': eos_id})
+        else:
+            msg = 'error'
+
+        print(msg)
+        msg = msg.encode()  # encode the response string as bytes
+
+        self.wfile.write(msg)  # send the bytes back to the client
+
+    def do_POST(self):
+        # Runs whenever a client sends a POST request to this server.
+        data = self.rfile.read(int(self.headers['content-length']))  # raw request body (bytes)
+        data = data.decode()  # bytes -> str
+
+        self.send_response(200)
+        self.send_header("type", "post")  # optional response header; several may be set
+        self.end_headers()
+
+ if self.path == '/encode':
+ req = json.loads(data)
+ prompt = req['text']
+ token_ids = tokenizer.encode(prompt)
+ if token_ids is None:
+ msg = json.dumps({'token_ids': -1})
+ else:
+ msg = json.dumps({'token_ids': token_ids})
+
+ elif self.path == '/decode':
+ req = json.loads(data)
+ token_ids = req['token_ids']
+ text = tokenizer.decode(token_ids)
+ if text is None:
+ msg = json.dumps({'text': ""})
+ else:
+ msg = json.dumps({'text': text})
+ else:
+ msg = 'error'
+        print(msg)
+        msg = msg.encode()  # encode the response string as bytes
+
+        self.wfile.write(msg)  # send the bytes back to the client
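+
+# Example calls (illustrative; the token ids shown are placeholders and
+# depend on the tokenizer files shipped alongside this script):
+#
+#   curl http://localhost:8080/bos_id
+#   -> {"bos_id": 1}
+#   curl -d '{"text": "hello world"}' http://localhost:8080/encode
+#   -> {"token_ids": [1, ...]}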
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--host', type=str, default='localhost')
+    parser.add_argument('--port', type=int, default=8080)
+    args = parser.parse_args()
+
+    host = (args.host, args.port)  # bind address; 'localhost' is equivalent to '127.0.0.1'
+    print('http://%s:%s' % host)
+    server = HTTPServer(host, Request)  # build the server from the address and the handler class
+    server.serve_forever()  # serve until interrupted
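+
+# Minimal Python client sketch (stdlib only; the URL assumes the default
+# host and port above):
+#
+#   import json, urllib.request
+#   body = json.dumps({"text": "hello world"}).encode()
+#   with urllib.request.urlopen("http://localhost:8080/encode", data=body) as resp:
+#       print(json.loads(resp.read()))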
diff --git a/scripts/minicpm_tokenizer/special_tokens_map.json b/scripts/minicpm_tokenizer/special_tokens_map.json
new file mode 100644
index 0000000..451134b
--- /dev/null
+++ b/scripts/minicpm_tokenizer/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/scripts/minicpm_tokenizer/tokenizer.json b/scripts/minicpm_tokenizer/tokenizer.json
new file mode 100644
index 0000000..209e947
--- /dev/null
+++ b/scripts/minicpm_tokenizer/tokenizer.json
@@ -0,0 +1,294435 @@
+{
+ "version": "1.0",
+ "truncation": null,
+ "padding": null,
+ "added_tokens": [
+ {
+ "id": 0,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 1,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 2,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ }
+ ],
+ "normalizer": {
+ "type": "Sequence",
+ "normalizers": [
+ {
+ "type": "Prepend",
+ "prepend": "▁"
+ },
+ {
+ "type": "Replace",
+ "pattern": {
+ "String": " "
+ },
+ "content": "▁"
+ }
+ ]
+ },
+ "pre_tokenizer": null,
+ "post_processor": {
+ "type": "TemplateProcessing",
+ "single": [
+ {
+ "SpecialToken": {
+ "id": "",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ }
+ ],
+ "pair": [
+ {
+ "SpecialToken": {
+ "id": "",
+ "type_id": 0
+ }
+ },
+ {
+ "Sequence": {
+ "id": "A",
+ "type_id": 0
+ }
+ },
+ {
+ "SpecialToken": {
+ "id": "",
+ "type_id": 1
+ }
+ },
+ {
+ "Sequence": {
+ "id": "B",
+ "type_id": 1
+ }
+ }
+ ],
+ "special_tokens": {
+ "": {
+ "id": "",
+ "ids": [
+ 1
+ ],
+ "tokens": [
+ ""
+ ]
+ }
+ }
+ },
+ "decoder": {
+ "type": "Sequence",
+ "decoders": [
+ {
+ "type": "Replace",
+ "pattern": {
+ "String": "▁"
+ },
+ "content": " "
+ },
+ {
+ "type": "ByteFallback"
+ },
+ {
+ "type": "Fuse"
+ },
+ {
+ "type": "Strip",
+ "content": " ",
+ "start": 1,
+ "stop": 0
+ }
+ ]
+ },
+ "model": {
+ "type": "BPE",
+ "dropout": null,
+ "unk_token": "",
+ "continuing_subword_prefix": null,
+ "end_of_word_suffix": null,
+ "fuse_unk": true,
+ "byte_fallback": true,
+ "vocab": {
+ "": 0,
+ "": 1,
+ "": 2,
+ "": 3,
+ "": 4,
+ "\n": 5,
+ "\t": 6,
+ "": 7,
+ "": 8,
+ "": 9,
+ "": 10,
+ "": 11,
+ "
": 12,
+ "": 13,
+ " | | ": 14,
+ "": 15,
+ "": 16,
+ "": 17,
+ "": 18,
+ "": 21,
+ "": 22,
+ "
": 23,
+ "": 24,
+ "": 25,
+ "": 26,
+ "": 27,
+ "": 28,
+ "": 29,
+ "": 30,
+ "": 31,
+ "": 32,
+ "
": 33,
+ "
": 34,
+ "
": 35,
+ "": 36,
+ "": 37,
+ "": 38,
+ "
": 39,
+ "": 40,
+ "": 41,
+ "
": 42,
+ "": 43,
+ "
": 44,
+ "
": 45,
+ "": 46,
+ "": 47,
+ "
": 48,
+ "": 49,
+ "": 50,
+ "": 51,
+ "0": 52,
+ "1": 53,
+ "2": 54,
+ "3": 55,
+ "4": 56,
+ "5": 57,
+ "6": 58,
+ "7": 59,
+ "8": 60,
+ "9": 61,
+ "+": 62,
+ "-": 63,
+ "=": 64,
+ ",": 65,
+ "。": 66,
+ "!": 67,
+ "?": 68,
+ "、": 69,
+ ":": 70,
+ "¥": 71,
+ ".": 72,
+ "!": 73,
+ "?": 74,
+ "...": 75,
+ "。。。": 76,
+ "。。。。。。": 77,
+ "《": 78,
+ "》": 79,
+ "【": 80,
+ "】": 81,
+ "『": 82,
+ "』": 83,
+ "```": 84,
+ "": 86,
+ "---": 87,
+ "": 88,
+ ";": 89,
+ ".": 90,
+ "=": 91,
+ "<": 92,
+ ">": 93,
+ "-": 94,
+ "+": 95,
+ "%": 96,
+ "‼": 97,
+ "㊣": 98,
+ "/": 99,
+ "|": 100,
+ "": 101,
+ "": 102,
+ "": 103,
+ "": 104,
+ "": 105,
+ "": 106,
+ "": 107,
+ "": 108,
+ "": 109,
+ "": 110,
+ "": 111,
+ "": 112,
+ "": 113,
+ "": 114,
+ "": 115,
+ "": 116,
+ "": 117,
+ "": 118,
+ "": 119,
+ "": 120,
+ "": 121,
+ "": 122,
+ "": 123,
+ "": 124,
+ "": 125,
+ "": 126,
+ "": 127,
+ "": 128,
+ "": 129,
+ "": 130,
+ "": 131,
+ "": 132,
+ "": 133,
+ "": 134,
+ "": 135,
+ "": 136,
+ "": 137,
+ "": 138,
+ "": 139,
+ "": 140,
+ "": 141,
+ "": 142,
+ "": 143,
+ "": 144,
+ "": 145,
+ "": 146,
+ "": 147,
+ "": 148,
+ "": 149,
+ "": 150,
+ "": 151,
+ "": 152,
+ "": 153,
+ "": 154,
+ "": 155,
+ "": 156,
+ "": 157,
+ "": 158,
+ "": 159,
+ "": 160,
+ "": 161,
+ "": 162,
+ "": 163,
+ "": 164,
+ "": 165,
+ "": 166,
+ "": 167,
+ "": 168,
+ "": 169,
+ "": 170,
+ "": 171,
+ "": 172,
+ "": 173,
+ "": 174,
+ "": 175,
+ "": 176,
+ "": 177,
+ "": 178,
+ "": 179,
+ "": 180,
+ "": 181,
+ "": 182,
+ "": 183,
+ "": 184,
+ "": 185,
+ "": 186,
+ "": 187,
+ "": 188,
+ "": 189,
+ "": 190,
+ "": 191,
+ "": 192,
+ "": 193,
+ "": 194,
+ "": 195,
+ "": 196,
+ "": 197,
+ "": 198,
+ "": 199,
+ "": 200,
+ "": 201,
+ "": 202,
+ "": 203,
+ "": 204,
+ "": 205,
+ "": 206,
+ "": 207,
+ "": 208,
+ "": 209,
+ "": 210,
+ "": 211,
+ "": 212,
+ "": 213,
+ "": 214,
+ "": 215,
+ "": 216,
+ "": 217,
+ "": 218,
+ "": 219,
+ "": 220,
+ "": 221,
+ "": 222,
+ "": 223,
+ "": 224,
+ "": 225,
+ "": 226,
+ "": 227,
+ "": 228,
+ "": 229,
+ "": 230,
+ "": 231,
+ "": 232,
+ "": 233,
+ "": 234,
+ "": 235,
+ "": 236,
+ "": 237,
+ "": 238,
+ "": 239,
+ "": 240,
+ "": 241,
+ "": 242,
+ "": 243,
+ "": 244,
+ "": 245,
+ "": 246,
+ "": 247,
+ "": 248,
+ "": 249,
+ "": 250,
+ "": 251,
+ "": 252,
+ "": 253,
+ "": 254,
+ "": 255,
+ "": 256,
+ "": 257,
+ "": 258,
+ "": 259,
+ "": 260,
+ "": 261,
+ "": 262,
+ "": 263,
+ "": 264,
+ "": 265,
+ "": 266,
+ "": 267,
+ "": 268,
+ "": 269,
+ "": 270,
+ "": 271,
+ "": 272,
+ "": 273,
+ "": 274,
+ "": 275,
+ "": 276,
+ "": 277,
+ "": 278,
+ "": 279,
+ "": 280,
+ "": 281,
+ "": 282,
+ "": 283,
+ "": 284,
+ "": 285,
+ "": 286,
+ "": 287,
+ "": 288,
+ "": 289,
+ "": 290,
+ "": 291,
+ "": 292,
+ "": 293,
+ "": 294,
+ "": 295,
+ "": 296,
+ "": 297,
+ "": 298,
+ "": 299,
+ "": 300,
+ "": 301,
+ "": 302,
+ "": 303,
+ "": 304,
+ "": 305,
+ "": 306,
+ "": 307,
+ "": 308,
+ "": 309,
+ "": 310,
+ "": 311,
+ "": 312,
+ "": 313,
+ "": 314,
+ "": 315,
+ "": 316,
+ "": 317,
+ "": 318,
+ "": 319,
+ "": 320,
+ "": 321,
+ "": 322,
+ "": 323,
+ "": 324,
+ "": 325,
+ "": 326,
+ "": 327,
+ "": 328,
+ "": 329,
+ "": 330,
+ "": 331,
+ "": 332,
+ "": 333,
+ "": 334,
+ "": 335,
+ "": 336,
+ "": 337,
+ "": 338,
+ "": 339,
+ "": 340,
+ "": 341,
+ "": 342,
+ "": 343,
+ "": 344,
+ "": 345,
+ "": 346,
+ "": 347,
+ "": 348,
+ "": 349,
+ "": 350,
+ "": 351,
+ "": 352,
+ "": 353,
+ "