-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathconvert_sharegpt.py
39 lines (32 loc) · 1.09 KB
/
convert_sharegpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import argparse
import jsonlines
import json
from tqdm import tqdm
import uuid
import random
# Input: JSON Lines file; each record is read as a list of turns carrying
# "role" and "content" keys.
in_file = "SystemChat_copy_filtered.jsonl"
# Output: JSON Lines file, overwritten on every run, receiving one converted
# {"conversations": [...]} object per line.
out_file = "SystemChat_sharegpt.jsonl"
def convert_sample(sample, id=None):
    """Convert one role/content conversation into a ShareGPT-style record.

    Args:
        sample: Iterable of turn mappings, each with a ``"role"`` entry and
            a string ``"content"`` entry.
        id: Unused; accepted only so existing callers that pass it keep
            working.

    Returns:
        ``{"conversations": [{"from": ..., "value": ...}, ...]}`` with one
        entry per turn, in order, or ``None`` when any turn's content is
        blank or contains the ``<<||END||>>`` marker.

    Raises:
        KeyError: If a turn lacks ``"role"`` or ``"content"``.
    """
    # "system" is kept and "user" becomes "human"; every other role is
    # written out as "gpt".
    speakers = {"system": "system", "user": "human"}

    conversations = []
    for turn in sample:
        role = turn["role"]
        content = turn["content"]
        # A single unusable turn rejects the whole sample, so check it
        # before building the entry.
        if not content.strip() or "<<||END||>>" in content:
            return None
        conversations.append(
            {"from": speakers.get(role, "gpt"), "value": content}
        )
    return {"conversations": conversations}
# Convert every conversation in ``in_file`` and write the surviving records
# to ``out_file`` as JSON Lines (one JSON object per line).
with open(out_file, "w", encoding="utf-8") as f:
    with jsonlines.open(in_file) as reader:
        for sample in tqdm(reader):
            # Drop a trailing user turn so the conversation does not end on
            # an unanswered prompt. The emptiness guard keeps a record with
            # no turns from raising IndexError on the [-1] lookup.
            if sample and sample[-1]["role"] == "user":
                sample.pop()
            if not sample:
                # No turns (left) to convert; an empty conversation is not
                # worth writing out.
                continue
            converted = convert_sample(sample)
            if converted is None:
                # Rejected by convert_sample; skip the whole record.
                continue
            json.dump(converted, f)
            f.write("\n")