-
Notifications
You must be signed in to change notification settings - Fork 0
/
__main__.py
118 lines (91 loc) · 4.08 KB
/
__main__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
import re
import csv
import json
import argparse
# Module metadata: author, version string and license identifier.
__author__ = "Eryk Darnowski"
__version__ = "0.0.0"
__license__ = "MIT"
def is_sender(string):
    """Return True when *string* begins with a space character.

    ``main`` uses this to tell the two sides of the transcript apart: lines
    with a leading space become the "user" side, all other lines become the
    "assistant" side.

    An empty string is reported as False rather than raising ``IndexError``.
    """
    return string.startswith(" ")
def _clean_transcript(text):
    """Return the message lines of *text* with markers and noise lines removed.

    A leading ``{digits}`` marker (plus the whitespace character after it) is
    stripped from every line, then any line matching one of the noise
    patterns below is dropped entirely.
    """
    # Raw strings keep the regex escapes away from Python's own string-escape
    # handling (non-raw "\s" / "\/" / "\(" are invalid escape sequences).
    noise_patterns = [
        re.compile(pattern)
        for pattern in (
            # links
            r"http(s)?:\/\/",
            # forwarded-message notices
            r"(Przekazano wiadomość|przekazuje wiadomość)",
            # "message unsent" notices
            r"(C|c)ofn(ięto|ęła)\swys(y)?łanie\swiadomości",
            # "attachment unavailable" notices
            r"Załącznik niedostępnyTen załącznik mógł zostać usunięty lub",
            # reply / quoted-message headers
            r"(Odpowiedziałeś\(aś\)\s|Oryginalna wiadomość:|Użytkownik odpisał Ci|odpowiedział)",
            # empty or whitespace-only lines
            r"^(|\s+)$",
        )
    ]
    # count/flags are passed by keyword: positional use is deprecated.
    text = re.sub(r"^{[0-9]+}\s", "", text, count=0, flags=re.MULTILINE)
    return [
        line
        for line in text.splitlines()
        if not any(pattern.search(line) for pattern in noise_patterns)
    ]


def _group_exchanges(lines):
    """Group cleaned lines into ``(prompts, answers)`` exchanges.

    Sender lines (see ``is_sender``) are collected as prompts with their
    leading whitespace removed; the other lines that follow are collected as
    answers. An exchange ends when a sender line appears after at least one
    answer, or when the input ends. Answers that have no preceding prompt are
    discarded, and so is a trailing run of prompts that never got an answer,
    so every returned exchange has at least one prompt and one answer.
    """
    exchanges = []
    prompts, answers = [], []
    for line in lines:
        if is_sender(line):
            if answers:
                # A prompt after an answer opens the next exchange.
                exchanges.append((prompts, answers))
                prompts, answers = [], []
            prompts.append(line.lstrip())
        elif prompts:
            # Only keep answers once there is a prompt to answer.
            answers.append(line)
    if answers:
        # Flush the final exchange; the last line of the transcript is part
        # of it and must not be skipped.
        exchanges.append((prompts, answers))
    return exchanges


def main(argv=None):
    """Convert a chat transcript into a CSV or JSON Lines dataset.

    Command-line options:
        -i  input filename (default ``transcript.txt``)
        -o  output filename without extension (default ``output``)
        -f  output format, ``csv`` or ``json`` (default ``json``)

    The transcript is cleaned, split into prompt/answer exchanges and written
    to ``<output>.csv`` (columns ``system``, ``user``, ``agent``; multiple
    lines on one side are joined with newlines) or ``<output>.jsonl`` (one
    ``{"messages": [...]}`` object per line, one message per transcript line).

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the process command line.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("-i", default="transcript.txt", help="Choose input filename")
    parser.add_argument("-o", default="output", help="Choose output filename - no extension")
    parser.add_argument("-f", choices=["csv", "json"], default="json", help="Choose output format (csv/json)")
    args = parser.parse_args(argv)
    as_csv = args.f == "csv"

    # System message placed at the start of every exchange.
    system_message = {
        "role": "system",
        "content": "Jestem Marek, chat bot bazowany na wiadomościach z Messengera, który ma na celu imitować chat z tą osobą.",
    }

    # read the input file contents and perform cleanup
    with open(args.i, "r", encoding="utf-8") as input_file:
        lines = _clean_transcript(input_file.read())

    # split by sender / receiver
    exchanges = _group_exchanges(lines)

    # write output
    output_filename = args.o + (".csv" if as_csv else ".jsonl")
    # The csv module does its own newline handling, so the file must be
    # opened with newline="" (cells here contain embedded newlines).
    with open(
        output_filename,
        "w",
        encoding="utf-8",
        newline="" if as_csv else None,
    ) as output_file:
        if as_csv:
            writer = csv.writer(output_file)
            writer.writerow(["system", "user", "agent"])
            for prompts, answers in exchanges:
                writer.writerow(
                    [system_message["content"], "\n".join(prompts), "\n".join(answers)]
                )
        else:
            for prompts, answers in exchanges:
                messages = [system_message]
                messages += [{"role": "user", "content": prompt} for prompt in prompts]
                messages += [{"role": "assistant", "content": answer} for answer in answers]
                json.dump({"messages": messages}, output_file, ensure_ascii=False)
                output_file.write("\n")
# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()