-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfixer.py
34 lines (25 loc) · 1.15 KB
/
fixer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import csv
# Script for cleaning a tab-separated ID/text dump: strips the "[]" wrapped
# around each ID and the '"' wrapped around each text field, then splits the
# text field (original and paraphrase joined by a tab) into two columns.
input_file = "MaCoCu-dataset-(all)2M+-filtered-06-ids.csv"
output_file = "MaCoCu-dataset-(all)2M+-filtered-06-with-ids.csv"


def _clean_row(row):
    """Return ``(line_num, original, paraphrase)`` for *row*, or ``None``.

    *row* is one record from the tab-delimited reader. ``None`` is returned
    when the record cannot be processed: it has fewer than two columns, or
    its text column does not contain a tab separating original from
    paraphrase.
    """
    if len(row) < 2:
        return None
    line_num = row[0].strip("[]")  # strip the brackets
    # Strip the quotes, then split once; only the first two pieces are used.
    parts = row[1].strip('"').split("\t")
    if len(parts) < 2:
        return None
    return line_num, parts[0].strip(), parts[1].strip()


skipped = 0
# newline="" on both files: the csv module needs it to handle newlines that
# are embedded inside quoted fields correctly.
with open(input_file, "r", newline="", encoding="utf-8") as f_in, \
        open(output_file, "w", newline="", encoding="utf-8") as f_out:
    reader = csv.reader(f_in, delimiter="\t")
    writer = csv.writer(f_out, delimiter="\t")
    writer.writerow(["ID", "Original", "Paraphrase[Translation]"])
    for row in reader:
        # Blank lines are returned as empty lists; skip them along with the
        # header row.
        if not row or row[0] == "ID":
            continue
        cleaned = _clean_row(row)
        if cleaned is None:
            # Report and skip instead of aborting the whole run with an
            # IndexError part-way through the file.
            skipped += 1
            print("Skipping malformed row at input line {}: {!r}".format(
                reader.line_num, row))
            continue
        # Write the processed row to the output CSV file
        writer.writerow(cleaned)
        # Print a status message every 10000 lines. A non-numeric ID only
        # suppresses the status message; the row itself is already written.
        line_num = cleaned[0]
        try:
            at_checkpoint = int(line_num) % 10000 == 0
        except ValueError:
            at_checkpoint = False
        if at_checkpoint:
            print("Processing line {}...".format(line_num))
if skipped:
    print("Skipped {} malformed row(s).".format(skipped))
print("Done!")