-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepare_turing_dataset.py
79 lines (65 loc) · 4.01 KB
/
prepare_turing_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pandas as pd
from pandarallel import pandarallel
from utils import process, extract_style
pandarallel.initialize()
# Label vocabulary of the dataset. The position of a label in this list is
# used as its integer class id (see the 'From' column computed below), so the
# order must not change.
authors = [
    'human',
    'ctrl',
    'fair_wmt19',
    'fair_wmt20',
    'gpt1',
    'gpt2_large',
    'gpt2_medium',
    'gpt2_pytorch',
    'gpt2_small',
    'gpt2_xl',
    'gpt3',
    'grover_base',
    'grover_large',
    'grover_mega',
    'pplm_distil',
    'pplm_gpt2',
    'transfo_xl',
    'xlm',
    'xlnet_base',
    'xlnet_large',
]

# Split files to process, in the order their rows are appended to the output.
splits = [
    'test',
    'train',
    'valid',
]
# for split_name in splits:
# df = pd.read_csv(f'turing_dataset/AA/{split_name}.csv')
# df['From'] = df['label'].apply(lambda x: authors.index(x))
# df['train'] = 1 if split_name == 'train' else 0
# df['author'] = df['label']
# df['content'] = df['Generation']
# df['content_tfidf'] = df['content'].apply(lambda x: process(x))
# df[["avg_len", "len_text", "len_words", "num_short_w", "per_digit", "per_cap", "f_a", "f_b", "f_c", "f_d",
# "f_e", "f_f", "f_g", "f_h", "f_i", "f_j", "f_k", "f_l", "f_m", "f_n", "f_o", "f_p", "f_q", "f_r", "f_s",
# "f_t", "f_u", "f_v", "f_w", "f_x", "f_y", "f_z", "f_0", "f_1", "f_2", "f_3", "f_4", "f_5", "f_6", "f_7",
# "f_8", "f_9", "f_e_0", "f_e_1", "f_e_2", "f_e_3", "f_e_4", "f_e_5", "f_e_6", "f_e_7", "f_e_8", "f_e_9",
# "f_e_10", "f_e_11", "richness"]] = df['content'].apply(lambda x: extract_style(x))
# df.to_csv(f'turing_dataset/AA/{split_name}_processed.csv')
# for split_name in splits:
# for author in authors:
# if author != 'human':
# df = pd.read_csv(f'turing_dataset/TT_{author}/{split_name}.csv')
# df['From'] = df['label'].apply(lambda x: authors.index(x) if authors.index(x) == 0 else 1)
# df['train'] = 1 if split_name == 'train' else 0
# df['author'] = df['label']
# df['content'] = df['Generation']
# df['content_tfidf'] = df['content'].apply(lambda x: process(x))
# df[["avg_len", "len_text", "len_words", "num_short_w", "per_digit", "per_cap", "f_a", "f_b", "f_c", "f_d",
# "f_e", "f_f", "f_g", "f_h", "f_i", "f_j", "f_k", "f_l", "f_m", "f_n", "f_o", "f_p", "f_q", "f_r", "f_s",
# "f_t", "f_u", "f_v", "f_w", "f_x", "f_y", "f_z", "f_0", "f_1", "f_2", "f_3", "f_4", "f_5", "f_6", "f_7",
# "f_8", "f_9", "f_e_0", "f_e_1", "f_e_2", "f_e_3", "f_e_4", "f_e_5", "f_e_6", "f_e_7", "f_e_8", "f_e_9",
# "f_e_10", "f_e_11", "richness"]] = df['content'].apply(lambda x: extract_style(x))
# df.to_csv(f'turing_dataset/TT_{author}/{split_name}_processed.csv')
# infile = open(f'turing_dataset/AA/{split_name}.csv', 'r')
# outfile = open(f'turing_dataset/AA/{split_name}_processed.csv', 'w')
# line = infile.readline()
# while line != '':
# Update 2022-02-08: keep all three splits in a single output file; the
# 'train' column encodes the split as train=1, test=0, valid=2.
# Code written to the 'train' column for each split name.
split_codes = {'train': 1, 'test': 0, 'valid': 2}

# Names of the feature columns filled from extract_style(), in output order:
# six summary statistics, one column per letter a-z, one per digit 0-9,
# twelve "f_e_*" columns and a final "richness" column (55 names in total).
style_columns = (
    ["avg_len", "len_text", "len_words", "num_short_w", "per_digit", "per_cap"]
    + [f"f_{letter}" for letter in "abcdefghijklmnopqrstuvwxyz"]
    + [f"f_{digit}" for digit in range(10)]
    + [f"f_e_{i}" for i in range(12)]
    + ["richness"]
)


def _prepare_split(split_name):
    """Load one TuringBench AA split and add the derived columns.

    Reads ``datasets/TuringBench/AA/<split_name>.csv`` (which must contain the
    ``label`` and ``Generation`` columns) and adds, in this order:

    * ``From``: index of the row's ``label`` in ``authors``;
    * ``train``: numeric split code taken from ``split_codes``;
    * ``author``: copy of ``label``;
    * ``content``: copy of ``Generation``;
    * ``content_tfidf``: ``process(content)``;
    * the ``style_columns``: values produced by ``extract_style(content)``.

    Args:
        split_name: one of the keys of ``split_codes``.

    Returns:
        The augmented DataFrame for that split.

    Raises:
        ValueError: if ``split_name`` has no split code, or if the file
            contains a label that is not listed in ``authors``.
    """
    if split_name not in split_codes:
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        raise ValueError(f'unknown split name: {split_name!r}')

    df = pd.read_csv(f'datasets/TuringBench/AA/{split_name}.csv')

    # Dict lookup replaces a linear ``authors.index`` scan per row.
    author_ids = {name: idx for idx, name in enumerate(authors)}
    unknown_labels = set(df['label']).difference(author_ids)
    if unknown_labels:
        # ``Series.map`` would silently yield NaN for these; fail loudly
        # instead, with the same exception type ``list.index`` would raise.
        raise ValueError(
            f'labels not in authors: {sorted(map(str, unknown_labels))}'
        )
    df['From'] = df['label'].map(author_ids)

    df['train'] = split_codes[split_name]
    df['author'] = df['label']
    df['content'] = df['Generation']
    df['content_tfidf'] = df['content'].parallel_apply(process)
    # NOTE(review): extract_style is expected to return one value per name in
    # style_columns, in the same order - confirm against utils.extract_style.
    df[style_columns] = df['content'].parallel_apply(extract_style)
    return df


# Process every split once and concatenate a single time; rows keep the order
# of ``splits`` and each split keeps its own original index values.
final_df = pd.concat([_prepare_split(split_name) for split_name in splits], axis=0)
final_df.to_csv('datasets/turing_ori_0208.csv')