-
Notifications
You must be signed in to change notification settings - Fork 1
/
read_data.py
178 lines (151 loc) · 6.96 KB
/
read_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import os
import glob
import json
import sys
import gzip
from helpers import create_folder_path, write_csv_file, write_list_to_txt, save_pk_file
def read_jsonl_file(file_name, columns, sort_value="id"):
    """
    Read a jsonl file and create a data frame of the required data
    file_name: the path of the file to read, one JSON object per non-blank line
    columns: a list of columns that need to be extracted,
    if columns is None, all data from the jsonl file will be converted to dataframe
    sort_value: the column that is used to sort the dataframe in ascending order, default value is "id"
    Return a dataframe of the required data, sorted by sort_value
    Raise json.JSONDecodeError when a non-blank line is not valid JSON and
    KeyError when sort_value or one of the requested columns is missing
    """
    with open(file_name, 'r', encoding='utf-8') as jfile:
        # Blank lines carry no record and would make json.loads fail, so skip them.
        records = [json.loads(line) for line in jfile if line.strip()]
    df = pd.DataFrame.from_records(records)
    sorted_df = df.sort_values(by=sort_value, ascending=True)
    if columns is None:
        return sorted_df
    return sorted_df[columns]
def prepare_json_data(inst_file, truth_file, inst_columns=None, truth_columns=None):
    """
    Load the instance file and the truth file of a corpus and join them.
    inst_file: the path to the instance file
    truth_file: the path to the truth file
    inst_columns: a list of columns to extract from the instance file, None (the default) keeps all of them
    truth_columns: a list of columns to extract from the truth file, None (the default) keeps all of them
    Return a dataframe joining the rows of both files that share the same 'id'
    """
    # Both column selections must keep 'id', because it is the join key.
    instances = read_jsonl_file(inst_file, inst_columns)
    truths = read_jsonl_file(truth_file, truth_columns)
    return instances.merge(truths, on='id')
def split_json_data(df, folder, column="truthScale"):
    """
    Split the data into one subset per label and write each subset to files
    df: the original dataframe, it must contain "targetTitle", "targetParagraphs" and the label column
    folder: the path to the folder that receives the new data files
    column: the name of the column that contains the labels
    For every distinct label two files are written through write_list_to_txt:
    {folder}/headline_{label} and {folder}/textbody_{label}
    Nothing is returned
    """
    for label in set(df[column]):
        subset = df[df[column] == label]
        # Extract both columns before writing anything, so a missing column fails early.
        titles = list(subset["targetTitle"])
        bodies = list(subset["targetParagraphs"])
        write_list_to_txt(titles, f'{folder}/headline_{label}')
        write_list_to_txt(bodies, f'{folder}/textbody_{label}')
def read_jsonl_folder(json_folder):
    """
    Read instances.jsonl and truth.jsonl from a folder, clean the data and store it
    json_folder: the path to the folder that contains the two files,
    the same folder is handed to write_csv_file and save_pk_file as the destination
    Return the value returned by save_pk_file (presumably the path of the pickle file)
    """
    instance_fields = ['id', "targetTitle", "targetParagraphs"]  # not extracted: 'postMedia', 'postText'
    truth_fields = ["id", "truthClass"]  # not extracted: "truthMode", "truthJudgments"
    instances_path = json_folder + "/instances.jsonl"
    truth_path = json_folder + "/truth.jsonl"
    frame = prepare_json_data(instances_path, truth_path, instance_fields, truth_fields)

    def clean_title(raw):
        # The title is stringified first; surrounding brackets and quotes are then stripped.
        return str(raw).strip("[").strip(']').strip("\'").strip('\"')

    def join_paragraphs(parts):
        # The paragraphs are joined into one space-separated text.
        return ' '.join(map(str, parts))

    def to_label(value):
        return "CB" if value == "clickbait" else "Non"

    frame["targetTitle"] = frame["targetTitle"].progress_map(clean_title)
    # Disabled transformations for the columns that are not extracted at the moment:
    # frame['postText'] = frame['postText'].progress_map(lambda x: ' '.join(map(str, x)))
    # frame['postMedia'] = frame['postMedia'].progress_map(lambda x: 0 if x == "[]" else 1)
    frame['targetParagraphs'] = frame['targetParagraphs'].progress_map(join_paragraphs)
    # frame["truthScale"] = frame["truthMode"].progress_map(lambda x: "non" if x == 0.0 else ("slightly" if 0.3<x<0.6 else ("considerable" if 0.6<x<1 else "heavy")))
    frame["truthClass"] = frame["truthClass"].progress_map(to_label)

    # Drop rows whose title contains one of these fixed strings.
    for marker in ("Sections Shows Live Yahoo!", "Top stories Top stories"):
        frame = frame[~frame.targetTitle.str.contains(marker)]

    write_csv_file(frame, json_folder)
    pk_file = save_pk_file(frame, json_folder)
    # split_json_data(frame, save_to)
    print(frame[:3])
    return pk_file
def gz_to_txt(gz_file, txt_file):
    """
    Decompress a gz file and store its content, decoded from bytes to utf8 text, in a txt file
    gz_file: the path to the gz file that is read
    txt_file: the path to the txt file that is written
    Print a statement that the file was created, nothing is returned
    """
    with gzip.open(gz_file, 'rb') as compressed:
        raw_bytes = compressed.read()
    with open(txt_file, "w", encoding="utf8") as target:
        target.write(str(raw_bytes, "utf-8"))
    print("File {} created".format(txt_file))
def read_txt(file_name):
    """
    Read a txt file and return a dataframe containing the data
    file_name: the path of the txt file, one headline per line
    Return a dataframe with the columns
    "targetTitle": every line of the file except the empty ones, without the line break
    "truthClass": "Non" when the base name of the file contains "non", otherwise "CB"
    """
    with open(file_name, "r", encoding="utf8") as infile:
        # A bare line break is an empty line, not a headline.
        lines = [line.strip("\n") for line in infile if line != "\n"]
    # Only the file's own name decides the label: a parent directory that
    # happens to contain "non" must not turn every file into a "Non" file.
    label = "Non" if "non" in os.path.basename(file_name) else "CB"
    df = pd.DataFrame()
    df["targetTitle"] = lines
    df["truthClass"] = label
    return df
def read_gz_folder(gz_folder):
    """
    Read the .gz files of a folder, convert each one to a .txt file next to it and store the combined data
    gz_folder: path to folder containing .gz files,
    the same folder is handed to write_csv_file and save_pk_file as the destination
    Return the value returned by save_pk_file (presumably the path of the pickle file)
    Raise ValueError when the folder contains no .gz file
    """
    gz_files = glob.glob(os.path.join(gz_folder, '*.gz'))
    if not gz_files:
        # pd.concat would fail with an opaque "No objects to concatenate" otherwise.
        raise ValueError(f"No .gz files found in {gz_folder}")
    df_list = []
    for read_file in tqdm(gz_files):
        # Swap the final extension only; str.replace would also rewrite a
        # ".gz" that occurs in a directory name or in the middle of the name.
        file_name = os.path.splitext(read_file)[0] + ".txt"
        gz_to_txt(read_file, file_name)
        df_list.append(read_txt(file_name))
    # NOTE(review): the index is not reset, so the row labels restart at 0 for every file.
    merged_df = pd.concat(df_list)
    write_csv_file(merged_df, gz_folder)
    pk_file = save_pk_file(merged_df, gz_folder)
    print(merged_df[:5])
    return pk_file
def read_data(read_folder, extention):
    """
    Read the data of a folder with the reader that matches the file type
    read_folder: the path to the folder that contains the data files
    extention: "json" for a folder with instances.jsonl and truth.jsonl, "gz" for a folder with .gz files
    Return the value returned by the selected reader,
    or None (after printing a notice) when the extention is not supported
    """
    if extention == "json":
        return read_jsonl_folder(read_folder)
    if extention == "gz":
        return read_gz_folder(read_folder)
    # Make the skipped folder visible instead of ignoring it silently.
    print(f"Unsupported extention {extention!r}: nothing read from {read_folder}")
    return None
if __name__ == "__main__":
    # Process every sub-folder of "Data": folders whose name contains
    # "clickbait17" are read as jsonl corpora, all other folders as gz corpora.
    directories = os.listdir("Data")
    print(directories)
    for directory in directories:
        folder = "Data/" + directory
        # os.listdir also returns plain files. Checking for a "." in the name
        # would process extension-less files and skip dotted folder names.
        if not os.path.isdir(folder):
            continue
        if "clickbait17" in directory:
            read_data(folder, "json")
        else:
            read_data(folder, "gz")