srl_day_level_final.py
#!/usr/bin/env python
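"""Annotate one day-level scrape CSV with semantic role labeling (SRL).

The CSV given on the command line is read, each paragraph is split into
sentences, the sentences are run through a pre-trained SRL model via
relatio's run_srl wrapper, and the per-sentence results are regrouped by
paragraph and written out as a new "srl_results" column under the same
file name in the output directory.
"""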
from __future__ import annotations
import os
import sys
import pandas as pd
from relatio.utils import split_into_sentences
from relatio.wrappers import run_srl
# Directory containing the files to be annotated
path_to_data_sets = (
    "/cluster/work/lawecon/Projects/Ash_Galletta_Widmer/data/scrapes_clean"
)
list_of_files = sorted(os.listdir(path_to_data_sets))
path_to_save = "/cluster/work/lawecon/Projects/annot_data_Ash_Widmer/daily_data"
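# The file to annotate is passed as the first command-line argument, e.g.
#   python srl_day_level_final.py <path_to_data_sets>/2005-01-02.csv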
file_path = sys.argv[1]
file_name = os.path.basename(file_path)
print(file_path, file_name)
# To test on one particular file, uncomment:
# file_name = "2005-01-02.csv"
# file_path = path_to_data_sets + "/" + file_name

# Only .csv files are annotated
if file_name.endswith(".csv"):
    print(file_name)
    original_df = pd.read_csv(file_path)
    print(len(original_df.index))
    # Make a doc ID for the file from the date without dashes, e.g. 20050102
    document_ID = file_name.replace(".csv", "").replace("-", "")
    # One ID per row in the CSV: the doc ID plus the row number, so every
    # paragraph gets a unique "<date>-<row>" paragraph ID
    list_of_ID_for_df = [
        document_ID + "-" + str(i) for i in range(len(original_df.index))
    ]
    id_df = pd.DataFrame({"id": list_of_ID_for_df})
    doc_df = original_df["paragraph"]
    data = [id_df["id"], doc_df]
    headers = ["id", "doc"]
    # The new data frame has two columns, "id" and "doc" (the paragraph),
    # as required by split_into_sentences
    new_data_frame = pd.concat(data, axis=1, keys=headers)
    print(new_data_frame.head())
    new_data_frame.fillna("", inplace=True)
    split_sentences = split_into_sentences(new_data_frame, progress_bar=True)
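    # split_into_sentences returns two aligned lists: the paragraph ID of
    # each sentence (the "id" value, repeated once per sentence) and the
    # sentence text itself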
"""for i in range(5):
print("document id: ", split_sentences[0][i])
print("doc_in_sentences: ", split_sentences[1][i])
"""
print(len(split_sentences[0]), len(split_sentences[1]))
# break
    # One (initially empty) bucket of SRL results per paragraph
    srl_results_per_paragraph = [[] for _ in range(len(new_data_frame.index))]
    print(len(srl_results_per_paragraph))
    # Use the first GPU listed in CUDA_VISIBLE_DEVICES, or the CPU (-1) if
    # the variable is unset or empty
    cuda_str = os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")
    cuda_device = int(cuda_str[0]) if cuda_str[0] else -1
    print(f"Using CUDA:{cuda_device}")
    srl_res = run_srl(
        # pre-trained model
        path="https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz",
        sentences=split_sentences[1],
        cuda_device=cuda_device,
        progress_bar=False,
    )
    print(len(srl_res))
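    # srl_res is aligned with split_sentences[1]: one SRL output per sentence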
    # Regroup the per-sentence results by paragraph: the part of the
    # sentence ID after the dash is the paragraph's row number
    for number in range(len(split_sentences[1])):
        para_id = split_sentences[0][number].split("-")[1]
        position = int(para_id)
        srl_results_per_paragraph[position].append(srl_res[number])
    print(len(srl_results_per_paragraph))
    print(srl_results_per_paragraph[0])
    print()
    print(srl_results_per_paragraph[1])
    print()
    # Attach the grouped results as a new column and save the annotated CSV
    # under the original file name
    srl_dataframe = original_df
    srl_dataframe["srl_results"] = srl_results_per_paragraph
    file_name_to_save = os.path.join(path_to_save, file_name)
    srl_dataframe.to_csv(file_name_to_save, index=False)
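    # Note: to_csv stringifies the srl_results list column. A minimal sketch
    # for reading it back as Python objects, assuming the SRL outputs are
    # plain dicts/lists/strings (not guaranteed by this script):
    #   import ast
    #   df = pd.read_csv(file_name_to_save)
    #   df["srl_results"] = df["srl_results"].apply(ast.literal_eval)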