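"""Main.py -- entry point of the latent aspect detection pipeline.

Loads and preprocesses a review dataset together with the SemEval 2014-2016
restaurant datasets, trains (or loads) one LDA topic model per POS view
('aspect', 'opinion', 'all'), and writes evaluation reports against the
Random and LocLDA baselines.
"""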
import gc
import os
import nltk
from nltk.corpus import wordnet_ic
from baselines.KMeans import AspectKMeans, akmeans_evaluation_functional
from baselines.LocLDA import loc_lda_evaluation_functional
from baselines.Random import random_evaluation_functional
from source.SemEval import read_sem_eval
from source.eval.Evaluation import report_pure
from source.eval.LatentAspectEvaluation import hidden_aspect_evaluation
from source.eval.OpinionatedEvaluation import report_opinionated, opinionated_pooling_layer, \
    opinionated_aspect_detection
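
# Download the NLTK resources used later in the pipeline (the English word list, WordNet, and the
# WordNet information-content corpus used for evaluation); this is a no-op if they are already present.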
nltk.download('words', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('wordnet_ic', quiet=True)
import gensim
import pickle5 as pickle
import logging
import argparse
import warnings
import numpy as np
import pandas as pd
from datetime import datetime
from termcolor import colored
from source.Logging import logger
from source import LDATopicModeling
from flair.models import SequenceTagger
from source.OpinionSpecification import correction
from source.LDATopicModeling import TopicModeling, elbow_method
from source.Preprocessing import preprocess, remove_stop_word, specific_stop_words, preprocess_in_place

gc.enable()
warnings.filterwarnings("ignore", category=DeprecationWarning)


def corpus_preparation(dictionary, series: pd.Series) -> list:
    """Turn a series of already-tokenized documents into a gensim bag-of-words corpus."""
    texts = [str(line).split() for line in series if pd.notna(line)]  # pd.notna() also catches NaNs that fail the `is np.nan` identity check
    corpus = [dictionary.doc2bow(text) for text in texts]
    return corpus
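

# Train a PXP topic model on one POS view of the corpus ('aspect', 'opinion', or 'all'),
# optionally tuning the number of topics first, and cache the fitted model under models/.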
def builder(args, train_series: pd.Series, postag: str) -> LDATopicModeling.TopicModeling:
    pxp_model = TopicModeling(train_series, bigram=True)  # bool(postag == 'opinion')
    print(colored("building the " + args.engine + " model.", 'cyan'))
    model_name = postag + '_' + args.path[args.path.find('/') + 1:args.path.find('.xlsx')]
    # ==================================================================================================================
    if args.tune:
        coherence_values = pxp_model.cross_validation(start=5, limit=60, step=5)
        print(coherence_values)
        optimal_topic = elbow_method(coherence_values, name=model_name, start=5, limit=60, step=5)
        print(colored("Our novel tuning system detected #" + str(optimal_topic) +
                      " as the optimal number of topics", 'cyan'))
        args.num_topics = optimal_topic
    # ==================================================================================================================
    pxp_model.topic_modeling(num_topics=args.num_topics, library=args.engine, alpha=args.alpha,
                             iterations=args.iterations)
    model_path = 'models/pxp_model_flair_' + model_name + '.pxp'
    with open(model_path, 'wb') as handle:
        pickle.dump(pxp_model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return pxp_model
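

# Read the SemEval 2014-2016 restaurant files, preprocess them per POS view, and cache the
# resulting frames so later runs can load them from sem_eval/preprocessed_data_frame.dict.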
def prep_sem_eval() -> dict:
    sem_eval_paths = {'201{}'.format(i): 'sem_eval/sem_eval201{}.txt'.format(i) for i in [4, 5, 6]}
    sem_eval_frame = {key: preprocess_in_place(read_sem_eval(sem_eval_paths[key]),
                                               column_name='caption',
                                               postags=['aspect', 'opinion', 'all', None])
                      for key in sem_eval_paths}
    with open('sem_eval/preprocessed_data_frame.dict', 'wb') as handle:
        pickle.dump(sem_eval_frame, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return sem_eval_frame
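

# End-to-end pipeline: load/preprocess the data, optionally correct the opinion phrases, merge the
# non-test SemEval years into the training set, build or load the per-POS LDA models, and write the
# evaluation reports (latent aspect, opinionated aspect, Random and LocLDA baselines) to reports/.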
def main(args):
    start_time = datetime.now()
    # SemEval data: reuse the cached preprocessed frames if they exist, otherwise build them.
    print(logger(datetime.now(), 'start preprocessing', 'sem_eval dataset ready soon'))
    if os.path.isfile('sem_eval/preprocessed_data_frame.dict'):
        with open('sem_eval/preprocessed_data_frame.dict', 'rb') as handle:
            sem_eval_dict = pickle.load(handle)
    else:
        sem_eval_dict = prep_sem_eval()
    print(logger(datetime.now(), 'end preprocessing', ''))
    dataset = pd.read_excel(args.path, index_col=0)
    nan_value = float("NaN")
    # ==================================================================================================================
    prep_n_seg_path = 'prep_and_seg_datasets/' + args.path[args.path.find('/') + 1:args.path.find('.xlsx')].replace('/', '')
    os.makedirs(prep_n_seg_path, exist_ok=True)
    if args.preprocess:
        flair_tagger = SequenceTagger.load("flair/pos-english-fast") if args.flair else None
        print(logger(datetime.now(), 'start preprocessing', 'pxp dataset takes several hours'))
        for postag in args.postag_list:
            print(logger(datetime.now(), postag, ''))
            if postag == 'all':
                continue
            dataset[postag + '_preprocessed'] = dataset['caption'].apply(preprocess, tagger=flair_tagger, postag=postag)
            specification = specific_stop_words(dataset, column=postag + '_preprocessed')
            dataset[postag + '_preprocessed'] = dataset[postag + '_preprocessed'] \
                .apply(remove_stop_word, specifications=specification)
            dataset.replace("", nan_value, inplace=True)
            dataset.replace(np.nan, nan_value, inplace=True)
            dataset.dropna(subset=[postag + '_preprocessed'], how='any', axis='index', inplace=True)
        # the 'all' view is simply the concatenation of the aspect and opinion views
        dataset['all_preprocessed'] = dataset['aspect_preprocessed'] + ' ' + dataset['opinion_preprocessed']
        dataset.to_excel(prep_n_seg_path + '/preprocessed.xlsx')
        print(logger(datetime.now(), 'end preprocessing', ''))
    if args.correction:
        dataset['opinion_preprocessed'] = correction(dataset, pretrained='labels/yelp_all_pretrained_dataset.xlsx')
        dataset.to_excel(prep_n_seg_path + '/preprocessed_corrected.xlsx')
    # hold out the chosen SemEval year for testing and fold the remaining years into the training set
    sem_eval_test_dataset = sem_eval_dict.pop(args.sem_eval_test)
    dataset = pd.concat([dataset] + list(sem_eval_dict.values())).fillna("sem_eval")
    # ==================================================================================================================
    # Build or load one LDA model per POS view; pre-trained Mallet models are converted to native
    # gensim LDA models for inference.
    lda_storage = {}
    for postag in args.postag_list:
        model_args = getattr(args, postag + '_model')
        print(logger(datetime.now(), '{} lda'.format(postag),
                     'using pre-trained model' if model_args is not None else 'building lda models'))
        if model_args is None:
            model = builder(args, dataset[postag + '_preprocessed'], postag=postag)
        else:
            with open(model_args, 'rb') as handle:
                model = pickle.load(handle)
            model.lda_model = gensim.models.wrappers.ldamallet. \
                malletmodel2ldamodel(model.lda_model, gamma_threshold=0.001, iterations=50)
        lda_storage[postag] = model
    # ==================================================================================================================
    # Evaluation: each report is an Excel file whose rows are the ranking metrics below.
    metrics = ['ndcg', 'recall_5', 'recip_rank', 'success_1', 'success_3', 'success_5', 'success_10', 'success_32']
    print(logger(datetime.now(), 'evaluate latent aspect',
                 'testing sem_eval{} restaurant dataset'.format(args.sem_eval_test)))
    pd.DataFrame(report_pure(sem_eval_test_dataset, evaluation_functional=hidden_aspect_evaluation,
                             model=lda_storage['aspect'], corpus_ic=wordnet_ic.ic('ic-brown.dat')),
                 index=metrics) \
        .to_excel('reports/report_pure_mrr_{}.xlsx'.format(args.sem_eval_test))
    print(logger(datetime.now(), 'evaluate opinionated aspect',
                 'testing sem_eval{} restaurant dataset'.format(args.sem_eval_test)))
    pd.DataFrame(report_opinionated(sem_eval_test_dataset, aspect_model=lda_storage['aspect'],
                                    opinion_model=lda_storage['opinion'],
                                    opinionated_layer_functional=opinionated_pooling_layer,
                                    corpus_ic=wordnet_ic.ic('ic-brown.dat'),
                                    train_set=dataset),
                 index=metrics) \
        .to_excel('reports/report_opinionated_pool_mrr_{}.xlsx'.format(args.sem_eval_test))
    # print(logger(datetime.now(), 'evaluate opinionated aspect',
    #              'testing sem_eval{} restaurant dataset'.format(args.sem_eval_test)))
    # pd.DataFrame(report_opinionated(sem_eval_test_dataset, aspect_model=lda_storage['aspect'],
    #                                 opinion_model=lda_storage['opinion'],
    #                                 opinionated_layer_functional=opinionated_aspect_detection,
    #                                 corpus_ic=wordnet_ic.ic('ic-brown.dat'),
    #                                 train_set=dataset, theta=0.1),
    #              index=metrics) \
    #     .to_excel('reports/report_opinionated_matrix_theta1_{}.xlsx'.format(args.sem_eval_test))
    print(logger(datetime.now(), 'evaluate baseline Random',
                 'testing sem_eval{} restaurant dataset'.format(args.sem_eval_test)))
    pd.DataFrame(report_pure(sem_eval_test_dataset, evaluation_functional=random_evaluation_functional),
                 index=metrics) \
        .to_excel('reports/report_random_mrr_{}.xlsx'.format(args.sem_eval_test))
    print(logger(datetime.now(), 'train/eval baseline LocLDA',
                 'testing sem_eval{} restaurant dataset'.format(args.sem_eval_test)))
    locLDA = TopicModeling(dataset.all_preprocessed, bigram=False)
    locLDA.topic_modeling(num_topics=32, library='mallet', iterations=args.iterations)
    locLDA.lda_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(locLDA.lda_model,
                                                                             gamma_threshold=0.001,
                                                                             iterations=50)
    pd.DataFrame(report_pure(sem_eval_test_dataset, evaluation_functional=loc_lda_evaluation_functional, model=locLDA),
                 index=metrics) \
        .to_excel('reports/report_locLDA_mrr_{}.xlsx'.format(args.sem_eval_test))
    print(colored(datetime.now() - start_time, 'cyan'))
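

# Command-line entry point: create the working directories, parse the CLI options, and run the
# pipeline. The set_defaults() call at the bottom pins the run configuration, so a default run
# skips preprocessing/correction and loads the pre-trained Canadian_Restaurant models.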
if __name__ == '__main__':
    os.makedirs("models", exist_ok=True)
    os.makedirs("picture", exist_ok=True)
    os.makedirs("logging", exist_ok=True)
    os.makedirs("results", exist_ok=True)
    os.makedirs("reports", exist_ok=True)
    os.makedirs("pipeline", exist_ok=True)
    os.makedirs("inference", exist_ok=True)
    os.makedirs("prep_and_seg_datasets", exist_ok=True)
    # ==================================================================================================================
    parser = argparse.ArgumentParser(description='Latent Aspect Detection.')
    # NOTE: argparse's type=bool treats any non-empty string as True, so the boolean options below
    # are effectively controlled through parser.set_defaults() at the end of this block.
    parser.add_argument('--path', dest='path', type=str, default='data/Canadian_Restaurant.xlsx',
                        help='Raw dataset file address.')
    parser.add_argument('--segment', dest='segment', type=bool, default=True,
                        help='Break every record into sentences.')
    parser.add_argument('--augment', dest='augment', type=int, default=None,
                        help='Augment the dataset to improve learning.')
    parser.add_argument('--engine', dest='engine', type=str, default='mallet',
                        help="Topic modeling engines supported by this implementation: mallet, gensim, HDP.")
    parser.add_argument('--preprocess', dest='preprocess', type=bool, default=True,
                        help="Whether to preprocess the raw documents at --path; if False, the file is assumed "
                             "to be preprocessed already.")
    parser.add_argument('--tune', dest='tune', type=bool, default=False,
                        help="Use 5 folds of the data to detect the best number of topics.")
    parser.add_argument('--num_topics', dest='num_topics', type=int, default=32,
                        help='User-defined number of topics.')
    parser.add_argument('--aspect_model', dest='aspect_model', type=str, default=None,
                        help="Path to a pxp lda model trained on the noun documents.")
    parser.add_argument('--opinion_model', dest='opinion_model', type=str, default=None,
                        help="Path to a pxp lda model trained on the adjective documents.")
    parser.add_argument('--all_model', dest='all_model', type=str, default=None,
                        help="Path to a pxp lda model trained on all phrases of the documents.")
    parser.add_argument('--inference', dest='inference', type=str, default=None,
                        help="Path to the inference dataset.")
    parser.add_argument('--correction', dest='correction', type=bool, default=True,
                        help="Use the yelp dataset to improve bigram detection.")
    parser.add_argument('--flair', dest='flair', type=bool, default=True,
                        help="Use the flair POS tagger in the preprocessing unit.")
    parser.add_argument('--postag_list', dest='postag_list', nargs='+', default=["aspect", "opinion", "all"],
                        help="List of extraction strategies, chosen from 'aspect', 'opinion', 'all'.")
    parser.add_argument('--iterations', dest='iterations', type=int, default=1000,
                        help="Mallet parameter.")
    parser.add_argument('--alpha', dest='alpha', type=float, default=0.1, help="Mallet parameter.")
    parser.add_argument('--labeling', dest='labeling', type=str, default=None,
                        help="Customizable labeling of the aspect-opinion table; provide the path to your labeling doc.")
    parser.add_argument('--sem_eval_test', dest='sem_eval_test', type=str, default='2016',
                        help="SemEval year to test our model on: choose between 2014, 2015, and 2016.")
    parser.set_defaults(augment=None, segment=False, tune=False, preprocess=False, flair=False, correction=False,
                        path='data/Canadian_Restaurant_preprocessed_corrected.xlsx',
                        aspect_model='models/pxp_model_flair_aspect_Canadian_Restaurant_preprocessed_corrected.pxp',
                        opinion_model='models/pxp_model_flair_opinion_Canadian_Restaurant_preprocessed_corrected.pxp',
                        all_model='models/pxp_model_flair_all_Canadian_Restaurant_preprocessed_corrected.pxp')
    arguments = parser.parse_args()
    with warnings.catch_warnings():
        logging.basicConfig(filename='logging/' + arguments.path[:arguments.path.find('.xlsx')].replace('/', '_') +
                                     str(arguments.sem_eval_test) + '_pxp_info.log',
                            format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        warnings.filterwarnings("ignore")
        print(logger(datetime.now(), "", 'PXP=TopicModeling'))
        main(args=arguments)