
Commit

ChanakaUOMIT committed May 22, 2021
2 parents 302cfea + 803b7c4 commit df804bf
Showing 9 changed files with 76 additions and 92 deletions.
28 changes: 11 additions & 17 deletions budgeted_submodular_maximization_multiprocessing.py
@@ -9,11 +9,6 @@
tixier_evaluation.csv
"""
import os
import sys
# path_to_root = '/data/gshang/acl2018_abssumm/'
path_to_root = '/root/project/text-summery/text_summerization/'
# os.chdir(path_to_root)
sys.path.append(path_to_root)
import time
import csv
import string
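This hunk drops the hard-coded path_to_root and the matching sys.path.append call in favour of paths relative to the working directory. A minimal sketch, not part of this commit, of resolving resources relative to the script's own location with pathlib, assuming the script sits at the repository root:

# Sketch (assumption, not the author's code): resolve resource paths relative
# to the script's own location instead of a hard-coded absolute root.
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
path_to_stopwords = PROJECT_ROOT / 'resources' / 'stopwords' / 'meeting' / 'stopwords.en.dat'

With this, the script can be launched from any directory and still locate resources/.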
@@ -83,7 +78,6 @@ def worker(worker_id, submodularity_param):
code = os.system('java -jar rouge2.0.jar > /dev/null')
if code != 0:
raise RuntimeError()
os.chdir(path_to_root)

# read results.csv
with open(path_to_results_csv_of_worker) as f:
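The removed os.chdir(path_to_root) means the worker previously changed directory around the ROUGE call. A hedged alternative sketch, not taken from this commit, that runs the jar through subprocess with an explicit cwd so each multiprocessing worker keeps its working directory untouched; the jar location below is an assumption:

# Assumed alternative: run the ROUGE jar from its own directory via subprocess
# with cwd=..., so the worker never has to chdir back afterwards.
import subprocess

result = subprocess.run(['java', '-jar', 'rouge2.0.jar'],
                        cwd='rouge2.0-distribution/',   # assumed jar location
                        stdout=subprocess.DEVNULL)
if result.returncode != 0:
    raise RuntimeError('ROUGE evaluation failed')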
@@ -122,7 +116,7 @@ def worker(worker_id, submodularity_param):
# ### RESOURCES LOADING ###
# #########################
if domain == 'meeting':
path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
path_to_stopwords = 'resources/stopwords/meeting/stopwords.' + language + '.dat'
stopwords = utils.load_stopwords(path_to_stopwords)

if dataset_id == 'ami':
@@ -135,7 +129,7 @@ def worker(worker_id, submodularity_param):
else meeting_lists.icsi_test_set

if language == 'en':
path_to_wv = path_to_root + 'resources/GoogleNews-vectors-negative300.bin.gz'
path_to_wv = 'resources/GoogleNews-vectors-negative300.bin.gz'

# Load Word2Vec (takes approx. 8G RAM)
print("loading GoogleNews...")
@@ -150,7 +144,7 @@ def worker(worker_id, submodularity_param):
# #############
# ### ROUGE ###
# #############
path_to_rouge = path_to_root + 'rouge2.0-distribution/'
path_to_rouge = 'rouge2.0-distribution/'

# clean existing system folder
if os.path.exists(path_to_rouge + 'test-summarization/system/'):
@@ -170,7 +164,7 @@ def worker(worker_id, submodularity_param):
# #####################################
# ### COMMUNITY CREATION PARAMETERS ###
# #####################################
path = path_to_root + 'data/' + dataset_id + '_params_create_community.csv'
path = 'data/' + dataset_id + '_params_create_community.csv'
with open(path) as f:
corpus_params_dict = {row['index']: {k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)}

@@ -181,7 +175,7 @@ def worker(worker_id, submodularity_param):
MSC_system_params_dict = {}

for system_name in system_name_list:
path = path_to_root + 'results/' + system_name + '_params_MSC_' + development_or_test + '.csv'
path = 'results/' + system_name + '_params_MSC_' + development_or_test + '.csv'
with open(path) as f:
MSC_system_params_dict[system_name] = {row['index']: {k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)}

@@ -201,7 +195,7 @@ def worker(worker_id, submodularity_param):

# save indexed parameter grid
keys = list(submodularity_params[0])
with open(path_to_root + 'results/' + 'params_submodularity.csv', 'w') as output_file:
with open('results/' + 'params_submodularity.csv', 'w') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(submodularity_params)
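The hunk above writes the indexed parameter grid with csv.DictWriter, mirroring the csv.DictReader loads earlier in the file. A small standalone sketch of that write-then-read round trip; the file name and parameter values are illustrative only:

# Illustrative round trip: write the indexed grid, then read it back keyed by index.
import csv

params = [{'index': 0, 'lamda': 0.5}, {'index': 1, 'lamda': 1.0}]
with open('params_submodularity.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=list(params[0]))
    dict_writer.writeheader()
    dict_writer.writerows(params)

with open('params_submodularity.csv') as f:
    params_by_index = {row['index']: row for row in csv.DictReader(f, skipinitialspace=True)}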
@@ -210,7 +204,7 @@ def worker(worker_id, submodularity_param):
# ### EVALUATION CSV ###
# ######################
for system_name in system_name_list:
with open(path_to_root + system_name + '_evaluation.csv', "w") as f:
with open(system_name + '_evaluation.csv', "w") as f:
f.write('index_step1,index_step2,index_step3,overall_score')
for key in ['Avg_F-Score', 'Avg_Precision', 'Avg_Recall']:
for summary_size in summary_size_range:
@@ -261,7 +255,7 @@ def worker(worker_id, submodularity_param):
kmeans_clusters_dict_of_meeting = {}

for meeting_id in ids:
path = path_to_root + 'data/utterance/' + domain + '/' + dataset_id + '_' + str(corpus_id) + '/' +\
path = 'data/utterance/' + domain + '/manual/' + dataset_id + '_' + str(corpus_id) + '/' +\
meeting_id + '_utterances.txt'
with open(path, 'r+') as f:
utterances = f.read().splitlines()
@@ -285,7 +279,7 @@ def worker(worker_id, submodularity_param):
kmeans_clusters_dict_of_meeting[meeting_id] = kmeans_clusters_dict
# optimal_k_clusters(X, range(0, X.shape[0], 10)[1:], meeting_id, system_name[i])

path = path_to_root + 'results/' + domain + '/' + dataset_id + '_' + str(corpus_id) + '/' + development_or_test + '/' \
path = 'results/' + domain + '/' + dataset_id + '_' + str(corpus_id) + '/' + development_or_test + '/' \
+ system_name + '/' + str(MSC_param_id) + '/' + meeting_id + '_' + system_name + '.txt'
with open(path, 'r+') as f:
summary = f.read().splitlines()
@@ -329,7 +323,7 @@ def worker(worker_id, submodularity_param):
scores_of_submodularity_params.append(scores)

# ---- Output all ----
with open(path_to_root + system_name + '_evaluation.csv', "a") as f:
with open(system_name + '_evaluation.csv', "a") as f:
for submodularity_param_id, submodularity_param in enumerate(submodularity_params):
f.write(
str(corpus_id) + ',' +
@@ -356,7 +350,7 @@ def worker(worker_id, submodularity_param):
# index_of_MSC_param = MSC_param_id
# index_of_submodularity_param = index
#
# with open(path_to_root + system_name + '_evaluation.csv', "a") as f:
# with open(system_name + '_evaluation.csv', "a") as f:
# f.write(
# str(index_of_community_creation_param) + ',' +
# str(index_of_MSC_param) + ',' +
2 changes: 1 addition & 1 deletion clustering.py
@@ -85,7 +85,7 @@ def cluster_utterances(
# plt.savefig('singular values/' + str(meeting_id) + '.png')
# plt.clf()
#
# accumulative_singular_values_ratio = np.array(list(accumulate(singular_values))) // float(sum(np.array(singular_values)))
# accumulative_singular_values_ratio = np.array(list(accumulate(singular_values))) / float(sum(np.array(singular_values)))
# plt.bar(range(len(singular_values)), accumulative_singular_values_ratio)
# plt.xlabel('numbers of component')
# plt.ylabel('accumulative singular values')
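The only change in clustering.py swaps floor division for true division in the commented-out cumulative singular-value ratio. A toy sketch, with invented values, of what that ratio looks like with true division:

# Toy illustration: true division keeps the fractions, floor division zeroes them.
import numpy as np

singular_values = np.array([9.0, 4.0, 2.0, 1.0])
cumulative_ratio = np.cumsum(singular_values) / singular_values.sum()
# [0.5625, 0.8125, 0.9375, 1.0]; with // every entry except the last collapses to 0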
2 changes: 1 addition & 1 deletion language_model.py
@@ -64,7 +64,7 @@ def get_sentence_score(self, sentence, n=3, unknown_word_prob=1e-5, normalizatio
except KeyError:
score += unknown_word_prob
if normalization:
return score // len(n_grams)
return score / len(n_grams)
else:
return score
else:
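language_model.py gets the same fix: the per-n-gram normalization must use true division, otherwise the averaged score is floored to an integer. A tiny illustration with made-up numbers:

# Made-up score and n-grams; only the division operator differs.
score, n_grams = 2.7, ['a b c', 'b c d', 'c d e']
print(score / len(n_grams))    # ~0.9
print(score // len(n_grams))   # 0.0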
24 changes: 7 additions & 17 deletions multi_sentence_compression_multiprocessing.py
@@ -12,17 +12,7 @@
results/tixier_params_MSC_development.csv
"""
import os
# path_to_root = '/data/gshang/acl2018_abssumm/'
# os.chdir(path_to_root)
import sys
# path_to_root = '/data/gshang/acl2018_abssumm/'
# path_to_root = 'C:/Project/FYP/Summerizing/CoreRank/'
path_to_root = '/root/project/text-summery/text_summerization/'

# os.chdir(path_to_root)
sys.path.append(path_to_root)
import time
import string
import re
import gensim
import takahe
@@ -127,7 +117,7 @@ def worker(system_name, param):
# ######################
# ### OUTPUT SUMMARY ###
# ######################
output_path = path_to_root + 'results/' + domain + '/' + dataset_id + '_' + str(
output_path = 'results/' + domain + '/' + dataset_id + '_' + str(
corpus_id) + '/' + development_or_test + '/' + system_name + '/' + str(param_id) + '/'
if not os.path.exists(output_path):
os.makedirs(output_path)
@@ -161,7 +151,7 @@ def worker(system_name, param):
# ### RESOURCES LOADING ###
# #########################
if domain == 'meeting':
path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
path_to_stopwords = 'resources/stopwords/meeting/stopwords.' + language + '.dat'
stopwords = utils.load_stopwords(path_to_stopwords)

if dataset_id == 'ami':
@@ -175,8 +165,8 @@ def worker(system_name, param):


if language == 'en':
path_to_wv = path_to_root + 'resources/GoogleNews-vectors-negative300.bin.gz'
path_to_lm = path_to_root + 'resources/en-70k-0.2.lm'
path_to_wv = 'resources/GoogleNews-vectors-negative300.bin.gz'
path_to_lm = 'resources/en-70k-0.2.lm'

# Load Word2Vec (takes approx. 8G RAM)
print("loading GoogleNews...")
@@ -239,7 +229,7 @@ def worker(system_name, param):
# save indexed parameter grid
import csv
keys = list(params_new[0])
with open(path_to_root + 'results/' + system_name + '_params_MSC_' + development_or_test + '.csv', 'w') as output_file:
with open('results/' + system_name + '_params_MSC_' + development_or_test + '.csv', 'w') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(params_new)
@@ -259,9 +249,9 @@ def worker(system_name, param):

print(str(corpus_id_range.index(corpus_id)) + '/' + str(len(corpus_id_range) - 1), "corpus:", dataset_id + '_' + str(corpus_id))
if domain == 'meeting':
path_to_tagged_corpus = path_to_root + 'data/community_tagged/meeting/' + dataset_id + '_' + str(corpus_id) + '/'
path_to_tagged_corpus = 'data/community_tagged/meeting/manual/' + dataset_id + '_' + str(corpus_id) + '/'
elif domain == 'document':
path_to_tagged_corpus = path_to_root + 'data/community_tagged/document/' + dataset_id + '_' + str(corpus_id) + '/'
path_to_tagged_corpus = 'data/community_tagged/document/' + dataset_id + '_' + str(corpus_id) + '/'

# #############################
# ### TAGGED CORPUS LOADING ###
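Besides dropping path_to_root, this file now builds its per-system output directory from relative parts (and the meeting corpus path gains a manual/ segment). A sketch, not taken from the commit, of assembling that directory with os.path.join and os.makedirs(..., exist_ok=True); the variable values below are illustrative:

# exist_ok=True replaces the explicit os.path.exists check and is safe when
# several multiprocessing workers create the same folder concurrently.
import os

domain, dataset_id, corpus_id = 'meeting', 'ami', 0
development_or_test, system_name, param_id = 'development', 'tixier', 3
output_path = os.path.join('results', domain,
                           dataset_id + '_' + str(corpus_id),
                           development_or_test, system_name, str(param_id))
os.makedirs(output_path, exist_ok=True)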
6 changes: 3 additions & 3 deletions submodularity.py
@@ -39,10 +39,10 @@ def concept_submodularity_objective(
if u in kmeans_clusters_dict:
idx = kmeans_clusters_dict[u]
cluster_counters[idx] = 1
diversity_score = np.sum(cluster_counters) // float(len(cluster_counters))
diversity_score = np.sum(cluster_counters) / float(len(cluster_counters))
else:
# percentage of unique concepts covered
diversity_score = len(list(set(concepts) & set(units))) // float(len(concepts))
diversity_score = len(list(set(concepts) & set(units))) / float(len(concepts))

my_score_final = sum_my_scores_coverage + lamda * diversity_score

@@ -137,7 +137,7 @@ def sentence_extraction_submodularity(
num = numerators_left[i] - concept_submodularity_objective_G
denom = cost_l ** scaling_factor
# print("num:", num, "cost_l:", cost_l)
ratios.append(round(num // denom, 4))
ratios.append(round(num / denom, 4))

# select unit associated with the max ratio

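Both hunks in submodularity.py restore true division so the diversity term stays a fraction between 0 and 1. A standalone toy version of the cluster-coverage variant, with invented data:

# Fraction of k-means clusters touched by the selected units (toy setup).
import numpy as np

kmeans_clusters_dict = {'u1': 0, 'u2': 0, 'u3': 2}      # unit -> cluster id
selected_units = ['u1', 'u3']
cluster_counters = np.zeros(4)                          # 4 clusters in this toy setup
for u in selected_units:
    if u in kmeans_clusters_dict:
        cluster_counters[kmeans_clusters_dict[u]] = 1
diversity_score = np.sum(cluster_counters) / float(len(cluster_counters))   # 0.5, not 0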
26 changes: 13 additions & 13 deletions takahe.py
@@ -1475,9 +1475,9 @@ def get_edge_weight(self, node1, node2):
weight2 = freq2

if self.twidf_coverage:
return (freq1 + freq2) // sum(diff)
return (freq1 + freq2) / sum(diff)
else:
return ( (freq1 + freq2) // sum(diff) ) // (weight1 * weight2)
return ( (freq1 + freq2) / sum(diff) ) / (weight1 * weight2)

def get_edge_weight_word_attract(self, node1, node2):
# Based on case sensitive form
@@ -1663,7 +1663,7 @@ def path_length_normalization_score(self, nbest_compressions):

# Loop over the compression candidates
for cummulative_score, path in nbest_compressions:
score = cummulative_score // len(path)
score = cummulative_score / len(path)
bisect.insort(reranked_compressions, (score, path))

return reranked_compressions
@@ -1704,9 +1704,9 @@ def noun_tfidf_coverage_score(self, nbest_compressions, normalization=True):
size += 1

if normalization:
all_scores.append(score // cumulative_score // size)
all_scores.append(score / cumulative_score / size)
else:
all_scores.append(score // cumulative_score)
all_scores.append(score / cumulative_score)
all_scores = np.array(all_scores)
return all_scores

@@ -1731,9 +1731,9 @@ def twidf_coverage_score(self, nbest_compressions, normalization=True):
size += 1

if normalization:
all_scores.append(score // cumulative_score // size)
all_scores.append(score / cumulative_score / size)
else:
all_scores.append(score // cumulative_score)
all_scores.append(score / cumulative_score)
all_scores = np.array(all_scores)
return all_scores

@@ -1755,7 +1755,7 @@ def diversity_score(self, nbest_compressions, normalization=True):
cluster_counters[idx] = 1

if normalization:
all_scores.append(np.sum(cluster_counters) // float(len(sentence)))
all_scores.append(np.sum(cluster_counters) / float(len(sentence)))
else:
all_scores.append(np.sum(cluster_counters))
all_scores = [i + 1e-5 for i in all_scores] # smooth
@@ -1803,31 +1803,31 @@ def final_score(self, nbest_compressions, n_results=20):
if self.tfidf_coverage:
fl_score = self.fluency_score(nbest_compressions, normalization=False)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // fl_score[i], compression[1])
nbest_compressions[i] = (compression[0] / fl_score[i], compression[1])

# tixier
if self.twidf_coverage:
fl_score = self.fluency_score(nbest_compressions, normalization=True)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // fl_score[i], compression[1])
nbest_compressions[i] = (compression[0] / fl_score[i], compression[1])

# mehdad
if self.tfidf_coverage:
tfidf_co_score = self.noun_tfidf_coverage_score(nbest_compressions, normalization=False)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // tfidf_co_score[i], compression[1])
nbest_compressions[i] = (compression[0] / tfidf_co_score[i], compression[1])

# tixier
if self.twidf_coverage:
twidf_co_score = self.twidf_coverage_score(nbest_compressions, normalization=True)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // twidf_co_score[i], compression[1])
nbest_compressions[i] = (compression[0] / twidf_co_score[i], compression[1])

# tixier
if self.diversity:
div_score = self.diversity_score(nbest_compressions, normalization=True)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // div_score[i], compression[1])
nbest_compressions[i] = (compression[0] / div_score[i], compression[1])

sorted_by_score = sorted(nbest_compressions, key=lambda tup: tup[0])

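All of the takahe.py hunks replace // with / in score normalizations and re-rankings. A toy example of the path-length normalization pattern, with invented compressions:

# Average the cumulative path score over the path length with true division.
import bisect

nbest_compressions = [(6.0, ['the', 'cat', 'sat']), (5.0, ['a', 'cat'])]
reranked_compressions = []
for cumulative_score, path in nbest_compressions:
    bisect.insort(reranked_compressions, (cumulative_score / len(path), path))
# reranked_compressions == [(2.0, ['the', 'cat', 'sat']), (2.5, ['a', 'cat'])]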
6 changes: 3 additions & 3 deletions tf_idf.py
@@ -9,12 +9,12 @@ def cosine_similarity(vector1, vector2):
magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
if not magnitude:
return 0
return dot_product/magnitude
return dot_product//magnitude

def jaccard_similarity(query, document):
intersection = set(query).intersection(set(document))
union = set(query).union(set(document))
return len(intersection)/len(union)
return len(intersection)//len(union)

def term_frequency(term, tokenized_document):
return tokenized_document.count(term)
@@ -34,7 +34,7 @@ def sublinear_term_frequency(term, tokenized_document):

def augmented_term_frequency(term, tokenized_document):
max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))
return (0.5 + ((0.5 * term_frequency(term, tokenized_document))//max_count))

def inverse_document_frequencies(tokenized_documents):
idf_values = {}
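The tf_idf.py hunks toggle between / and // in the similarity and term-frequency helpers. As a reminder of what is at stake, a toy cosine-similarity computation, with invented integer vectors, showing how floor division discards the result:

# Toy check of the division semantics these hunks touch.
import math

v1, v2 = [1, 2, 3], [3, 2, 1]
dot_product = sum(a * b for a, b in zip(v1, v2))                          # 10
magnitude = math.sqrt(sum(a * a for a in v1)) * math.sqrt(sum(b * b for b in v2))
print(dot_product / magnitude)     # ~0.714
print(dot_product // magnitude)    # 0.0 -- floor division throws the similarity away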
4 changes: 2 additions & 2 deletions utterance_community_detection.py
@@ -232,7 +232,7 @@
for utterance in utterances_processed:
index, role, utt = utterance
words = utt.split(' ')
utt_scores.append(round(sum([core_rank_scores[word] for word in words]) // float(len(words)), 2))
utt_scores.append(round(sum([core_rank_scores[word] for word in words]) / float(len(words)), 2))

# remove communities with less than min_elt number of utterances
comm_labels = [k for k, v in c.items() if v >= min_elt]
@@ -243,7 +243,7 @@
for label in comm_labels:
# get the index of all the utterances belonging to the comm
utt_indexes = [idx for idx, value in enumerate(membership) if value == label]
comm_scores.append(round(sum([utt_scores[idx] for idx in utt_indexes]) // float(len(utt_indexes)), 2))
comm_scores.append(round(sum([utt_scores[idx] for idx in utt_indexes]) / float(len(utt_indexes)), 2))

# sort communities according to the average score of the utterances they contain
# get sorted index of elements of comm_scores
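utterance_community_detection.py applies the same fix to the average CoreRank score of an utterance and of a community. A toy version of the utterance-level average, with invented scores:

# True division keeps the fractional mean of the word scores.
core_rank_scores = {'remote': 3.2, 'control': 2.1, 'battery': 1.4}
utt = 'remote control battery'
words = utt.split(' ')
utt_score = round(sum(core_rank_scores[word] for word in words) / float(len(words)), 2)
# 2.23; with // the average would be floored to 2.0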

0 comments on commit df804bf
