
Commit

ChanakaUOMIT committed May 22, 2021
2 parents 302cfea + 803b7c4 commit df804bf
Showing 9 changed files with 76 additions and 92 deletions.
28 changes: 11 additions & 17 deletions budgeted_submodular_maximization_multiprocessing.py
@@ -9,11 +9,6 @@
tixier_evaluation.csv
"""
import os
import sys
# path_to_root = '/data/gshang/acl2018_abssumm/'
path_to_root = '/root/project/text-summery/text_summerization/'
# os.chdir(path_to_root)
sys.path.append(path_to_root)
import time
import csv
import string
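This hunk drops the hard-coded path_to_root and the matching sys.path.append call in favour of paths relative to the working directory. A minimal sketch, not part of this commit, of resolving resources relative to the script's own location with pathlib, assuming the script sits at the repository root:

# Sketch (assumption, not the author's code): resolve resource paths relative
# to the script's own location instead of a hard-coded absolute root.
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
path_to_stopwords = PROJECT_ROOT / 'resources' / 'stopwords' / 'meeting' / 'stopwords.en.dat'

With this, the script can be launched from any directory and still locate resources/.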
@@ -83,7 +78,6 @@ def worker(worker_id, submodularity_param):
code = os.system('java -jar rouge2.0.jar > /dev/null')
if code != 0:
raise RuntimeError()
os.chdir(path_to_root)

# read results.csv
with open(path_to_results_csv_of_worker) as f:
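The removed os.chdir(path_to_root) means the worker previously changed directory around the ROUGE call. A hedged alternative sketch, not taken from this commit, that runs the jar through subprocess with an explicit cwd so each multiprocessing worker keeps its working directory untouched; the jar location below is an assumption:

# Assumed alternative: run the ROUGE jar from its own directory via subprocess
# with cwd=..., so the worker never has to chdir back afterwards.
import subprocess

result = subprocess.run(['java', '-jar', 'rouge2.0.jar'],
                        cwd='rouge2.0-distribution/',   # assumed jar location
                        stdout=subprocess.DEVNULL)
if result.returncode != 0:
    raise RuntimeError('ROUGE evaluation failed')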
@@ -122,7 +116,7 @@ def worker(worker_id, submodularity_param):
# ### RESOURCES LOADING ###
# #########################
if domain == 'meeting':
path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
path_to_stopwords = 'resources/stopwords/meeting/stopwords.' + language + '.dat'
stopwords = utils.load_stopwords(path_to_stopwords)

if dataset_id == 'ami':
@@ -135,7 +129,7 @@ def worker(worker_id, submodularity_param):
else meeting_lists.icsi_test_set

if language == 'en':
path_to_wv = path_to_root + 'resources/GoogleNews-vectors-negative300.bin.gz'
path_to_wv = 'resources/GoogleNews-vectors-negative300.bin.gz'

# Load Word2Vec (takes approx. 8G RAM)
print("loading GoogleNews...")
@@ -150,7 +144,7 @@ def worker(worker_id, submodularity_param):
# #############
# ### ROUGE ###
# #############
path_to_rouge = path_to_root + 'rouge2.0-distribution/'
path_to_rouge = 'rouge2.0-distribution/'

# clean existing system folder
if os.path.exists(path_to_rouge + 'test-summarization/system/'):
@@ -170,7 +164,7 @@ def worker(worker_id, submodularity_param):
# #####################################
# ### COMMUNITY CREATION PARAMETERS ###
# #####################################
path = path_to_root + 'data/' + dataset_id + '_params_create_community.csv'
path = 'data/' + dataset_id + '_params_create_community.csv'
with open(path) as f:
corpus_params_dict = {row['index']: {k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)}

@@ -181,7 +175,7 @@ def worker(worker_id, submodularity_param):
MSC_system_params_dict = {}

for system_name in system_name_list:
path = path_to_root + 'results/' + system_name + '_params_MSC_' + development_or_test + '.csv'
path = 'results/' + system_name + '_params_MSC_' + development_or_test + '.csv'
with open(path) as f:
MSC_system_params_dict[system_name] = {row['index']: {k: v for k, v in row.items()} for row in csv.DictReader(f, skipinitialspace=True)}

@@ -201,7 +195,7 @@ def worker(worker_id, submodularity_param):

# save indexed parameter grid
keys = list(submodularity_params[0])
with open(path_to_root + 'results/' + 'params_submodularity.csv', 'w') as output_file:
with open('results/' + 'params_submodularity.csv', 'w') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(submodularity_params)
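The hunk above writes the indexed parameter grid with csv.DictWriter, mirroring the csv.DictReader loads earlier in the file. A small standalone sketch of that write-then-read round trip; the file name and parameter values are illustrative only:

# Illustrative round trip: write the indexed grid, then read it back keyed by index.
import csv

params = [{'index': 0, 'lamda': 0.5}, {'index': 1, 'lamda': 1.0}]
with open('params_submodularity.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=list(params[0]))
    dict_writer.writeheader()
    dict_writer.writerows(params)

with open('params_submodularity.csv') as f:
    params_by_index = {row['index']: row for row in csv.DictReader(f, skipinitialspace=True)}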
@@ -210,7 +204,7 @@ def worker(worker_id, submodularity_param):
# ### EVALUATION CSV ###
# ######################
for system_name in system_name_list:
with open(path_to_root + system_name + '_evaluation.csv', "w") as f:
with open(system_name + '_evaluation.csv', "w") as f:
f.write('index_step1,index_step2,index_step3,overall_score')
for key in ['Avg_F-Score', 'Avg_Precision', 'Avg_Recall']:
for summary_size in summary_size_range:
@@ -261,7 +255,7 @@ def worker(worker_id, submodularity_param):
kmeans_clusters_dict_of_meeting = {}

for meeting_id in ids:
path = path_to_root + 'data/utterance/' + domain + '/' + dataset_id + '_' + str(corpus_id) + '/' +\
path = 'data/utterance/' + domain + '/manual/' + dataset_id + '_' + str(corpus_id) + '/' +\
meeting_id + '_utterances.txt'
with open(path, 'r+') as f:
utterances = f.read().splitlines()
@@ -285,7 +279,7 @@ def worker(worker_id, submodularity_param):
kmeans_clusters_dict_of_meeting[meeting_id] = kmeans_clusters_dict
# optimal_k_clusters(X, range(0, X.shape[0], 10)[1:], meeting_id, system_name[i])

path = path_to_root + 'results/' + domain + '/' + dataset_id + '_' + str(corpus_id) + '/' + development_or_test + '/' \
path = 'results/' + domain + '/' + dataset_id + '_' + str(corpus_id) + '/' + development_or_test + '/' \
+ system_name + '/' + str(MSC_param_id) + '/' + meeting_id + '_' + system_name + '.txt'
with open(path, 'r+') as f:
summary = f.read().splitlines()
@@ -329,7 +323,7 @@ def worker(worker_id, submodularity_param):
scores_of_submodularity_params.append(scores)

# ---- Output all ----
with open(path_to_root + system_name + '_evaluation.csv', "a") as f:
with open(system_name + '_evaluation.csv', "a") as f:
for submodularity_param_id, submodularity_param in enumerate(submodularity_params):
f.write(
str(corpus_id) + ',' +
@@ -356,7 +350,7 @@ def worker(worker_id, submodularity_param):
# index_of_MSC_param = MSC_param_id
# index_of_submodularity_param = index
#
# with open(path_to_root + system_name + '_evaluation.csv', "a") as f:
# with open(system_name + '_evaluation.csv', "a") as f:
# f.write(
# str(index_of_community_creation_param) + ',' +
# str(index_of_MSC_param) + ',' +
2 changes: 1 addition & 1 deletion clustering.py
@@ -85,7 +85,7 @@ def cluster_utterances(
# plt.savefig('singular values/' + str(meeting_id) + '.png')
# plt.clf()
#
# accumulative_singular_values_ratio = np.array(list(accumulate(singular_values))) // float(sum(np.array(singular_values)))
# accumulative_singular_values_ratio = np.array(list(accumulate(singular_values))) / float(sum(np.array(singular_values)))
# plt.bar(range(len(singular_values)), accumulative_singular_values_ratio)
# plt.xlabel('numbers of component')
# plt.ylabel('accumulative singular values')
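The only change in clustering.py swaps floor division for true division in the commented-out cumulative singular-value ratio. A toy sketch, with invented values, of what that ratio looks like with true division:

# Toy illustration: true division keeps the fractions, floor division zeroes them.
import numpy as np

singular_values = np.array([9.0, 4.0, 2.0, 1.0])
cumulative_ratio = np.cumsum(singular_values) / singular_values.sum()
# [0.5625, 0.8125, 0.9375, 1.0]; with // every entry except the last collapses to 0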
2 changes: 1 addition & 1 deletion language_model.py
@@ -64,7 +64,7 @@ def get_sentence_score(self, sentence, n=3, unknown_word_prob=1e-5, normalizatio
except KeyError:
score += unknown_word_prob
if normalization:
return score // len(n_grams)
return score / len(n_grams)
else:
return score
else:
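language_model.py gets the same fix: the per-n-gram normalization must use true division, otherwise the averaged score is floored to an integer. A tiny illustration with made-up numbers:

# Made-up score and n-grams; only the division operator differs.
score, n_grams = 2.7, ['a b c', 'b c d', 'c d e']
print(score / len(n_grams))    # ~0.9
print(score // len(n_grams))   # 0.0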
24 changes: 7 additions & 17 deletions multi_sentence_compression_multiprocessing.py
@@ -12,17 +12,7 @@
results/tixier_params_MSC_development.csv
"""
import os
# path_to_root = '/data/gshang/acl2018_abssumm/'
# os.chdir(path_to_root)
import sys
# path_to_root = '/data/gshang/acl2018_abssumm/'
# path_to_root = 'C:/Project/FYP/Summerizing/CoreRank/'
path_to_root = '/root/project/text-summery/text_summerization/'

# os.chdir(path_to_root)
sys.path.append(path_to_root)
import time
import string
import re
import gensim
import takahe
@@ -127,7 +117,7 @@ def worker(system_name, param):
# ######################
# ### OUTPUT SUMMARY ###
# ######################
output_path = path_to_root + 'results/' + domain + '/' + dataset_id + '_' + str(
output_path = 'results/' + domain + '/' + dataset_id + '_' + str(
corpus_id) + '/' + development_or_test + '/' + system_name + '/' + str(param_id) + '/'
if not os.path.exists(output_path):
os.makedirs(output_path)
@@ -161,7 +151,7 @@ def worker(system_name, param):
# ### RESOURCES LOADING ###
# #########################
if domain == 'meeting':
path_to_stopwords = path_to_root + 'resources/stopwords/meeting/stopwords.' + language + '.dat'
path_to_stopwords = 'resources/stopwords/meeting/stopwords.' + language + '.dat'
stopwords = utils.load_stopwords(path_to_stopwords)

if dataset_id == 'ami':
@@ -175,8 +165,8 @@ def worker(system_name, param):


if language == 'en':
path_to_wv = path_to_root + 'resources/GoogleNews-vectors-negative300.bin.gz'
path_to_lm = path_to_root + 'resources/en-70k-0.2.lm'
path_to_wv = 'resources/GoogleNews-vectors-negative300.bin.gz'
path_to_lm = 'resources/en-70k-0.2.lm'

# Load Word2Vec (takes approx. 8G RAM)
print("loading GoogleNews...")
@@ -239,7 +229,7 @@ def worker(system_name, param):
# save indexed parameter grid
import csv
keys = list(params_new[0])
with open(path_to_root + 'results/' + system_name + '_params_MSC_' + development_or_test + '.csv', 'w') as output_file:
with open('results/' + system_name + '_params_MSC_' + development_or_test + '.csv', 'w') as output_file:
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(params_new)
@@ -259,9 +249,9 @@ def worker(system_name, param):

print(str(corpus_id_range.index(corpus_id)) + '/' + str(len(corpus_id_range) - 1), "corpus:", dataset_id + '_' + str(corpus_id))
if domain == 'meeting':
path_to_tagged_corpus = path_to_root + 'data/community_tagged/meeting/' + dataset_id + '_' + str(corpus_id) + '/'
path_to_tagged_corpus = 'data/community_tagged/meeting/manual/' + dataset_id + '_' + str(corpus_id) + '/'
elif domain == 'document':
path_to_tagged_corpus = path_to_root + 'data/community_tagged/document/' + dataset_id + '_' + str(corpus_id) + '/'
path_to_tagged_corpus = 'data/community_tagged/document/' + dataset_id + '_' + str(corpus_id) + '/'

# #############################
# ### TAGGED CORPUS LOADING ###
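Besides dropping path_to_root, this file now builds its per-system output directory from relative parts (and the meeting corpus path gains a manual/ segment). A sketch, not taken from the commit, of assembling that directory with os.path.join and os.makedirs(..., exist_ok=True); the variable values below are illustrative:

# exist_ok=True replaces the explicit os.path.exists check and is safe when
# several multiprocessing workers create the same folder concurrently.
import os

domain, dataset_id, corpus_id = 'meeting', 'ami', 0
development_or_test, system_name, param_id = 'development', 'tixier', 3
output_path = os.path.join('results', domain,
                           dataset_id + '_' + str(corpus_id),
                           development_or_test, system_name, str(param_id))
os.makedirs(output_path, exist_ok=True)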
6 changes: 3 additions & 3 deletions submodularity.py
@@ -39,10 +39,10 @@ def concept_submodularity_objective(
if u in kmeans_clusters_dict:
idx = kmeans_clusters_dict[u]
cluster_counters[idx] = 1
diversity_score = np.sum(cluster_counters) // float(len(cluster_counters))
diversity_score = np.sum(cluster_counters) / float(len(cluster_counters))
else:
# percentage of unique concepts covered
diversity_score = len(list(set(concepts) & set(units))) // float(len(concepts))
diversity_score = len(list(set(concepts) & set(units))) / float(len(concepts))

my_score_final = sum_my_scores_coverage + lamda * diversity_score

@@ -137,7 +137,7 @@ def sentence_extraction_submodularity(
num = numerators_left[i] - concept_submodularity_objective_G
denom = cost_l ** scaling_factor
# print("num:", num, "cost_l:", cost_l)
ratios.append(round(num // denom, 4))
ratios.append(round(num / denom, 4))

# select unit associated with the max ratio

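Both hunks in submodularity.py restore true division so the diversity term stays a fraction between 0 and 1. A standalone toy version of the cluster-coverage variant, with invented data:

# Fraction of k-means clusters touched by the selected units (toy setup).
import numpy as np

kmeans_clusters_dict = {'u1': 0, 'u2': 0, 'u3': 2}      # unit -> cluster id
selected_units = ['u1', 'u3']
cluster_counters = np.zeros(4)                          # 4 clusters in this toy setup
for u in selected_units:
    if u in kmeans_clusters_dict:
        cluster_counters[kmeans_clusters_dict[u]] = 1
diversity_score = np.sum(cluster_counters) / float(len(cluster_counters))   # 0.5, not 0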
26 changes: 13 additions & 13 deletions takahe.py
@@ -1475,9 +1475,9 @@ def get_edge_weight(self, node1, node2):
weight2 = freq2

if self.twidf_coverage:
return (freq1 + freq2) // sum(diff)
return (freq1 + freq2) / sum(diff)
else:
return ( (freq1 + freq2) // sum(diff) ) // (weight1 * weight2)
return ( (freq1 + freq2) / sum(diff) ) / (weight1 * weight2)

def get_edge_weight_word_attract(self, node1, node2):
# Based on case sensitive form
@@ -1663,7 +1663,7 @@ def path_length_normalization_score(self, nbest_compressions):

# Loop over the compression candidates
for cummulative_score, path in nbest_compressions:
score = cummulative_score // len(path)
score = cummulative_score / len(path)
bisect.insort(reranked_compressions, (score, path))

return reranked_compressions
@@ -1704,9 +1704,9 @@ def noun_tfidf_coverage_score(self, nbest_compressions, normalization=True):
size += 1

if normalization:
all_scores.append(score // cumulative_score // size)
all_scores.append(score / cumulative_score / size)
else:
all_scores.append(score // cumulative_score)
all_scores.append(score / cumulative_score)
all_scores = np.array(all_scores)
return all_scores

@@ -1731,9 +1731,9 @@ def twidf_coverage_score(self, nbest_compressions, normalization=True):
size += 1

if normalization:
all_scores.append(score // cumulative_score // size)
all_scores.append(score / cumulative_score / size)
else:
all_scores.append(score // cumulative_score)
all_scores.append(score / cumulative_score)
all_scores = np.array(all_scores)
return all_scores

@@ -1755,7 +1755,7 @@ def diversity_score(self, nbest_compressions, normalization=True):
cluster_counters[idx] = 1

if normalization:
all_scores.append(np.sum(cluster_counters) // float(len(sentence)))
all_scores.append(np.sum(cluster_counters) / float(len(sentence)))
else:
all_scores.append(np.sum(cluster_counters))
all_scores = [i + 1e-5 for i in all_scores] # smooth
@@ -1803,31 +1803,31 @@ def final_score(self, nbest_compressions, n_results=20):
if self.tfidf_coverage:
fl_score = self.fluency_score(nbest_compressions, normalization=False)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // fl_score[i], compression[1])
nbest_compressions[i] = (compression[0] / fl_score[i], compression[1])

# tixier
if self.twidf_coverage:
fl_score = self.fluency_score(nbest_compressions, normalization=True)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // fl_score[i], compression[1])
nbest_compressions[i] = (compression[0] / fl_score[i], compression[1])

# mehdad
if self.tfidf_coverage:
tfidf_co_score = self.noun_tfidf_coverage_score(nbest_compressions, normalization=False)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // tfidf_co_score[i], compression[1])
nbest_compressions[i] = (compression[0] / tfidf_co_score[i], compression[1])

# tixier
if self.twidf_coverage:
twidf_co_score = self.twidf_coverage_score(nbest_compressions, normalization=True)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // twidf_co_score[i], compression[1])
nbest_compressions[i] = (compression[0] / twidf_co_score[i], compression[1])

# tixier
if self.diversity:
div_score = self.diversity_score(nbest_compressions, normalization=True)
for i, compression in enumerate(nbest_compressions):
nbest_compressions[i] = (compression[0] // div_score[i], compression[1])
nbest_compressions[i] = (compression[0] / div_score[i], compression[1])

sorted_by_score = sorted(nbest_compressions, key=lambda tup: tup[0])

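All of the takahe.py hunks replace // with / in score normalizations and re-rankings. A toy example of the path-length normalization pattern, with invented compressions:

# Average the cumulative path score over the path length with true division.
import bisect

nbest_compressions = [(6.0, ['the', 'cat', 'sat']), (5.0, ['a', 'cat'])]
reranked_compressions = []
for cumulative_score, path in nbest_compressions:
    bisect.insort(reranked_compressions, (cumulative_score / len(path), path))
# reranked_compressions == [(2.0, ['the', 'cat', 'sat']), (2.5, ['a', 'cat'])]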
6 changes: 3 additions & 3 deletions tf_idf.py
@@ -9,12 +9,12 @@ def cosine_similarity(vector1, vector2):
magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
if not magnitude:
return 0
return dot_product/magnitude
return dot_product//magnitude

def jaccard_similarity(query, document):
intersection = set(query).intersection(set(document))
union = set(query).union(set(document))
return len(intersection)/len(union)
return len(intersection)//len(union)

def term_frequency(term, tokenized_document):
return tokenized_document.count(term)
@@ -34,7 +34,7 @@ def sublinear_term_frequency(term, tokenized_document):

def augmented_term_frequency(term, tokenized_document):
max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))
return (0.5 + ((0.5 * term_frequency(term, tokenized_document))//max_count))

def inverse_document_frequencies(tokenized_documents):
idf_values = {}
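The tf_idf.py hunks toggle between / and // in the similarity and term-frequency helpers. As a reminder of what is at stake, a toy cosine-similarity computation, with invented integer vectors, showing how floor division discards the result:

# Toy check of the division semantics these hunks touch.
import math

v1, v2 = [1, 2, 3], [3, 2, 1]
dot_product = sum(a * b for a, b in zip(v1, v2))                          # 10
magnitude = math.sqrt(sum(a * a for a in v1)) * math.sqrt(sum(b * b for b in v2))
print(dot_product / magnitude)     # ~0.714
print(dot_product // magnitude)    # 0.0 -- floor division throws the similarity away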
4 changes: 2 additions & 2 deletions utterance_community_detection.py
@@ -232,7 +232,7 @@
for utterance in utterances_processed:
index, role, utt = utterance
words = utt.split(' ')
utt_scores.append(round(sum([core_rank_scores[word] for word in words]) // float(len(words)), 2))
utt_scores.append(round(sum([core_rank_scores[word] for word in words]) / float(len(words)), 2))

# remove communities with less than min_elt number of utterances
comm_labels = [k for k, v in c.items() if v >= min_elt]
@@ -243,7 +243,7 @@
for label in comm_labels:
# get the index of all the utterances belonging to the comm
utt_indexes = [idx for idx, value in enumerate(membership) if value == label]
comm_scores.append(round(sum([utt_scores[idx] for idx in utt_indexes]) // float(len(utt_indexes)), 2))
comm_scores.append(round(sum([utt_scores[idx] for idx in utt_indexes]) / float(len(utt_indexes)), 2))

# sort communities according to the average score of the utterances they contain
# get sorted index of elements of comm_scores
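utterance_community_detection.py applies the same fix to the average CoreRank score of an utterance and of a community. A toy version of the utterance-level average, with invented scores:

# True division keeps the fractional mean of the word scores.
core_rank_scores = {'remote': 3.2, 'control': 2.1, 'battery': 1.4}
utt = 'remote control battery'
words = utt.split(' ')
utt_score = round(sum(core_rank_scores[word] for word in words) / float(len(words)), 2)
# 2.23; with // the average would be floored to 2.0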

0 comments on commit df804bf
