# -*- coding: utf-8 -*-
#
# 'text_augmentation.py'
#
# Copyright (C) 2013 Jorge Couchet <jorge.couchet at gmail.com>
#
# This file is part of 'ted-scimu'
#
# 'ted-scimu' is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# 'ted-scimu' is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with 'ted-scimu'. If not, see <http://www.gnu.org/licenses/>.
#
import traceback
import time
import base
import csv
import gensim
from gensim import corpora, similarities, models
import dbpedia
import math
import urllib2
# Constant used to measure the closeness between two weights
PERCENTAGE_MAX_WEIGHT = 0.05
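
# As a worked example of the closeness rule used below (illustrative numbers
# only): with PERCENTAGE_MAX_WEIGHT = 0.05, a candidate weight of 0.41 against
# a current maximum of 0.40 differs by 2.5% of the maximum, so it counts as a
# tie and the annotation's 'support' score (then its similarity score) breaks
# it; a candidate of 0.45 exceeds 0.40 * 1.05 = 0.42 and wins outright.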

def get_dbpedia_annotations(text, tokens):
    """
    It returns for the given 'text' only one annotation from Dbpedia. The criterion used to choose the annotation
    is the token with the highest weight in the TF-IDF space among the tokens composing the annotation's
    'surface form'.
    PARAMETERS:
        1. text: The text used to query Dbpedia Spotlight for annotations
        2. tokens: A dict where each key is a token from the corpus and each value is the token's weight in the
           TF-IDF space for the corpus
    RETURNS:
        A tuple with the following format:
        (ANNOTATION_URI, ANNOTATION_SURFACE_FORM, ANNOTATION_ONTOLOGY_TYPES, SUPPORT_SCORE, SIMILARITY_SCORE)
    """
    res = None
    # Request the annotations from the Dbpedia Spotlight Web Service
    annotations = dbpedia.get_annotations(text)
    # Extract the information from the returned annotations
    anns = dbpedia.get_results(annotations)
    if len(anns) > 0:
        if len(anns) == 1:
            res = anns[0]
        else:
            max_weight = 0
            max_element = None
            # Loop over the returned annotations and select only one, to avoid adding too much information.
            # The idea is to keep the annotation built from the tokens most relevant to the document; for this
            # reason the script uses the results from the TF-IDF model stored in 'tokens'
            for i in range(0, len(anns)):
                # The current annotation
                ann = anns[i]
                # The 'surface form' returned by Dbpedia
                base_tokens = ann[1]
                # The 'surface form' from Dbpedia could be a combination of tokens
                all_tokens = base_tokens.split()
                # Iterate over the tokens from the 'surface form'
                for token in all_tokens:
                    # If the token is a key in the dict 'tokens', then process the token
                    if token in tokens:
                        # First matching token: it sets the initial maximum
                        if max_weight == 0:
                            max_weight = tokens[token]
                            max_element = ann
                        # If the new token's weight is clearly higher than the current maximum, then the new
                        # one wins
                        elif tokens[token] > max_weight * (1 + PERCENTAGE_MAX_WEIGHT):
                            max_weight = tokens[token]
                            max_element = ann
                        # If the new weight differs from the current maximum by less than PERCENTAGE_MAX_WEIGHT,
                        # then check the 'support' and 'similarity' scores
                        elif ((math.fabs(tokens[token] - max_weight)) / max_weight) <= PERCENTAGE_MAX_WEIGHT:
                            # Check the support; if it is bigger, then the new one wins
                            if ann[3] > max_element[3]:
                                max_weight = tokens[token]
                                max_element = ann
                            # If there is a tie on the support, then check the similarity score
                            elif ann[3] == max_element[3]:
                                # Check the similarity score; if it is bigger, then the new one wins
                                if ann[4] > max_element[4]:
                                    max_weight = tokens[token]
                                    max_element = ann
            res = max_element
    return res
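
# A minimal usage sketch for the function above (the token weights are
# hypothetical; it assumes the local 'dbpedia' helper module is importable and
# that a Dbpedia Spotlight endpoint is reachable):
#
#   tokens = {'museum': 0.42, 'steam': 0.31, 'engine': 0.27}
#   annotation = get_dbpedia_annotations('museum steam engine', tokens)
#   if annotation:
#       uri, surface_form, onto_types, support, similarity = annotation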

def augment_corpus_information(dictionary_name, mm_name, docs2corpus_name, tfidf_name, docs_augmented_name):
    """
    It returns the texts that will be used to augment the corpus documents' information, in order to try to get
    a better match in a similarity query. It is used when the documents from a corpus contain little information.
    PARAMETERS:
        1. dictionary_name: The name of the file with the generated dictionary for the corpus. The dictionary has
           the tokens used for the corpus representation in the VSM
        2. mm_name: The name of the file with the Matrix Market representation of the corpus in the VSM
        3. docs2corpus_name: The name of the file with the translations between the document names (a file line)
           and the document IDs in the corpus representation in the VSM (the position in the file)
        4. tfidf_name: The name of the file with the TF-IDF representation (transformation) of the corpus
        5. docs_augmented_name: The name of the CSV file where the information about the augmented documents is
           stored
    RETURNS:
        A dict where each key and value has the following format:
        {DOCUMENT_FILE_NAME: ANNOTATION_ABSTRACT}
        It also generates the CSV file 'docs_augmented_name' with the information about the augmented documents,
        with the following format:
        DOCUMENT_NAME, DBPEDIA_URI
    """
    res = {}
    total_augmentation = 0
    try:
        # Load the needed vector transformations for the Science Museum corpus
        docs2corpus = base.get_docs2corpus(docs2corpus_name)
        dictionary = corpora.Dictionary().load(dictionary_name + '.dict')
        mm = corpora.MmCorpus(mm_name + '.mm')
        tfidf = gensim.models.tfidfmodel.TfidfModel().load(tfidf_name + '.tfidf')
        docs_augmented = open(docs_augmented_name + '.csv', "w")
        # Loop over all the documents from the stored Science Museum corpus representation in the VSM
        for i in range(0, len(mm)):
            # Get the document by offset; this way the offset matches the document ID within the corpus
            # representation in the VSM
            doc = mm[i]
            # Get the Science Museum document's name. The offset in the 'docs2corpus' file also matches the
            # document's offset within the corpus representation in the VSM
            doc_file_name = docs2corpus[i]
            # Get the document's representation in the TF-IDF space
            vtf = tfidf[doc]
            # Sort the vector coordinates from high to low according to their TF-IDF weight
            vtf = sorted(vtf, key=lambda info: info[1], reverse=True)
            print "Going to add Dbpedia annotations for: %s" % doc_file_name
            text = ''
            tokens = {}
            # Get the tokens from the document, add each token's weight to the 'tokens' dict, and build 'text'
            # as the concatenation of all the tokens. This 'text' is used later to ask Dbpedia for annotations
            for tid, weight in vtf:
                if weight > 0:
                    # Get the token's string from the token's ID in the corpus
                    token = dictionary[tid]
                    tokens[token] = weight
                    if text == '':
                        text = token
                    else:
                        text = text + ' ' + token
            # If any, get only one annotation from Dbpedia
            annotation = get_dbpedia_annotations(text, tokens)
            if annotation:
                # Ask for the 'abstract' referenced by the Dbpedia annotation. This 'abstract' is the one used
                # to augment the document information, in order to try to get a better match with the TED talks
                abstract = dbpedia.get_abstract(annotation[0])
                if abstract is not None:
                    try:
                        abstract = abstract.encode('utf-8').strip()
                        res[doc_file_name] = abstract
                        total_augmentation = total_augmentation + 1
                    except:
                        pass
                    docs_augmented.write((', '.join([doc_file_name, annotation[0]])).encode('utf-8').strip() + '\n')
        docs_augmented.close()
        print "The total amount of augmentations is %s over a total of %s documents" % (total_augmentation, len(mm))
    except:
        try:
            docs_augmented.close()
        except:
            pass
        print "augment_corpus_information"
        print ""
        traceback.print_exc()
    return res
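

# A minimal driver sketch; the file name prefixes below are hypothetical, and
# the corpus artifacts ('.dict', '.mm', '.tfidf' and the docs2corpus file)
# must already exist on disk, produced by the corpus preprocessing step:
if __name__ == '__main__':
    augmented = augment_corpus_information('scimu', 'scimu', 'scimu_docs2corpus',
                                           'scimu', 'scimu_augmented')
    print "Augmented %s documents with Dbpedia abstracts" % len(augmented)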