# preprocess_hdn.py
from collections import defaultdict
import random

from nltk.corpus import wordnet as wn
from tqdm import tqdm

from version import version


def synset2identifier(synset, wn_version):
    """
    Return the synset identifier of an
    nltk.corpus.reader.wordnet.Synset instance.

    :param nltk.corpus.reader.wordnet.Synset synset: a WordNet synset
    :param str wn_version: supported: '171' | '21' | '30'

    :rtype: str
    :return: eng-VERSION-OFFSET-POS (n | v | r | a),
    e.g. 'eng-30-00001740-n' for Synset('entity.n.01')
    """
    offset = str(synset.offset())
    offset_8_char = offset.zfill(8)

    pos = synset.pos()
    if pos in ('s', 'j'):  # normalize (satellite) adjectives to 'a'
        pos = 'a'

    identifier = 'eng-{wn_version}-{offset_8_char}-{pos}'.format_map(locals())
    return identifier
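
# A quick sanity check of the identifier format (assumes NLTK's bundled
# WordNet 3.0 corpus; 'entity.n.01' has offset 1740 there):
#
#     >>> synset2identifier(wn.synset('entity.n.01'), '30')
#     'eng-30-00001740-n'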


def synsets_graph_info(wn_instance, wn_version, lemma, pos):
    """
    For every synset of the given lemma, extract the hypernym directly
    under the lowest common subsumer (LCS) that the synset shares with
    the lemma's other synsets, plus the path from that node down to the
    synset itself.

    :param nltk.corpus.reader.wordnet.WordNetCorpusReader wn_instance: instance
    of nltk.corpus.reader.wordnet.WordNetCorpusReader
    :param str wn_version: supported: '171' | '21' | '30'
    :param str lemma: a lemma
    :param str pos: a pos

    :rtype: dict
    :return: mapping synset_id
        -> 'under_lcs' -> under_lcs identifier
        -> 'path_to_under_lcs' -> [sy1_iden, sy2_iden, sy3_iden, ...]
    """
    sy_id2under_lcs_info = dict()
    synsets = set(wn_instance.synsets(lemma, pos=pos))

    # monosemous lemma: there is no competing sense to disambiguate against
    if len(synsets) == 1:
        sy_obj = synsets.pop()
        target_sy_iden = synset2identifier(sy_obj, wn_version)
        sy_id2under_lcs_info[target_sy_iden] = {'under_lcs': None,
                                                'under_lcs_obj': None,
                                                'sy_obj': sy_obj,
                                                'path_to_under_lcs': []}
        return sy_id2under_lcs_info

    for sy1 in synsets:
        target_sy_iden = synset2identifier(sy1, wn_version)

        # find the lowest common subsumer (LCS) closest to sy1 among the
        # LCSs that sy1 shares with the lemma's other synsets
        min_path_distance = float('inf')
        closest_lcs = None
        for sy2 in synsets:
            if sy1 == sy2:
                continue
            try:
                lcs = sy1.lowest_common_hypernyms(sy2, simulate_root=True)[0]
            except Exception:
                print('wordnet error', sy1, sy2)
                continue
            path_distance = sy1.shortest_path_distance(lcs, simulate_root=True)
            if path_distance is not None and path_distance < min_path_distance:
                closest_lcs = lcs
                min_path_distance = path_distance

        # on each hypernym path of sy1, the node directly below the closest
        # LCS is the disambiguating hypernym; also record the intermediate
        # path between that node and sy1 (both endpoints excluded)
        for hypernym_path in sy1.hypernym_paths():
            for first, second in zip(hypernym_path, hypernym_path[1:]):
                if first == closest_lcs:
                    under_lcs = second
                    index_under_lcs = hypernym_path.index(under_lcs)
                    path_to_under_lcs = hypernym_path[index_under_lcs + 1:-1]
                    under_lcs_iden = synset2identifier(under_lcs, wn_version)
                    path_to_under_lcs_idens = [synset2identifier(synset, wn_version)
                                               for synset in path_to_under_lcs]
                    sy_id2under_lcs_info[target_sy_iden] = {'under_lcs': under_lcs_iden,
                                                            'under_lcs_obj': under_lcs,
                                                            'sy_obj': sy1,
                                                            'path_to_under_lcs': path_to_under_lcs_idens}

    return sy_id2under_lcs_info
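
# Illustrative shape of the mapping returned for a polysemous lemma
# (offsets and synset names below are schematic placeholders, not real
# WordNet data):
#
#     {'eng-30-XXXXXXXX-n': {'under_lcs': 'eng-30-YYYYYYYY-n',
#                            'under_lcs_obj': Synset('...'),
#                            'sy_obj': Synset('...'),
#                            'path_to_under_lcs': ['eng-30-ZZZZZZZZ-n', ...]},
#      ...}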


if __name__ == '__main__':
    gigaword_path = 'preprocessed-data/gigaword.txt'
    output_path = 'output/gigaword-hdn-training.%s.txt' % version

    id2synset = {synset2identifier(s, '30'): s for s in wn.all_synsets()}
    all_noun_lemmas = {lemma.name()
                       for synset in wn.all_synsets('n')
                       for lemma in synset.lemmas()}

    # For every noun lemma, collect its HDN list (one HDN per sense);
    # map each HDN to all HDN lists it occurs in, and each HDN list
    # back to the lemmas that produced it.
    all_hdns = defaultdict(set)
    hdn_list2lemma = defaultdict(list)
    for lemma in all_noun_lemmas:
        graph_info = synsets_graph_info(wn_instance=wn,
                                        wn_version='30',
                                        lemma=lemma,
                                        pos='n')
        hdns = tuple(info['under_lcs']
                     for info in graph_info.values()
                     if info['under_lcs'])
        hdn_list2lemma[hdns].append(lemma)
        for hdn in hdns:
            all_hdns[hdn].add(hdns)
    for hdn in all_hdns:
        all_hdns[hdn] = list(all_hdns[hdn])  # random.choice needs a sequence
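
    # Schematic example of the two tables (placeholder identifiers, not
    # actual WordNet offsets): a lemma with two senses contributes one
    # HDN list, and each HDN in that list points back to it:
    #
    #     hdn_list2lemma[('eng-30-AAAAAAAA-n', 'eng-30-BBBBBBBB-n')] == ['some_lemma']
    #     all_hdns['eng-30-AAAAAAAA-n'] == [('eng-30-AAAAAAAA-n', 'eng-30-BBBBBBBB-n'), ...]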

    # Getting monosemous words
    monosemous_noun2related_hdns = defaultdict(list)
    for lemma in all_noun_lemmas:
        synsets = wn.synsets(lemma=lemma, pos='n')
        if len(synsets) == 1:  # monosemous
            for s in synsets:
                # only the first hypernym path is considered
                paths = s.hypernym_paths()
                for h in paths[0]:
                    id_ = synset2identifier(h, '30')
                    if id_ in all_hdns:
                        monosemous_noun2related_hdns[lemma].append(id_)
    # WordNet joins the tokens of a multi-word expression with underscores
    print('Monosemous multi-word-expressions:',
          len([m for m in monosemous_noun2related_hdns if '_' in m]))
    print('All monosemous words:', len(monosemous_noun2related_hdns))
    print('All lemmas:', len(all_noun_lemmas))
    print('Proportion monosemous/all:',
          len(monosemous_noun2related_hdns) / len(all_noun_lemmas))
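
    # Schematic entry (placeholder identifiers): a monosemous noun maps to
    # every HDN on its first hypernym path that disambiguates some
    # polysemous lemma:
    #
    #     monosemous_noun2related_hdns['some_monosemous_noun'] == \
    #         ['eng-30-AAAAAAAA-n', 'eng-30-CCCCCCCC-n']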

    # Find all monosemous words in Gigaword
    found_monosemous = []
    with open(gigaword_path) as f:
        # the hard-coded total (presumably the line count of the
        # preprocessed Gigaword file) only feeds the progress bar
        for line in tqdm(f, total=175771829):
            for word in line.split():
                if word in monosemous_noun2related_hdns:
                    found_monosemous.append(word)
    print('Found monosemous words: ', len(found_monosemous))

    # Turn Gigaword into a supervised training set
    r = random.Random(5328952)
    used_hdn_lists = set()
    available_hdn_lists = set()
    with open(gigaword_path) as f, open(output_path, 'w') as f_out:
        for line in tqdm(f, total=175771829):
            sent = line.split()
            for i, word in enumerate(sent):
                if word in monosemous_noun2related_hdns:
                    # mask the monosemous word with a placeholder token
                    new_sent = sent[:]
                    new_sent[i] = '<target>'
                    for hdn in monosemous_noun2related_hdns[word]:
                        available_hdn_lists.update(all_hdns[hdn])
                    # emit two training examples per occurrence, each pairing
                    # a gold HDN with one of its candidate HDN lists
                    for _ in range(2):
                        hdn = r.choice(monosemous_noun2related_hdns[word])
                        hdn_list = r.choice(all_hdns[hdn])
                        used_hdn_lists.add(hdn_list)
                        f_out.write(' '.join((hdn, '/'.join(hdn_list), ' '.join(new_sent))))
                        f_out.write('\n')
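
    # Each emitted line has this shape (schematic placeholders, not real
    # data): the gold HDN, the '/'-joined candidate HDN list, then the
    # sentence with the monosemous word masked:
    #
    #     eng-30-AAAAAAAA-n eng-30-AAAAAAAA-n/eng-30-BBBBBBBB-n the <target> ran home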
    print('Used HDN lists:', len(used_hdn_lists))
    print('HDN lists available in GigaWord:', len(available_hdn_lists))
    all_hdn_lists = set(hdn_list
                        for hdn_lists in all_hdns.values()
                        for hdn_list in hdn_lists)
    print('HDN lists in WordNet:', len(all_hdn_lists))

    unused_hdn_lists = all_hdn_lists.difference(used_hdn_lists)
    print("Sample HDN lists that don't occur in our dataset")
    # sample from a sorted list: random.sample no longer accepts sets,
    # and sorting keeps the seeded sample reproducible
    for hdn_list in r.sample(sorted(unused_hdn_lists), 10):
        print(','.join(hdn_list2lemma[hdn_list]), '-->',
              '/'.join(id2synset[h].name() for h in hdn_list))