# tools.py
import random
import inflect
import nltk
import data
from nltk.corpus import cmudict
from nltk.corpus import brown
from nltk.corpus import wordnet

DEBUG_SKIP_WORD_ANALYSIS = False
DEBUG_OUTPUT = False
DEBUG_OUTPUT_NOISY = False

# Brown-corpus POS tag prefixes used to classify words. WORD_TYPE_VERB_PRESENT
# ("VBG") must come before WORD_TYPE_VERB ("VB") in WORD_TYPES so gerunds are
# not matched as plain verbs by the prefix checks below.
WORD_TYPE_UNKNOWN = ""
WORD_TYPE_NOUN = "N"
WORD_TYPE_ADJECTIVE = "JJ"
WORD_TYPE_VERB_PRESENT = "VBG"
WORD_TYPE_VERB = "VB"
WORD_TYPES = [WORD_TYPE_NOUN, WORD_TYPE_ADJECTIVE, WORD_TYPE_VERB_PRESENT, WORD_TYPE_VERB]

p = inflect.engine()

# Loaded lazily in is_uncountable_noun; None means "not loaded yet".
uncountable_nouns = None

def download_corpi():
    """Download the NLTK corpora this module depends on (run once)."""
    nltk.download("brown")
    nltk.download("cmudict")
    nltk.download("wordnet")

def starts_with_vowel_sound(word, pronunciations=cmudict.dict()):
    """Return True if the first CMUdict pronunciation of `word` begins with a vowel.

    Vowel phonemes in CMUdict carry a stress digit (e.g. "AH0"), so checking
    whether the first phoneme ends in a digit detects a vowel sound. The
    default argument is evaluated once at import, which caches the dictionary.
    """
    for syllables in pronunciations.get(word, []):
        return syllables[0][-1].isdigit()  # use only the first pronunciation
    return False  # unknown words fall back to a consonant sound


def get_indefinite_article(word):
    return "an" if starts_with_vowel_sound(word) else "a"

def pluralize(word):
    # inflect handles irregular plurals, unlike a naive "+s" suffix.
    return p.plural(word)
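
# A minimal usage sketch; exact outputs depend on the installed inflect version.
def _demo_pluralize():
    assert pluralize("cat") == "cats"
    assert pluralize("child") == "children"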

def choose_and_remove(elements):
    """Pop and return a uniformly random element, mutating `elements`."""
    index = random.randrange(len(elements))
    item = elements[index]
    del elements[index]
    return item
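
# A minimal usage sketch: the chosen element is removed from the caller's list.
def _demo_choose_and_remove():
    colors = ["red", "green", "blue"]
    picked = choose_and_remove(colors)
    assert picked not in colors and len(colors) == 2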

def concat_list(elements, transform_function=lambda x: x):
    """Join elements into natural English: "a", "a and b", "a, b and c"."""
    result = ""
    count = len(elements)
    for index in range(count):
        if index > 0:
            result += " and " if index == count - 1 else ", "
        result += transform_function(elements[index])
    return result
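
# A minimal usage sketch, including the optional per-element transform.
def _demo_concat_list():
    assert concat_list(["red"]) == "red"
    assert concat_list(["red", "blue"]) == "red and blue"
    assert concat_list(["red", "green", "blue"]) == "red, green and blue"
    assert concat_list(["cat", "dog"], pluralize) == "cats and dogs"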

def find_most_common_word_type(word):
    """Return the WORD_TYPES entry `word` is most often tagged as in Brown."""
    if DEBUG_SKIP_WORD_ANALYSIS:
        return random.choice(WORD_TYPES)
    # Note: this scans the entire tagged Brown corpus on every call, which is slow.
    result = nltk.FreqDist(t for w, t in brown.tagged_words() if w.lower() == word).most_common()
    if len(result) > 0:
        result_type = result[0][0]
        if DEBUG_OUTPUT_NOISY:
            print(word + " is a " + result_type)
        for word_type in WORD_TYPES:
            if result_type.startswith(word_type):
                return word_type
        if DEBUG_OUTPUT:
            print("[find_most_common_word_type] Unknown word type: " + result_type + " (for " + word + ")")
    else:
        if DEBUG_OUTPUT:
            print("[find_most_common_word_type] Unknown word: " + word)
    return WORD_TYPE_UNKNOWN
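
# A minimal usage sketch (assumes the brown corpus has been downloaded; slow,
# since each call rescans the corpus):
def _demo_find_most_common_word_type():
    assert find_most_common_word_type("dog") == WORD_TYPE_NOUN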

def has_word_type(word, word_types):
    """Return True if any Brown-corpus tag for `word` matches one of `word_types`."""
    if DEBUG_SKIP_WORD_ANALYSIS:
        return True
    results = nltk.FreqDist(t for w, t in brown.tagged_words() if w.lower() == word).most_common()
    if len(results) > 0:
        for result in results:
            result_type = result[0]
            for word_type in word_types:
                if result_type.startswith(word_type):
                    if DEBUG_OUTPUT_NOISY:
                        print(word + " has a word type " + result_type)
                    return True
    else:
        if DEBUG_OUTPUT:
            print("[has_word_type] Unknown word: " + word)
    return False
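
# A minimal usage sketch: "run" appears in Brown both as a verb and as a noun.
def _demo_has_word_type():
    assert has_word_type("run", [WORD_TYPE_VERB, WORD_TYPE_NOUN])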

def random_weighted_choice(elements, weight_delegate):
    """Pick a random element; each element's chance is proportional to its weight."""
    total_chance = sum(map(weight_delegate, elements))
    number = random.uniform(0, total_chance)
    for element in elements:
        number -= weight_delegate(element)
        if number <= 0:
            return element
    # Floating-point rounding can leave a tiny positive remainder;
    # fall back to the last element.
    return element
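
# A minimal usage sketch: with weights 1 and 9, "common" should be picked
# roughly 90% of the time.
def _demo_random_weighted_choice():
    options = [("rare", 1), ("common", 9)]
    picks = [random_weighted_choice(options, lambda o: o[1]) for _ in range(1000)]
    assert picks.count(("common", 9)) > picks.count(("rare", 1))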

def nounify_first_result(word, default=None):
    """Return the most likely noun form of `word`; fall back to the word itself
    (or `default`, if given) when no noun form is found."""
    result = nounify(word)
    if len(result) == 0:
        return word if default is None else default
    return result[0][0]

def nounify(word, pos=None):
    """Transform a word to the closest noun: die -> death.

    Returns a list of (word, probability) tuples sorted by descending
    probability, or an empty list if no related noun is found.
    """
    if pos is None:
        # Try adjective, adverb and verb senses in turn.
        for t in ["a", "r", "v"]:
            result = nounify(word, t)
            if len(result) > 0:
                return result
        return []
    synsets = wordnet.synsets(word, pos=pos)
    # Word not found
    if not synsets:
        return []
    # Get all lemmas of the word with the requested part of speech
    lemmas = [l for s in synsets for l in s.lemmas() if s.name().split('.')[1] == pos]
    # Get related forms
    derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas]
    # Filter only the nouns
    related_noun_lemmas = [l for drf in derivationally_related_forms for l in drf[1] if l.synset().name().split('.')[1] == 'n']
    # Extract the words from the lemmas (Lemma.name is a method and must be called)
    words = [l.name() for l in related_noun_lemmas]
    len_words = len(words)
    # Build the result as a list of (word, probability) tuples
    result = [(w, float(words.count(w)) / len_words) for w in set(words)]
    result.sort(key=lambda w: -w[1])
    # Return all the possibilities sorted by probability
    return result

def test_nounify():
print(nounify_first_result("testing"))
print(nounify_first_result("decide"))
print(nounify_first_result("deciding"))
print(nounify_first_result("decided"))
print(nounify_first_result("conclude"))
print(nounify_first_result("fast"))
print(nounify_first_result("feline"))
print(nounify_first_result("beautiful"))
print(nounify_first_result("boring"))
print(nounify_first_result("extreme"))
print(nounify_first_result("cute"))
print(nounify_first_result("exploding"))

def is_uncountable_noun(word):
    """Return True if `word` is an uncountable noun (list loaded lazily from data)."""
    global uncountable_nouns
    if uncountable_nouns is None:
        uncountable_nouns = data.load_uncountable_nouns()
    return word in uncountable_nouns
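
# A minimal manual check, gated so importing the module stays side-effect free.
# Assumes the NLTK corpora have been downloaded (see download_corpi) and that
# the local `data` module provides load_uncountable_nouns(); exact outputs
# depend on the corpus contents.
if __name__ == "__main__":
    test_nounify()
    print(get_indefinite_article("apple") + " apple")  # expected: "an apple"
    print(pluralize("child"))  # expected: "children"
    print(find_most_common_word_type("dog"))  # expected: WORD_TYPE_NOUN ("N")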