# utils.py

import numpy as np
import torch


def word_to_padded_index_sequence(word, chars, is_padding=False, word_len=20, cuda=False):
    """
    Converts a word into a fixed-length sequence of character indices.

    :param word: word to be tokenized into characters
    :param chars: character vocabulary (char -> index) containing "<PAD>" and "<UNK>"
    :param is_padding: if the word is a padding word itself, return all padding indices
    :param word_len: padded word length
    :param cuda: whether or not to use cuda tensors
    :return: LongTensor of character indices of length word_len
    """
    PADDING = "<PAD>"
    UNKNOWN = "<UNK>"
    if is_padding:
        char_indices = [chars[PADDING]] * word_len
    else:
        char_indices = []
        for i in range(word_len):
            if i < len(word):
                if word[i] in chars:
                    index = chars[word[i]]
                else:
                    # unseen character -> unknown index
                    index = chars[UNKNOWN]
            else:
                # pad short words out to word_len
                index = chars[PADDING]
            char_indices.append(index)
    if cuda:
        return torch.cuda.LongTensor(char_indices)
    return torch.LongTensor(char_indices)
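
# Usage sketch for word_to_padded_index_sequence (illustrative only; the toy character
# vocabulary below is an assumption, not part of this module):
#
#   chars = {"<PAD>": 0, "<UNK>": 1, "c": 2, "a": 3, "t": 4}
#   word_to_padded_index_sequence("cat", chars, word_len=5)
#   # -> tensor([2, 3, 4, 0, 0]); characters outside the vocabulary map to chars["<UNK>"]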


def sentence_to_padded_index_sequence(tokens, words, chars, seq_len=50, word_len=20, cuda=False):
    """
    Converts a tokenized sentence into padded word indices of length seq_len, and each
    word in the sentence into padded character indices of length word_len.

    :param tokens: list of words
    :param words: word vocabulary (word -> index) containing "<PAD>" and "<UNK>"
    :param chars: character vocabulary (char -> index) containing "<PAD>" and "<UNK>"
    :param seq_len: padded sentence length
    :param word_len: padded word length
    :param cuda: whether or not to use cuda tensors
    :return: (word indices of shape (seq_len,), character indices of shape (seq_len, word_len))
    """
    PADDING = "<PAD>"
    UNKNOWN = "<UNK>"
    sentence_words = []
    # one character-index tensor per (possibly padding) position in the sentence
    words_as_chars = []
    for i in range(seq_len):
        if i < len(tokens):
            # known word -> its index, unseen word -> the unknown index
            if tokens[i] in words:
                index = words[tokens[i]]
            else:
                index = words[UNKNOWN]
            words_as_chars.append(word_to_padded_index_sequence(tokens[i], chars,
                                                                word_len=word_len, cuda=cuda))
        else:
            # pad short sentences out to seq_len
            index = words[PADDING]
            words_as_chars.append(word_to_padded_index_sequence('', chars, word_len=word_len,
                                                                cuda=cuda, is_padding=True))
        sentence_words.append(index)
    if cuda:
        sentence_words = torch.cuda.LongTensor(sentence_words)
    else:
        sentence_words = torch.LongTensor(sentence_words)
    return sentence_words, torch.stack(words_as_chars)
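
# Usage sketch for sentence_to_padded_index_sequence (illustrative only; the toy
# vocabularies below are assumptions, not part of this module):
#
#   words = {"the": 0, "cat": 1, "<UNK>": 2, "<PAD>": 3}
#   chars = {"t": 0, "h": 1, "e": 2, "c": 3, "a": 4, "<UNK>": 5, "<PAD>": 6}
#   word_ids, char_ids = sentence_to_padded_index_sequence(["the", "cat", "sat"], words, chars,
#                                                          seq_len=4, word_len=5)
#   # word_ids -> tensor([0, 1, 2, 3])      ("sat" is unknown, position 4 is padding)
#   # char_ids.shape -> torch.Size([4, 5])  (one row of character indices per position)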


def load_embedding(embedding_path, words_to_load=1000000):
    """
    Loads pretrained 300-dimensional word embeddings from a whitespace-separated text
    file (one word followed by 300 values per line) and builds the word and character
    vocabularies, with "<UNK>" and "<PAD>" entries appended to both.

    :param embedding_path: path to the embedding file
    :param words_to_load: maximum number of words to load
    :return: (embedding matrix, word vocabulary, character vocabulary)
    """
    with open(embedding_path) as f:
        loaded_embeddings = []
        words = {}
        chars = {}
        idx2words = {}
        ordered_words = []
        for line in f:
            if len(words) >= words_to_load:
                break
            s = line.split()
            # skip malformed lines (e.g. tokens containing spaces): expect a word plus 300 values
            if len(s) != 301:
                continue
            # skip words that have already been loaded
            if s[0] in words:
                continue
            loaded_embeddings.append(np.asarray(s[1:]))
            # collect the word's characters into the character vocabulary
            for c in s[0]:
                if c not in chars:
                    chars[c] = len(chars)
            words[s[0]] = len(words)
            idx2words[words[s[0]]] = s[0]
            ordered_words.append(s[0])
        # add unknown (random vector) and padding (zero vector) entries to the word vocabulary
        loaded_embeddings.append(np.random.rand(300))
        words["<UNK>"] = len(words)
        loaded_embeddings.append(np.zeros(300))
        words["<PAD>"] = len(words)
        # add unknown and padding entries to the character vocabulary
        chars["<UNK>"] = len(chars)
        chars["<PAD>"] = len(chars)
        loaded_embeddings = np.array(loaded_embeddings).astype(float)
    return loaded_embeddings, words, chars
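

if __name__ == "__main__":
    # Minimal usage sketch. The embedding path below is a placeholder for a 300d
    # embedding text file (e.g. GloVe) and must point to a real file on disk; the
    # other arguments are arbitrary example values.
    embeddings, words, chars = load_embedding("glove.840B.300d.txt", words_to_load=50000)
    word_ids, char_ids = sentence_to_padded_index_sequence(
        "the quick brown fox".split(), words, chars, seq_len=10, word_len=15)
    print(embeddings.shape)  # (number of loaded words + 2, 300)
    print(word_ids.shape)    # torch.Size([10])
    print(char_ids.shape)    # torch.Size([10, 15])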