# Utils for the LSTM encoder/decoder text normalization model.
# Mostly data preprocessing:
# adding a context window, padding to fixed-length vectors,
# converting tokens to ints and sequences to np arrays.
# CVU 2018
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# replace word/character tokens with their integer ids from the vocabulary
# (mutates token_list in place; unknown tokens map to vocab['UNK'])
def index(token_list, vocab):
    for token in token_list:
        for num in range(len(token)):
            token[num] = vocab.get(token[num], vocab['UNK'])
    return token_list
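
# Toy usage sketch (vocab and tokens below are made up for illustration):
#   vocab = {'UNK': 0, 'a': 1, 'b': 2}
#   index([['a', 'x'], ['b']], vocab)   # -> [[1, 0], [2]]  ('x' falls back to UNK)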

# add a context window of append_size neighboring tokens to each input sequence;
# pad is a (list-valued) dummy token used to fill the window at both edges
def add_context_window(df, append_size, pad, vocab):
    # pad both ends so edge tokens still get a full window
    df = [pad] * append_size + df + [pad] * append_size
    neo_data = []
    for i in range(append_size, len(df) - append_size):
        # wrap the center token in <norm> markers
        row = [vocab['<norm>']] + df[i] + [vocab['<norm>']]
        # flatten the left and right context and attach it around the center
        row = ([item for sublist in df[(i - append_size):i] for item in sublist]
               + row
               + [item for sublist in df[(i + 1):(i + 1 + append_size)] for item in sublist])
        neo_data.append(row)
    return neo_data
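
# Toy usage sketch (values are made up; assumes pad is a one-token list):
#   vocab = {'<norm>': 9}
#   add_context_window([[1], [2]], 1, [0], vocab)
#   # -> [[0, 9, 1, 9, 2], [1, 9, 2, 9, 0]]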

# pad input and target sequences to create fixed-length sequences,
# working in chunks of 10000 to keep pad_sequences' memory use bounded
def padding_batchwise(data, max_len):
    for i in range(0, len(data), 10000):
        i_end = min(i + 10000, len(data))
        data[i:i_end] = pad_sequences(data[i:i_end], maxlen=max_len, dtype='int32')
    return data
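
# Toy usage sketch (pad_sequences left-pads with zeros by default; the
# returned rows are int32 numpy arrays):
#   padding_batchwise([[1, 2], [3]], 3)   # -> [[0, 1, 2], [0, 0, 3]]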

# convert 2D target sequences to 3D one-hot arrays suitable for the LSTM model
def sequences(data, seq_len, vocab):
    one_hot = np.zeros((len(data), seq_len, len(vocab)))
    # set a 1 at the vocabulary index of each token in each sequence
    for i, sentence in enumerate(data):
        for j, word in enumerate(sentence):
            one_hot[i, j, word] = 1.
    return one_hot
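
# Toy usage sketch (3-word vocab, made up for illustration):
#   vocab = {'UNK': 0, 'a': 1, 'b': 2}
#   sequences([[1, 2]], 2, vocab).shape   # -> (1, 2, 3), ones at [0, 0, 1] and [0, 1, 2]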

# convert test data into array format, batch-wise
# (assumes every sequence has already been padded to length seq_len)
def array_batchwise(data, seq_len):
    array = np.empty((0, seq_len), dtype=int)
    for i in range(0, len(data), 10000):
        i_end = min(i + 10000, len(data))
        arr = np.asarray(data[i:i_end], dtype=int)
        array = np.concatenate((array, arr), axis=0)
    return array
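
# Toy usage sketch (rows must already match seq_len, e.g. via padding_batchwise):
#   array_batchwise([[1, 2], [3, 4]], 2)   # -> np.array([[1, 2], [3, 4]])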