-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
156 lines (126 loc) · 4.62 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from __future__ import print_function
import os
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import string
import tensorflow as tf
import json
from os.path import join, dirname
FLAGS = tf.app.flags.FLAGS
def get_txt_files(directory):
"""
Function to get all text files from a directory
params:
directory(string): path of directory
returns:
txt_files: list of all txt_file paths
"""
txt_files = []
for root, dirs, files in os.walk(directory):
for file in files:
if file.endswith('.txt'):
txt_files.append(root + file)
return txt_files
def get_sen_and_labels(df):
"""
Function to get all sentences, labels
"""
sentences = []
labels = []
for txt,lab in zip(df.text,df.label):
sentences.append(txt.split())
labels.append([lab])
return sentences, labels
def build_dict(sen_or_labels, include_pad=True):
"""
Function to construct dicts for word 2 idx and also return reverse
maps
params:
sen_or_labels(string): list of lists of sentences or labels
include_pad(bool): whether <pad> token should be added to dict
returns:
word2idx(dict) and its reverse map
"""
# Create a dictionary with default value 0
word2idx = defaultdict(lambda: 0)
idx = 0
if include_pad:
pad_token = '<pad>'
word2idx[pad_token] = idx
idx += 1
for word_list in sen_or_labels:
for word in word_list:
if word in word2idx:
continue
word2idx[word] = idx
idx += 1
# py3
idx2word = {v: k for k, v in word2idx.items()}
# py2
# idx2word = {v: k for k, v in word2idx.iter_items()}
return word2idx, idx2word
def sen2idxs(sentences, word2idx):
# converts string sentences to list of word ids
return [word2idx[word] for word in sentences]
def lab2idxs(label, label2idx):
# converts string labels to list of word ids
one_hot= [0]*FLAGS.num_classes
one_hot[ label2idx[label[0]] ]=1
return one_hot
def batches_generator(batch_size, sentences, labels,
word2idx, label2idx,
shuffle=False, allow_smaller_last_batch=False,
get_lengths=False):
"""
Generates padded batches of sen and labels.
params:
batch_size(int): batch_size of sen or labels to be returned
sentences(string): list of lists
labels(string): list of lists
shuffle(bool): whether data should be shuffled
allow_smaller_batch(bool): self explanatory
get_lengths(bool): whether seq_lengths of each sen to be
returned
returns:
A generator obj which can yield a sen and a label batch
"""
n_samples = len(sentences)
if shuffle:
order = np.random.permutation(n_samples)
else:
order = np.arange(n_samples)
n_batches = n_samples // batch_size
if allow_smaller_last_batch and n_samples % batch_size:
n_batches += 1
for k in range(n_batches):
batch_start = k * batch_size
batch_end = min((k + 1) * batch_size, n_samples)
current_batch_size = batch_end - batch_start
x_list = []
y_list = []
seq_lengths = []
max_len_sentence = 0
for idx in order[batch_start: batch_end]:
if len(sentences[idx]) > FLAGS.max_words:
sentences[idx] = sentences[idx][:FLAGS.max_words]
seq_lengths.append(len(sentences[idx]))
x_list.append(sen2idxs(sentences[idx], word2idx))
y_list.append(lab2idxs(labels[idx], label2idx))
max_len_sentence = max(max_len_sentence, len(sentences[idx]))
# Fill in the data into numpy nd-arrays filled with padding indices.
x = np.ones([current_batch_size, FLAGS.max_words],
dtype=np.int32) * word2idx['<pad>']
# lengths = np.zeros(current_batch_size, dtype=np.int32)
for n in range(current_batch_size):
# print(x.shape, x_list[n])
utt_len = len(x_list[n])
x[n, :utt_len] = x_list[n]
# lengths[n] = utt_len
# yield x, np.array(y_list), lengths
# print(y_list)
if get_lengths is True:
yield x, np.array(y_list), np.asarray(seq_lengths,
dtype=np.float64)
else:
yield x, np.array(y_list)