-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbuild_embs.py
81 lines (70 loc) · 4.07 KB
/
build_embs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import numpy as np
import pickle
import os
import re
import utils.data_utils as data_utils
import argparse
import json
from collections import defaultdict
def main(argv=None):
    """Build and cache the auxiliary artifacts for a text task.

    Produces (or loads, when already cached) three artifacts under
    ``<data_dir>/aux_files``:
      1. vocabulary dicts (word->id and id->word), pickled;
      2. a truncated word-distance matrix (``small_dist_counter``), .npy;
      3. a GloVe embedding matrix aligned to the vocabulary, .npy.

    Args:
        argv: Optional list of command-line argument strings. ``None``
            (the default) falls back to ``sys.argv[1:]`` — standard
            argparse behavior.
    """
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--data_dir",
                        default="./data/",
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name",
                        default="imdb",
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    # NOTE(review): --max_candidates and --threshold_distance are parsed
    # but never read in this script — presumably consumed by a sibling
    # script sharing this CLI; kept for interface compatibility.
    parser.add_argument("--max_candidates",
                        default=4,
                        type=int)
    parser.add_argument("--threshold_distance",
                        default=0.5,
                        type=float)
    parser.add_argument("--vocab_size",
                        default=50000,
                        type=int)
    # BUG FIX: parse_args() previously ignored the `argv` parameter, so
    # main(argv) could never be driven programmatically. Passing argv
    # through is backward-compatible (None -> sys.argv[1:]).
    args = parser.parse_args(argv)

    MAX_VOCAB_SIZE = args.vocab_size
    aux_dir = os.path.join(args.data_dir, 'aux_files')
    # exist_ok avoids the check-then-create race of the original code.
    os.makedirs(aux_dir, exist_ok=True)

    def _aux_path(template):
        # Path of a cached artifact named `template % (task, vocab_size)`.
        return os.path.join(aux_dir, template % (args.task_name, MAX_VOCAB_SIZE))

    # --- Vocabulary dictionaries (word->id and id->word) ------------------
    dic_path = _aux_path('org_dic_%s_%d.pkl')
    inv_dic_path = _aux_path('org_inv_dic_%s_%d.pkl')
    if not os.path.isfile(dic_path):
        print('org_dic & org_inv_dic not exist, build and save the dict...')
        org_dic, org_inv_dic, _ = data_utils.build_dict(args.task_name, MAX_VOCAB_SIZE, data_dir=args.data_dir)
        with open(dic_path, 'wb') as f:
            pickle.dump(org_dic, f, protocol=4)
        with open(inv_dic_path, 'wb') as f:
            pickle.dump(org_inv_dic, f, protocol=4)
    else:
        print('org_dic & org_inv_dic already exist, load the dict...')
        with open(dic_path, 'rb') as f:
            org_dic = pickle.load(f)
        with open(inv_dic_path, 'rb') as f:
            org_inv_dic = pickle.load(f)

    # --- Truncated word-distance matrix ------------------------------------
    dist_path = _aux_path('small_dist_counter_%s_%d.npy')
    if not os.path.isfile(dist_path):
        print('small dist counter not exists, create and save...')
        dist_mat = data_utils.compute_dist_matrix(org_dic, args.task_name, MAX_VOCAB_SIZE, data_dir=args.data_dir)
        print('dist matrix created!')
        # Keep at most 50 neighbours per word within distance 1.5
        # (values taken verbatim from the original pipeline).
        small_dist_mat = data_utils.create_small_embedding_matrix(dist_mat, MAX_VOCAB_SIZE, threshold=1.5, retain_num=50)
        print('small dist counter created!')
        np.save(dist_path, small_dist_mat)
    else:
        print('small dist counter exists, loading...')
        small_dist_mat = np.load(dist_path)

    # --- GloVe embedding matrix aligned to the vocabulary -------------------
    glove_path = _aux_path('embeddings_glove_%s_%d.npy')
    if not os.path.isfile(glove_path):
        print('embeddings glove not exists, creating...')
        glove_model = data_utils.loadGloveModel('vectors/glove.840B.300d.txt', data_dir=args.data_dir)
        glove_embeddings, _ = data_utils.create_embeddings_matrix(glove_model, org_dic, dataset=args.task_name, data_dir=args.data_dir)
        print("embeddings glove created!")
        np.save(glove_path, glove_embeddings)
    else:
        print('embeddings glove exists, loading...')
        glove_embeddings = np.load(glove_path)
# Script entry point: build/cache the auxiliary files when run directly.
if __name__ == '__main__':
    main()