-
Notifications
You must be signed in to change notification settings - Fork 6
/
load_data.py
92 lines (74 loc) · 2.98 KB
/
load_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
__author__ = 'thiagocastroferreira'
"""
Author: Thiago Castro Ferreira
Date: 12/12/2017
Description:
Script for loading the referring expressions collection.
PYTHON VERSION: 2.7
UPDATE CONSTANTS:
VOCAB_PATH
TRAIN_REFEX_PATH
DEV_REFEX_PATH
TEST_REFEX_PATH
"""
import os
# Directory holding the vocabulary files read by run(): input_vocab.txt and
# either output_vocab.txt or character_vocab.txt (one entry per line).
VOCAB_PATH = 'data/'
# Directories holding the referring expression collections. run() reads
# pre_context.txt, pos_context.txt, entity.txt, refex.txt and size.txt from
# each of them.
TRAIN_REFEX_PATH = 'data/train'
DEV_REFEX_PATH = 'data/dev'
TEST_REFEX_PATH = 'data/test'
def load(fpre_context, fpos_context, fentity, frefex, fsize, character):
    """Read one referring expression collection from its five text files.

    Every file is read in full and split on '\\n' only, so a file ending
    with a newline contributes one extra, empty final line to each list.

    Args:
        fpre_context: path of the file whose lines are whitespace-tokenized
            into the 'pre_context' entries.
        fpos_context: path of the file whose lines are whitespace-tokenized
            into the 'pos_context' entries.
        fentity: path of the file whose raw lines become the 'entity' entries.
        frefex: path of the file with the referring expressions.
        fsize: path of the file whose raw lines become the 'size' entries
            (kept as strings, not converted to numbers).
        character: if truthy, each referring expression is turned into a list
            of single characters wrapped in 'eos' markers; otherwise it is
            split on whitespace like the context files.

    Returns:
        A dict with the keys 'pre_context', 'pos_context', 'entity', 'refex'
        and 'size', each mapping to a list with one item per line of the
        corresponding file.
    """
    def read_lines(path):
        # Whole-file read, split on '\n' (not splitlines), to keep the
        # trailing empty line that the rest of the pipeline may rely on.
        with open(path) as handle:
            return handle.read().split('\n')

    # Files are opened in the same order as the parameters are declared.
    pre_tokens = [line.split() for line in read_lines(fpre_context)]
    pos_tokens = [line.split() for line in read_lines(fpos_context)]
    entities = read_lines(fentity)

    expressions = []
    for line in read_lines(frefex):
        if character:
            # Drop every 'eos' occurrence already in the line, trim the
            # surrounding whitespace, then delimit the characters with 'eos'.
            symbols = ['eos'] + list(line.replace('eos', '').strip()) + ['eos']
        else:
            symbols = line.split()
        expressions.append(symbols)

    sizes = read_lines(fsize)

    collection = {}
    collection['pre_context'] = pre_tokens
    collection['pos_context'] = pos_tokens
    collection['entity'] = entities
    collection['refex'] = expressions
    collection['size'] = sizes
    return collection
def run(character=False):
    """Load the vocabularies and the train, dev and test collections.

    Args:
        character: if truthy, the output vocabulary is read from
            'character_vocab.txt' instead of 'output_vocab.txt', and the
            flag is forwarded to load() for every collection.

    Returns:
        A tuple (vocab, trainset, devset, testset). vocab is a dict with the
        keys 'input' and 'output', each a list of the lines of the matching
        vocabulary file; the three sets are the values returned by load()
        for TRAIN_REFEX_PATH, DEV_REFEX_PATH and TEST_REFEX_PATH.
    """
    # VOCABULARY
    if character:
        output_vocab_name = 'character_vocab.txt'
    else:
        output_vocab_name = 'output_vocab.txt'

    vocab = {}
    for side, vocab_name in (('input', 'input_vocab.txt'),
                             ('output', output_vocab_name)):
        with open(os.path.join(VOCAB_PATH, vocab_name)) as handle:
            # Split on '\n' only; a trailing newline yields a final '' entry.
            vocab[side] = handle.read().split('\n')

    # TRAINSET, DEVSET, TESTSET (loaded in that order)
    file_names = ('pre_context.txt', 'pos_context.txt', 'entity.txt',
                  'refex.txt', 'size.txt')
    collections = []
    for directory in (TRAIN_REFEX_PATH, DEV_REFEX_PATH, TEST_REFEX_PATH):
        fprecontext, fposcontext, fentity, frefex, fsize = [
            os.path.join(directory, file_name) for file_name in file_names
        ]
        collections.append(
            load(fprecontext, fposcontext, fentity, frefex, fsize, character)
        )

    trainset, devset, testset = collections
    return vocab, trainset, devset, testset