-
Notifications
You must be signed in to change notification settings - Fork 22
/
preprocess.py
43 lines (34 loc) · 1.16 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
__author__='thiagocastroferreira'
"""
Author: Thiago Castro Ferreira
Date: 15/07/2018
Description:
Preprocessing script: for the test, dev and train sets, parse the files, order the English part and preprocess it for translation
PYTHON VERSION: 2.7
"""
import order
import parser
from nmt import NMT
import cPickle as p
def run(set_path, entry_path, _set):
    """
    Parse one set, order its English part and pickle the resulting entryset.

    :param set_path: path passed to parser.run_parser
    :param entry_path: path of the pickle file to write (created or overwritten)
    :param _set: name of the set being processed; currently unused because
        the NMT step below is disabled, kept so existing callers still work
    :return: None
    """
    entryset = parser.run_parser(set_path) # parse
    entryset = order.run(entryset, 'en') # order english part

    # NMT step is disabled for now:
    # nmt = NMT(entryset, _set) # translate to german
    # nmt.preprocess()

    # Use a context manager so the file is flushed and closed even if dump()
    # raises; the original left the handle open until garbage collection.
    # NOTE(review): mode 'w' is kept unchanged so the bytes on disk stay the
    # same; this relies on the default (ASCII) pickle protocol, and readers
    # should open the file in the matching mode.
    with open(entry_path, 'w') as entry_file:
        p.dump(entryset, entry_file)
if __name__ == '__main__':
    # One row per set, processed in this order: test, dev, train.
    # Row layout: (progress message, input directory, output pickle, set name).
    jobs = (
        ('Preparing testset....',
         'dependencies/delexicalized/v1.4/test',
         'dependencies/test.cPickle',
         'test'),
        ('Preparing devset...',
         'dependencies/delexicalized/v1.4/dev',
         'dependencies/dev.cPickle',
         'dev'),
        ('Preparing trainset....',
         'dependencies/delexicalized/v1.4/train',
         'dependencies/train.cPickle',
         'train'),
    )

    for message, source_path, pickle_path, set_name in jobs:
        print(message)
        run(source_path, pickle_path, set_name)