-
Notifications
You must be signed in to change notification settings - Fork 2
/
build_vocab.py
48 lines (40 loc) · 1.42 KB
/
build_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from os.path import join
import json
import os
import random
import argparse
from collections import Counter
import pickle as pkl
import re
def _count_data(path):
""" count number of data in the given path"""
matcher = re.compile(r'[0-9]+\.json')
match = lambda name: bool(matcher.match(name))
names = os.listdir(path)
n_data = len(list(filter(match, names)))
return n_data
def main(data_dir):
    """Build a word-frequency vocabulary from the training split.

    Reads every '<i>.json' file under <data_dir>/train, whitespace-
    tokenizes the 'summary' and 'reviewText' fields (both presumably
    lists of strings, since they are joined with ' ' — TODO confirm
    against the preprocessing that wrote these files), and pickles the
    resulting Counter to <data_dir>/vocab_cnt.pkl.

    Args:
        data_dir: directory containing the 'train' split and where
            'vocab_cnt.pkl' is written.
    """
    split_dir = join(data_dir, "train")
    n_data = _count_data(split_dir)
    vocab_counter = Counter()
    for i in range(n_data):
        # Context manager closes each handle promptly; the original
        # json.load(open(...)) left every file to the garbage collector.
        with open(join(split_dir, '{}.json'.format(i))) as f:
            js = json.load(f)
        summary_text = ' '.join(js['summary']).strip()
        review_text = ' '.join(js['reviewText']).strip()
        all_tokens = summary_text.split(' ') + review_text.split(' ')
        # Drop empty tokens produced by consecutive spaces.
        vocab_counter.update(t for t in all_tokens if t != "")
    with open(os.path.join(data_dir, "vocab_cnt.pkl"),
              'wb') as vocab_file:
        pkl.dump(vocab_counter, vocab_file)
if __name__ == "__main__":
    # Command-line entry point: read the data directory and run the build.
    arg_parser = argparse.ArgumentParser(
        description=('Preprocess review data')
    )
    arg_parser.add_argument('-data_dir', type=str, action='store',
                            help='The directory of the data.')
    cli_args = arg_parser.parse_args()
    main(cli_args.data_dir)