-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_conllu.py
55 lines (42 loc) · 1.51 KB
/
split_conllu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python
#
# Author: (c) 2016 Vincent Kriz <[email protected]>
#
import sys
import logging
import argparse
# Logging.
from wheel.metadata import requires_to_requires_dist
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s'
)

# Command-line interface: the input file and the output directory are both
# mandatory options.
parser = argparse.ArgumentParser(
    description='Split CoNNLU file into several smaller files.'
)
parser.add_argument('--input_file', help='an input file', required=True)
parser.add_argument('--output_dir', help='an output dir', required=True)
args = parser.parse_args()
# A method for reading one sentence.
def read_sentence(filehandler):
    """Read the next blank-line-terminated sentence from an open text file.

    Lines are consumed one at a time, stripped of trailing whitespace and
    collected until a blank line is reached; that blank line is kept as the
    last element of the result.

    Args:
        filehandler: An open text-mode file object (anything whose
            ``readline()`` returns ``''`` at end of file).

    Returns:
        list: The stripped lines of the sentence followed by a terminating
        ``''``. An empty list is returned when the file is already
        exhausted, so callers can use it as the end-of-input signal.
    """
    lines = []
    while True:
        raw_line = filehandler.readline()
        if raw_line == '':
            # readline() yields '' only at end of file (a blank line inside
            # the file is '\n'). Without this check the function would keep
            # returning [''] forever and the caller could never detect EOF.
            if lines:
                # The last sentence had no trailing blank line; terminate it
                # the same way as every other sentence.
                lines.append('')
            return lines

        one_line = raw_line.rstrip()
        lines.append(one_line)
        if one_line == '':
            return lines
# Split the input into one output file per sentence, named after the
# sentence ID found in the first line of each sentence.
sentence_counter = 0
with open(args.input_file, 'r') as fin:
    while True:
        sentence = read_sentence(fin)
        # Stop once read_sentence() returns an empty list.
        if not sentence:
            break

        # Progress message before the 1st, 10001st, 20001st, ... sentence.
        if sentence_counter % 10000 == 0:
            logging.info('Processed %d sentences.', sentence_counter)
        sentence_counter += 1

        # NOTE(review): only the first line is inspected for the sentence ID;
        # sentences that start with anything else are not written anywhere.
        # Confirm this is intended.
        header = sentence[0]
        if not header.startswith('# sent_id'):
            continue

        # Skip the '# sent_id' prefix and the single separator character.
        sent_id = header[10:]
        # Standard CoNLL-U writes the comment as '# sent_id = <id>'; drop the
        # '=' separator so it does not end up in the file name. The plain
        # '# sent_id <id>' form is unaffected.
        if sent_id.startswith('='):
            sent_id = sent_id[1:].lstrip()
        # Remove the '/cs' suffix from the ID, if present.
        if sent_id.endswith('/cs'):
            sent_id = sent_id[:-3]

        output_file = '%s/%s.conllu' % (args.output_dir, sent_id)
        logging.debug('File: %s', output_file)
        with open(output_file, 'w') as fout:
            fout.write('\n'.join(sentence))
            fout.write('\n')