'''
To use this script:
1) Run the CoreNLP server in a separate process:
   > python2 coreNLP.py &
2) Run this client script (i.e. this file):
   > python gen_dep_lemma.py <Corpus File>
'''
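
# Illustrative sketch of the expected input (one JSON object per line); the
# field names below are inferred from the parsing code in __main__ and the
# values are hypothetical:
#   {"metadata": {"file": "...", "line": 12, "treeNode": 3},
#    "before":   [{"string": "A preceding sentence."}],
#    "match":    {"sentence": {"string": "The sentence containing the sluice."}}}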
from __future__ import print_function
import itertools
import json
import sys

from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp


class StanfordNLP:
    def __init__(self):
        self.server = ServerProxy(JsonRpc20(),
                                  TransportTcpIp(addr=("127.0.0.1", 8080)))

    def parse(self, text):
        return json.loads(self.server.parse(text))


def chain(*args):
    ''' flattens arguments into a single list '''
    if len(args) == 1:
        l = itertools.chain.from_iterable(*args)
    else:
        l = itertools.chain(*args)
    return list(l)
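
# Quick illustration (not in the original source) of chain's two call forms:
#   chain([1, 2], [3])    -> [1, 2, 3]   (several iterable arguments)
#   chain([[1, 2], [3]])  -> [1, 2, 3]   (a single nested iterable)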


def sub_one(w):
    ''' fixes an off-by-one error in the dependency token-index suffix '''
    def f(w):
        # keep the original parts and append the decremented index
        return chain([w, [str(int(w[-1]) - 1)]])
    rel, w1, w2 = w
    w1 = f(w1.split('-'))
    w2 = f(w2.split('-'))
    return [rel, w1, w2]
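
# A worked example of sub_one's effect (the dependency triple is hypothetical):
#   sub_one(['nsubj', 'ran-2', 'dog-1'])
#   -> ['nsubj', ['ran', '2', '1'], ['dog', '1', '0']]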


def get_article(example):
    ''' Extracts the match sentence and the sentence preceding it from an example '''
    before = [sent['string'] for sent in example['before']]
    match = example['match']['sentence']['string']
    candidates = before + [match]
    return candidates[-2:]
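
# Sketch of the input shape get_article assumes (field names inferred from the
# code; the sentences are made up):
#   get_article({'before': [{'string': 'He left.'}],
#                'match': {'sentence': {'string': 'I wonder why.'}}})
#   -> ['He left.', 'I wonder why.']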


if __name__ == '__main__':
    nlp = StanfordNLP()

    ## e.g. linesWithAnnotations110716_onlyannotated.json
    # Assumes line-separated JSON dumps of examples (one object per line)
    anno_path = sys.argv[1]
    anno_data = {}
    with open(anno_path, 'r') as in_:
        for example in in_:
            dict_ = json.loads(example)
            try:
                sluice_id = "{0[file]}_{0[line]}_{0[treeNode]}".format(dict_['metadata'])
                article = get_article(dict_)
            except Exception:
                # skip malformed examples
                continue
            anno_data[sluice_id] = article

    ## writes dependency-lemma structures to dep_lemma.jsons
    with open('dep_lemma.jsons', 'w') as out_:
        for sluice_id, article in anno_data.items():
            for sent in article:
                try:
                    result = nlp.parse(sent)
                except Exception:
                    print('error in parsing', file=sys.stderr)
                    print(sluice_id, file=sys.stderr)
                    continue
                if len(result['sentences']) != 1:
                    print('multiple sentences', file=sys.stderr)
                    print(sluice_id, file=sys.stderr)
                    continue
                lemmas = [word[-1]['Lemma'] for word in result['sentences'][-1]['words']]
                try:
                    dict_ = {'sluiceId': sluice_id,
                             'deps': list(map(sub_one, result['sentences'][-1]['dependencies'])),
                             'string': sent,
                             'lemmas': lemmas}
                except Exception:
                    print('error in dict', file=sys.stderr)
                    print(sluice_id, file=sys.stderr)
                    continue
                print(json.dumps(dict_), file=out_)
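
# Illustrative output line in dep_lemma.jsons (values are hypothetical; the
# keys match the dict built above):
#   {"sluiceId": "file1_12_3",
#    "deps": [["nsubj", ["ran", "2", "1"], ["dog", "1", "0"]]],
#    "string": "The dog ran.", "lemmas": ["the", "dog", "run", "."]}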