-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpieCSV_to_xml.py
executable file
·52 lines (42 loc) · 1.75 KB
/
pieCSV_to_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import sys
from falcon import *
import falcon.lemmatise_pie as lemmatise_pie
import regex as re
# Sentence-final punctuation.  It is applied with ``match``, which anchors at
# the start of the string, so any token *beginning* with ".", "?" or "!"
# closes the current sentence.
_FINALPUNCT = re.compile(r"[.?!]")

# Header names that are kept as annotation columns.  The comparison is
# case-sensitive (so "POS" and "pos" are accepted but "Lemma" is not); the
# name is lower-cased when used as a key in the token dictionary.
_KNOWN_COLUMNS = frozenset(('token', 'form', 'pos', 'POS', 'lemma', 'morph'))


def read_sentences(lines):
    """Group the rows of a tab-separated annotation file into sentences.

    The first item of *lines* is the header row.  Every following non-blank
    row becomes one token dictionary holding ``"form"`` (first column),
    ``"id"`` (``"w_<n>"``), ``"order_id"`` (``"<n>"``, numbering starts at 1)
    and one entry per recognised header column.

    A sentence is closed when a token starts with ``.``, ``?`` or ``!``, or
    when a blank row is met.  Empty sentences are never emitted.

    Args:
        lines: Iterable of text lines (an open text file works).

    Returns:
        A list of sentences, each a list of token dictionaries.

    Side effects:
        A row with too few fields is reported on standard output and stops
        the reading; tokens collected up to that row are still returned.
    """
    rows = iter(lines)
    header = next(rows, "").rstrip().split('\t')

    # Remember the *original* position of every recognised column, so that an
    # unrecognised column in the header does not shift the later ones onto
    # the wrong field.
    columns = [
        (position, name.lower())
        for position, name in enumerate(header)
        if name in _KNOWN_COLUMNS
    ]
    min_width = max((position for position, _ in columns), default=-1) + 1

    sentences = []
    sent = []
    token_id = 1
    for line in rows:
        fields = line.rstrip().split('\t')

        # Blank rows must be tested before the width check: they split into
        # a single empty field and would otherwise be reported as malformed.
        if fields == ['']:
            if sent:
                sentences.append(sent)
                sent = []
            continue

        if len(fields) < min_width:
            print("Error on this line:")
            print(fields)
            break

        token = {
            "form": fields[0],
            "id": "w_" + str(token_id),
            "order_id": str(token_id),
        }
        # Add the annotations named in the header (a "form" column, if
        # present, overrides the first-column default set above).
        for position, name in columns:
            token[name] = fields[position]
        sent.append(token)
        token_id += 1

        if _FINALPUNCT.match(fields[0]):
            sentences.append(sent)
            sent = []

    # Flush the last sentence when the input does not end on final
    # punctuation or a blank row.
    if sent:
        sentences.append(sent)
    return sentences


def main(paths):
    """Convert each annotation file in *paths* to XML.

    Every file is read with ``read_sentences``; the resulting mapping of
    path -> sentences is handed to ``lemmatise_pie.xmlify`` and each entry of
    its result is written to ``<key>.xml``.

    Args:
        paths: Iterable of input file paths.
    """
    content = {}
    for path in paths:
        # NOTE(review): input is assumed to be UTF-8 -- confirm against the
        # tool that produces these files.
        with open(path, 'r', encoding='utf-8') as f:
            content[path] = read_sentences(f)

    documents = lemmatise_pie.xmlify(content)
    for doc in documents:
        with open(doc + ".xml", 'w', encoding='utf-8') as out:
            out.write(documents[doc])


if __name__ == "__main__":
    main(sys.argv[1:])