-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpretokenize.py
executable file
·53 lines (46 loc) · 1.65 KB
/
pretokenize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
"""Converts data to CoNLL-U format using UDPipe models."""
import argparse
import spacy_udpipe
from udtube.data import conllu
BLANK = "_"
def main(args: argparse.Namespace) -> None:
spacy_udpipe.download(args.langcode)
tokenize = spacy_udpipe.load(args.langcode).tokenizer
with (
open(args.text, "r") as source,
open(args.conllu, "w") as sink,
):
for sentence in tokenize(source.read()).sents:
tokenlist = conllu.TokenList(
[
conllu.Token(
id_=conllu.ID(id_),
form=token.text,
lemma=BLANK,
upos=BLANK,
xpos=BLANK,
feats=BLANK,
head=BLANK,
deprel=BLANK,
deps=BLANK,
misc=BLANK,
)
for id_, token in enumerate(sentence, 1)
],
metadata={"text": sentence},
)
print(tokenlist, file=sink)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("text", help="path to input text file")
parser.add_argument("conllu", help="path for output CoNLL-U file")
parser.add_argument(
"--langcode",
required=True,
help="the language and name of treebank (e.g., `en-ewt`); "
"for a list of supported languages, see: "
"https://github.com/TakeLab/spacy-udpipe/blob/master/spacy_udpipe/"
"resources/languages.json",
)
main(parser.parse_args())