-
Notifications
You must be signed in to change notification settings - Fork 2
/
task_make_annotation.py
executable file
·44 lines (37 loc) · 1.72 KB
/
task_make_annotation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from os import listdir
from underthesea import pos_tag, word_sent
from os.path import join, dirname, basename
import io
def auto_annotation(input_file, output_folder="."):
    """Write a word-segmented text file and a POS annotation file for *input_file*.

    Two files are created in *output_folder*, both named after the part of
    the input file's base name that precedes its first dot:

    * ``<id>.txt`` -- every input line segmented with ``word_sent`` and its
      words joined by single spaces; lines are joined by ``"\\n"``.
    * ``<id>.ann`` -- one ``T<n>\\t<tag> <start> <end>\\t<word>`` line per
      token returned by ``pos_tag``, where ``start``/``end`` are character
      offsets into the ``.txt`` content (presumably brat standoff format,
      judging by the output folder used by the script -- confirm).

    :param input_file: path of a UTF-8 encoded text file, one text per line.
    :param output_folder: existing directory that receives both output files.
    :return: None.
    """
    file_id = basename(input_file).split(".")[0]

    # Decode while reading (instead of ``str.decode``, which only exists on
    # Python 2) and close the handle deterministically.
    with io.open(input_file, encoding="utf-8") as source:
        texts = source.read().strip().split("\n")

    segmented_lines = [u" ".join(word_sent(text)) for text in texts]

    output_text_file = join(output_folder, "%s.txt" % file_id)
    with io.open(output_text_file, "w", encoding="utf-8", newline="\n") as text_file:
        text_file.write(u"\n".join(segmented_lines))

    output_annotation_file = join(output_folder, "%s.ann" % file_id)
    token_id = 1
    # Offset of the current line's first character inside the .txt content.
    line_start = 0
    with io.open(output_annotation_file, "w", encoding="utf-8", newline="\n") as ann_file:
        for text, segmented in zip(texts, segmented_lines):
            start = line_start
            # NOTE(review): assumes pos_tag yields (word, tag) pairs whose
            # words are the same segmentation word_sent produced for this
            # line -- confirm against the underthesea version in use.
            for word, tag in pos_tag(text):
                end = start + len(word)
                ann_file.write(u"T%d\t%s %d %d\t%s\n" % (token_id, tag, start, end, word))
                token_id += 1
                # Skip the single space that separates words in the .txt file.
                start = end + 1
            # Re-anchor on the text actually written: the line itself plus
            # its newline. This keeps offsets right after a line that
            # produced no tokens (e.g. a blank line), which previously
            # shifted every following annotation by one character.
            line_start += len(segmented) + 1
def get_annotated_files(brat_folder):
    """Return the set of annotated text file names found in *brat_folder*.

    Only directory entries whose name starts with ``p_`` and ends with
    ``.txt`` are considered; the two-character prefix is removed from each
    returned name.
    """
    prefix = "p_"
    suffix = ".txt"
    names = set()
    for entry in listdir(brat_folder):
        if not entry.startswith(prefix):
            continue
        if not entry.endswith(suffix):
            continue
        names.add(entry[len(prefix):])
    return names
if __name__ == '__main__':
    # All folders are resolved relative to the directory of this script.
    base_dir = dirname(__file__)

    # Candidate inputs: every entry of the raw news folder.
    raw_folder = join(base_dir, "raw", "vinews")
    raw_names = set(listdir(raw_folder))

    # Names that already have a "p_"-prefixed text file in either brat folder.
    brat_dir = join(base_dir, "brat")
    brat_final_dir = join(base_dir, "brat_final")
    done_names = get_annotated_files(brat_dir).union(get_annotated_files(brat_final_dir))

    # Auto-annotate whatever is left, writing the results into "brat".
    for pending_name in raw_names.difference(done_names):
        auto_annotation(join(raw_folder, pending_name), join(base_dir, "brat"))