forked from EvilFreelancer/ruMorpheme
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread.py
64 lines (58 loc) · 2.2 KB
/
read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# чтение и разметка данных
import numpy as np
def generate_BMES(morphs, morph_types):
    """Produce one BMES label per character for a segmented word.

    Each morph is paired positionally with its type and expanded as:

    * a single-character morph  -> ``"S-<type>"``
    * a longer morph            -> ``"B-<type>"``, zero or more
      ``"M-<type>"``, then ``"E-<type>"``

    Empty morphs are skipped: they cover no characters, so emitting labels
    for them would make the label sequence longer than the word.

    Args:
        morphs: sequence of morph strings, in word order.
        morph_types: sequence of type names (strings), parallel to ``morphs``.
            Pairing is done with ``zip``, so extra items in the longer
            sequence are ignored.

    Returns:
        A flat list of label strings whose length equals the total number
        of characters in the paired morphs.
    """
    answer = []
    for morph, morph_type in zip(morphs, morph_types):
        size = len(morph)
        if size == 0:
            # A doubled or trailing separator yields "" here; without this
            # guard it would contribute a spurious "B-"/"E-" pair.
            continue
        if size == 1:
            answer.append("S-" + morph_type)
        else:
            answer.append("B-" + morph_type)
            answer.extend(["M-" + morph_type] * (size - 2))
            answer.append("E-" + morph_type)
    return answer
def read_splitted(infile, transform_to_BMES=True, n=None, morph_sep="/", shuffle=True):
    """Load words with untyped morph segmentations from a text file.

    Every line is expected to be ``word<TAB>morph<morph_sep>morph...``.
    Since the file carries no morph types, each morph gets the placeholder
    type ``"None"``. Reading stops at the first line that is empty after
    stripping whitespace.

    Args:
        infile: path of the UTF-8 encoded input file.
        transform_to_BMES: if true, each target is the per-character label
            list from ``generate_BMES``; otherwise it is the list of
            placeholder types, one per morph.
        n: if not ``None``, keep only the first ``n`` items of the
            (possibly shuffled) order.
        morph_sep: separator between morphs in the second column.
        shuffle: if true, the item order is permuted with
            ``np.random.shuffle``.

    Returns:
        A ``(source, targets)`` pair of parallel lists.

    Raises:
        ValueError: if a line does not split into exactly two tab-separated
            fields.
    """
    words, labels = [], []
    with open(infile, "r", encoding="utf8") as handle:
        for raw in handle:
            record = raw.strip()
            if not record:
                # The first blank line ends the data section.
                break
            word, segmentation = record.split("\t")
            pieces = segmentation.split(morph_sep)
            kinds = ["None"] * len(pieces)
            labels.append(generate_BMES(pieces, kinds) if transform_to_BMES else kinds)
            words.append(word)
    order = list(range(len(words)))
    if shuffle:
        np.random.shuffle(order)
    if n is not None:
        order = order[:n]
    return [words[i] for i in order], [labels[i] for i in order]
def read_BMES(infile, transform_to_BMES=True, n=None,
              morph_sep="/", sep=":", shuffle=True):
    """Load words with typed morph segmentations from a text file.

    Every line is expected to be
    ``word<TAB>morph<sep>type<morph_sep>morph<sep>type...``. Reading stops
    at the first line that is empty after stripping whitespace.

    Args:
        infile: path of the UTF-8 encoded input file.
        transform_to_BMES: if true, each target is the per-character label
            list from ``generate_BMES``; otherwise it is the list of morph
            strings.
        n: if not ``None``, keep only the first ``n`` items of the
            (possibly shuffled) order.
        morph_sep: separator between annotated morphs in the second column.
        sep: separator between a morph and its type.
        shuffle: if true, the item order is permuted with
            ``np.random.shuffle``.

    Returns:
        A ``(source, targets)`` pair of parallel lists.

    Raises:
        ValueError: if a line does not split into exactly two tab-separated
            fields.
        IndexError: if an annotated morph has no ``sep``-separated type.
    """
    words, labels = [], []
    with open(infile, "r", encoding="utf8") as handle:
        for raw in handle:
            record = raw.strip()
            if not record:
                # The first blank line ends the data section.
                break
            word, annotated = record.split("\t")
            pieces, kinds = [], []
            for chunk in annotated.split(morph_sep):
                fields = chunk.split(sep)
                # Only the first two fields are used; extras are ignored.
                pieces.append(fields[0])
                kinds.append(fields[1])
            if transform_to_BMES:
                target = generate_BMES(pieces, kinds)
            else:
                target = pieces
            words.append(word)
            labels.append(target)
    order = list(range(len(words)))
    if shuffle:
        np.random.shuffle(order)
    if n is not None:
        order = order[:n]
    return [words[i] for i in order], [labels[i] for i in order]