# tmp_vn.py (78 lines, 70 loc, 2.33 KB) -- forked from Limour-dev/HelloGPT
import json
from opencc import OpenCC
cc = OpenCC('t2s') # 't2s' means Traditional-to-Simplified Chinese conversion
from h_corpus import Fileset
import unicodedata
def fullwidth_to_halfwidth(input_str):
    """Return *input_str* with each character NFKC-normalized on its own.

    Every character is passed individually to
    ``unicodedata.normalize('NFKC', ...)`` and the results are concatenated,
    so full-width forms are replaced by their compatibility equivalents.
    Because normalization is applied per character, neighbouring characters
    are never composed with one another.
    """
    normalize = unicodedata.normalize
    return ''.join(normalize('NFKC', ch) for ch in input_str)
def clearT(s):
    """Return *s* cleaned up for output.

    The text is first run through ``fullwidth_to_halfwidth`` and the result
    is then passed to the module-level OpenCC converter ``cc``.
    """
    halfwidth = fullwidth_to_halfwidth(s)
    return cc.convert(halfwidth)
# Input root (later handed to Fileset) and output path prefix (the file name
# is appended directly, hence the trailing backslash).
a = r'D:\datasets\v-corpus'
b = r'D:\datasets\tmp' + '\\'

# nSet = {}
# Name-replacement table: further down, each raw speaker name is looked up
# with nSet.get(name, name).
with open('tmp_nSet.json', 'r', encoding='utf-8') as f:
    nSet = json.load(f)

# Disabled one-off steps, kept for reference (they appear to be how the
# table stored in tmp_nSet.json was built and cleaned up):
# with open('tmp_v_33036.txt', 'r', encoding='utf-8') as f:
#     tmp_v = [x.rstrip().split('\t') for x in f]
#
# for v,k in tmp_v:
#     nSet[k] = v
# for k in nSet:
#     tmp = k.replace(' ', '')
#     tmp = tmp.replace('・', '&')
#     tmp = tmp.replace('+', '&')
#     tmp = cc.convert(fullwidth_to_halfwidth(tmp))
#     nSet[k] = tmp
# for k, v in nSet.items():
#     v = v.replace('·', '&')
#     nSet[k] = v
def _cell(row, idx):
    """Return row[idx], or '' when the row is too short to have that cell.

    Data lines are split after ``rstrip()``, which removes trailing tabs, so
    empty cells at the end of a line are missing from the split result.
    """
    return row[idx] if idx < len(row) else ''


# `a` is rebound from the root path to a Fileset restricted to '.tsv'.  Below
# it is used like a sequence of file paths: len(a), and a[i] passed to open().
a = Fileset(a, ext='.tsv')

# Turn every TSV file into "<speaker>:<dialogue>" lines and write them to
# <b><index>.txt, where <index> is the file's position in the Fileset.
for i in range(len(a)):
    save = []
    with open(a[i], 'r', encoding='utf-8') as f:
        header = next(f, None)
        if header is None:
            # Empty file: there is no header row to locate the columns in.
            # Report it and move on instead of dying with StopIteration.
            print(a[i])
            continue
        tmp = header.rstrip().split('\t')
        # # print(tmp)
        # if not('Name' in tmp) :
        #     # print(a[i])
        #     # continue
        # A missing 'Name' or 'Dialogue' column still raises ValueError here.
        idx_n = tmp.index('Name')
        idx_d = tmp.index('Dialogue')
        # Without a 'Voice' column, the name column doubles as the voice.
        idx_v = tmp.index('Voice') if 'Voice' in tmp else idx_n
        for line in f:
            tmp = line.rstrip().split('\t')
            if len(tmp) < idx_d + 1:
                # No dialogue cell on this line: report the row and skip it.
                print(tmp)
                continue
            # The name/voice columns may sit after the dialogue column and be
            # empty (hence stripped off the line); read them as '' rather
            # than raising IndexError.
            n = _cell(tmp, idx_n)
            n = nSet.get(n, n).strip()
            v = _cell(tmp, idx_v).strip()
            d = tmp[idx_d].strip()
            if not d:
                # Blank dialogue: report the file and stop reading it.  Lines
                # collected so far are still written out below.
                print(a[i])
                break
            if n == '' and v != '':
                # No usable name: fall back to the voice cell.
                n = v
            # NOTE(review): as written this replaces ':' with itself, i.e. it
            # is a no-op; possibly one side was meant to be a different colon
            # character -- confirm against the upstream file.
            n = n.replace(':', ':')
            if n == '' and v == '':
                # Neither name nor voice: use the fixed label '旁白'.
                n = '旁白'
            save.append(clearT(n) + ':' + clearT(d))
    if save:
        with open(b + f'{i}.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(save))
    # break

# Disabled one-off code, kept for reference (it appears to have counted the
# raw names per file and dumped the table back to tmp_nSet.json):
# # print(tmp[idx_n], tmp[idx_d])
# # nSet[tmp[idx_n]] = nSet.get(tmp[idx_n],0) + 1
#
# json_str = json.dumps(nSet, indent=2, ensure_ascii=False)
# with open('tmp_nSet.json', 'w', encoding='utf-8') as f:
#     f.write(json_str)