-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
50 lines (41 loc) · 2.2 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pyphen
# Characters that preprocess() deletes outright after line breaks are normalised.
_PREPROCESS_DROPPED = '\r.?!&()*\'[]/\\,#$+%;:^@~_={}|'
# One-pass table: Polish diacritics -> plain ASCII letters, dropped chars -> nothing.
_PREPROCESS_TABLE = str.maketrans('ęćąńóśźżł', 'ecanoszzl', _PREPROCESS_DROPPED)


def preprocess(DATA):
    """Return a lowercased, punctuation-free, ASCII-folded copy of ``DATA``.

    Steps, in order:

    1. Lowercase the text and remove double quotes.
    2. Turn ``'<br> '`` and ``'<br>'`` into newlines.
    3. Delete carriage returns and the punctuation/symbol characters listed in
       ``_PREPROCESS_DROPPED``, and map Polish diacritics to their base letters.
    4. Collapse ``'--'`` to ``'-'`` and then replace every ``'-'`` with a space.

    NOTE: ``<`` and ``>`` are never removed, and ``/`` is stripped only after
    the ``<br>`` substitution, so a ``<br/>`` tag ends up as literal ``<br>``.
    """
    # Quotes must go before the <br> substitution, everything else after it:
    # the order decides which character sequences count as a "<br>" tag.
    text = DATA.lower().replace('"', '')
    for tag in ('<br> ', '<br>'):
        text = text.replace(tag, '\n')
    text = text.translate(_PREPROCESS_TABLE)
    return text.replace("--", "-").replace("-", " ")
# Characters that preprocess_pl() deletes outright after line breaks are normalised.
_PREPROCESS_PL_DROPPED = '\r&()*\'[]/\\,—\ufeff#$+%;:^@~_={}|'
_PREPROCESS_PL_TABLE = str.maketrans('', '', _PREPROCESS_PL_DROPPED)


def preprocess_pl(DATA):
    """Return ``DATA`` with markup and symbol characters removed, case kept.

    Unlike a lowercasing cleaner, this keeps letter case, Polish diacritics and
    the sentence punctuation ``.``, ``?`` and ``!``.

    Steps, in order:

    1. Remove double quotes.
    2. Collapse each ``'\\n\\n'`` pair into a single newline (one pass only).
    3. Turn ``'<br> '`` and ``'<br>'`` into newlines.
    4. Delete carriage returns, em dashes, the BOM (``U+FEFF``) and the symbol
       characters listed in ``_PREPROCESS_PL_DROPPED``.
    5. Collapse ``'--'`` to ``'-'`` and then replace every ``'-'`` with a space.

    NOTE: blank-line collapsing runs before ``<br>`` conversion and before
    ``\\r`` removal, so newlines produced by those later steps are not merged.
    """
    text = DATA.replace('"', '')
    text = text.replace('\n\n', '\n')
    for tag in ('<br> ', '<br>'):
        text = text.replace(tag, '\n')
    text = text.translate(_PREPROCESS_PL_TABLE)
    return text.replace("--", "-").replace("-", " ")
def get_syllables(data, lang):
    """Split ``data`` into a flat list of hyphenation chunks with separators.

    The text is lowercased, ``.``, ``?`` and ``!`` are removed, hyphens become
    spaces (``'--'`` is first collapsed to ``'-'``) and each ``'\\n\\n'`` pair
    is collapsed to one newline. Every space-separated word of every line is
    then passed to ``pyphen``'s ``inserted()`` and split on ``'-'``.

    Args:
        data: Text to tokenise.
        lang: Language code handed to ``pyphen.Pyphen(lang=...)``.

    Returns:
        A list of strings: the chunks of each word, a ``' '`` after every word
        (including the last one on a line) and a ``'\\n'`` after every line
        (including the last one).
    """
    text = data.lower()
    for mark in ('.', '?', '!'):
        text = text.replace(mark, '')
    text = text.replace("--", "-").replace("-", " ").replace('\n\n', '\n')

    hyphenator = pyphen.Pyphen(lang=lang)
    tokens = []
    for row in text.split('\n'):
        for word in row.split(' '):
            # inserted() marks break points with '-'; the text itself no
            # longer contains hyphens at this point, so the split is safe.
            tokens.extend(hyphenator.inserted(word).split('-'))
            tokens.append(' ')
        tokens.append('\n')
    return tokens