-
Notifications
You must be signed in to change notification settings - Fork 0
/
word2vec_with_morfessor_build.py
37 lines (26 loc) · 1.14 KB
/
word2vec_with_morfessor_build.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import gensim
import morfessor
import time
import os
# Wall-clock reference point; the elapsed time printed at the end of the
# script is measured from here, so it includes loading the Morfessor model.
start_time = time.time()

# Load a previously saved Morfessor model from the relative path
# 'morfessor/types' (resolved against the current working directory).
# NOTE(review): presumably a model trained on word types -- confirm.
io = morfessor.MorfessorIO()
model_types = io.read_binary_model_file('morfessor/types')

# Directory whose files form the training corpus.
# Alternative (small test corpus): 'C:/Users/Ольга/Desktop/test'
path_dir = 'C:/Users/Ольга/Downloads/lemmas_only'
class MySentences(object):
    """Stream a directory of text files as morph-segmented sentences.

    Every line of every regular file in ``dirname`` becomes one sentence:
    a list of morphs produced by the segmenter, with a ``' '`` token
    appended after each word. The files are re-read from disk on each
    iteration, so the corpus never has to fit in memory and the object can
    be iterated more than once.

    Args:
        dirname: Directory containing the corpus files. Files are decoded
            as UTF-8 and undecodable bytes are ignored.
        segmenter: Optional object providing Morfessor's ``segment(word)``
            (raising ``KeyError`` for unknown words) and
            ``viterbi_segment(word)`` (returning a tuple whose first item
            is the list of morphs). When omitted, the module-level
            ``model_types`` is used.
    """

    def __init__(self, dirname, segmenter=None):
        self.dirname = dirname
        self.segmenter = segmenter

    def __iter__(self):
        """Yield one list of tokens per input line (empty list for blank lines)."""
        # Resolve the default lazily so the global is only needed when no
        # explicit segmenter was supplied.
        segmenter = self.segmenter if self.segmenter is not None else model_types

        # Sorted so the sentence order does not depend on the file system.
        for fname in sorted(os.listdir(self.dirname)):
            path = os.path.join(self.dirname, fname)
            if not os.path.isfile(path):
                # Sub-directories cannot be opened as text; skip them.
                continue
            # ``with`` guarantees the handle is closed even if the consumer
            # stops iterating early or segmentation raises.
            with open(path, encoding='utf-8', errors='ignore') as handle:
                for line in handle:
                    output = []
                    for word in line.split():
                        try:
                            # Fast path: word has a stored analysis.
                            output.extend(segmenter.segment(word))
                        except KeyError:
                            # Unknown word: fall back to Viterbi decoding.
                            output.extend(segmenter.viterbi_segment(word)[0])
                        # NOTE(review): ' ' is emitted after every word as a
                        # word-boundary token; confirm this is the intended
                        # nesting (as opposed to once per line).
                        output.append(' ')
                    yield output
# Stream the corpus from disk (the memory-friendly iterator pattern from
# Rehurek's word2vec post) instead of loading it all into RAM.
corpus = MySentences(path_dir)

# gensim 4.0 renamed Word2Vec's ``size`` keyword to ``vector_size``; pick the
# spelling that matches the installed release so the script runs on both.
_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# Skip-gram (sg=1) with 15 negative samples and 300-dimensional vectors;
# tokens seen fewer than 3 times are dropped from the vocabulary.
model = gensim.models.Word2Vec(
    corpus,
    sg=1,
    window=5,
    min_count=3,
    workers=4,
    negative=15,
    **{_dim_kwarg: 300}
)
model.save('word2vec_morpho')
print("Elapsed time for learning: {:.3f} sec".format(time.time() - start_time))