-
Notifications
You must be signed in to change notification settings - Fork 1
/
fitXtl.py
35 lines (31 loc) · 913 Bytes
/
fitXtl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from gensim import corpora, similarities
import numpy as np
import sys,gzip,collections,gensim.models.doc2vec
from gensim.models import LsiModel, TfidfModel
from collections import OrderedDict, namedtuple
import multiprocessing, random,unicodedata, re
from random import shuffle
import datetime
# Worker count: every CPU except two.  Not referenced elsewhere in the
# visible script; kept so the module-level name stays available.
cores = multiprocessing.cpu_count()-2

# Documents read from the input file: one list of string tokens per line.
ms = []

# Command line: <script> <gzip-compressed list file> <number of topics>.
# argv is consumed destructively from the front.
sys.argv.pop(0)                 # discard the script name
lst = sys.argv.pop(0)           # input path; also used in output file names
nTopics = int(sys.argv.pop(0))  # topic count, converted to int

# Read the input into ms.  Each line is a ';'-separated record; the first
# two fields are skipped and the remaining fields form one document.
# The context manager closes the gzip handle even if a line fails to parse.
with gzip.open(lst) as f:
    for raw in f:
        # Lines arrive as bytes; any byte that is not ASCII is dropped.
        fields = raw.rstrip().decode('ascii', 'ignore').split(';')
        # Slicing (instead of popping twice) turns a line with fewer than
        # two fields into an empty document rather than an IndexError.
        ms.append(fields[2:])

# One document is appended per input line, so the record count is len(ms).
nl = len(ms)
print('records:' + str(nl))
# Build the gensim Dictionary over every document in ms and save it.
dictionary = corpora.Dictionary(ms)
dictionary.save('dict.{}'.format(lst))

# Bag-of-words form of each document, in the same order as ms.
doc = list(map(dictionary.doc2bow, ms))

# Fit the TF-IDF model on the bag-of-words corpus and save it.
modt = TfidfModel(corpus=doc, normalize=True)
modt.save('tfidf.{}'.format(lst))

# Fit the LSI model with nTopics topics on the TF-IDF-transformed corpus;
# the saved file name carries both the input name and the topic count.
modl = LsiModel(corpus=modt[doc], num_topics=nTopics)
modl.save('tlsi.{}.{}'.format(lst, nTopics))