-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlsi_model.py
79 lines (57 loc) · 2.56 KB
/
lsi_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Latent Semantic Indexing Model
------------------------------
Dimensionality reduction of bag-of-words (BOW) vectors with LSI
Public Methods
~~~~~~~~~~~~~~
train_model
"""
from __future__ import print_function
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
class CustomPreprocessor(object):
    """Pass-through preprocessor: hands every document back unchanged.

    Instances are callable so they can be supplied as the ``preprocessor``
    argument of the vectorizers built in this module.
    """

    def __call__(self, doc):
        # No normalisation is applied; the very same object is returned.
        return doc
class CustomTokenizer(object):
    """Pass-through tokenizer: returns the document it receives as-is.

    Instances are callable so they can be supplied as the ``tokenizer``
    argument of the vectorizers built in this module. Because nothing is
    split here, whatever is passed in is used directly as the token sequence.
    """

    def __call__(self, doc):
        # Identity: no splitting or filtering is performed.
        return doc
# Document-frequency bounds as (min_df, max_df): terms that occur in fewer
# than 5 documents, or in more than 90% of the documents, are ignored.
df = (5, 0.90)
_min_df, _max_df = df

# Raw term-count vectorizer; documents pass through the identity
# preprocessor and tokenizer defined above.
default_tf_vectorizer = CountVectorizer(
    min_df=_min_df,
    max_df=_max_df,
    tokenizer=CustomTokenizer(),
    preprocessor=CustomPreprocessor(),
)

# TF-IDF weighted vectorizer configured with the same bounds and the same
# kind of pass-through callables (separate instances).
default_tfidf_vectorizer = TfidfVectorizer(
    min_df=_min_df,
    max_df=_max_df,
    tokenizer=CustomTokenizer(),
    preprocessor=CustomPreprocessor(),
)
def train_model(data_samples, test_dataset, vectorizer=default_tfidf_vectorizer, n_components=50):
    """Fit an LSI (truncated SVD) model on ``data_samples`` and project ``test_dataset``.

    The vectorizer is fitted on ``data_samples`` and the resulting
    document-term matrix is used to fit a ``TruncatedSVD`` with
    ``n_components`` components. ``test_dataset`` is then vectorized with the
    fitted vectorizer and projected into the LSI space.

    Note that ``vectorizer`` is fitted in place. The default is a module-level
    instance, so it is shared (and re-fitted) across calls that rely on it.

    :param data_samples: sized collection of training documents, in the form
        the vectorizer accepts.
    :param test_dataset: documents to vectorize and project with the fitted
        models.
    :param vectorizer: object exposing ``fit_transform``, ``transform`` and
        ``get_feature_names_out`` (or the legacy ``get_feature_names``).
    :param n_components: number of LSI components to keep.
    :return: dict with the keys

        * ``'transformations'`` - LSI projection of ``test_dataset``
        * ``'features'`` - list of feature names learned by the vectorizer
        * ``'_model'`` - the fitted ``TruncatedSVD``
        * ``'_tfidf_vectorizer'`` - the fitted vectorizer
        * ``'_tfidf_transformations'`` - vectorized ``test_dataset``
        * ``'_name'`` - ``<vectorizer class name, lowercased>_lsi<n_components>``
    """
    n_samples = len(data_samples)

    # Build the document-term matrix with the supplied vectorizer (TF-IDF
    # weights by default, raw term counts if a CountVectorizer is passed).
    print("Extracting tfidf features for LSI...")
    t0 = time()
    # Single pass: equivalent to fit() followed by transform() on the same data.
    tf = vectorizer.fit_transform(data_samples)
    print("done in %0.3fs." % (time() - t0))

    print("Fitting LSI models with tf features, "
          "n_samples=%d"
          % n_samples)
    lsi_model = TruncatedSVD(n_components=n_components, n_iter=10, random_state=42)
    t0 = time()
    lsi_model.fit(tf)
    print("done in %0.3fs." % (time() - t0))

    # https://stats.stackexchange.com/questions/171539/percentage-of-variation-in-each-column-explained-by-each-svd-mode
    # https://stats.stackexchange.com/questions/184603/in-pca-what-is-the-connection-between-explained-variance-and-squared-error
    # Proportion of explained variance is 1 - Error^2 when we reconstruct
    print(lsi_model.explained_variance_ratio_)
    print(lsi_model.explained_variance_ratio_.sum())

    tf_test = vectorizer.transform(test_dataset)
    lsi_test = lsi_model.transform(tf_test)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back for older releases. The
    # result is converted to a list so the returned type stays the same.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = list(vectorizer.get_feature_names_out())
    else:
        features = vectorizer.get_feature_names()

    return {
        'transformations': lsi_test,
        'features': features,
        '_model': lsi_model,
        '_tfidf_vectorizer': vectorizer,
        '_tfidf_transformations': tf_test,
        '_name': vectorizer.__class__.__name__.lower() + '_lsi' + str(n_components)
    }