"""
Binary TF-IDF Vectorizer
------------------------
Implements a custom TF-IDF formula based on binary values and create a weighted vector for every document
Public Classes
~~~~~~~~~~~~~~
BinaryTFIDFVectorizer
Public Methods
~~~~~~~~~~~~~~
train_model
"""
from __future__ import division, print_function

import math

import numpy as np
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
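
# In sketch form, the weighting this module implements: term frequency is
# binary (a word either occurs in a document or it does not), so every
# nonzero cell of the output receives only the inverse document frequency
# of its word:
#
#     weight(w, d) = 1[w in d] * idf(w),    idf(w) = -log(df(w) / N)
#
# where df(w) is the number of documents containing w and N is the number
# of documents being transformed.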


class CustomPreprocessor(object):
    """Identity preprocessor: documents are passed through unchanged."""

    def __call__(self, doc):
        return doc


class CustomTokenizer(object):
    """Identity tokenizer: each document is expected to already be a list
    of tokens, so no splitting is performed."""

    def __call__(self, doc):
        return doc


class BinaryTFIDFVectorizer(object):
    """Contains the logic of the vectorizer using binary TF-IDF.

    Attributes
    ----------
    metas : list
        deduplicated list of meta labels, passed in as a parameter
    nbr_of_metas : int
        number of distinct meta labels
    words : list
        the vocabulary learned by ``fit``
    nbr_of_words : int
        size of the vocabulary
    """

    def __init__(self, metas):
        # min_df=5 drops words that appear in fewer than 5 documents;
        # max_df=0.90 drops words that appear in more than 90% of documents.
        df = (5, 0.90)
        self.tf_vectorizer = CountVectorizer(
            min_df=df[0], max_df=df[1],
            tokenizer=CustomTokenizer(),
            preprocessor=CustomPreprocessor())
        self.metas = list(set(metas))
        self.nbr_of_metas = len(self.metas)
        self.words = None
        self.nbr_of_words = None

    def fit(self, data_samples):
        self.tf_vectorizer.fit(data_samples)
        # get_feature_names() was removed in scikit-learn 1.2; newer
        # versions expose get_feature_names_out() instead.
        self.words = self.tf_vectorizer.get_feature_names()
        self.nbr_of_words = len(self.words)

    def transform(self, test_dataset, metas):
        tf_trained = self.tf_vectorizer.transform(test_dataset)
        N = tf_trained.shape[0]
        # Shortcut: meta weights are recomputed from the dataset being
        # transformed rather than stored at fit time.
        print("loading metas...")
        uniq_metas = list(set(metas))
        nbr_of_metas = len(uniq_metas)
        # words_docs_occurences[j]: number of documents containing word j.
        # words_metas_occurences[j][m]: number of documents with meta m
        # that contain word j.
        words_docs_occurences = [0] * self.nbr_of_words
        words_metas_occurences = [[0] * nbr_of_metas for _ in range(self.nbr_of_words)]
        coo_tf_trained = coo_matrix(tf_trained)
        for i, j, tf in zip(coo_tf_trained.row, coo_tf_trained.col, coo_tf_trained.data):
            # print("document = %d, term = %d, tf = %s" % (i, j, tf))
            words_docs_occurences[j] += 1
            meta_index = uniq_metas.index(metas[i])
            words_metas_occurences[j][meta_index] += 1
        words_metas_occurences_sorted = []
        for word_metas_occurences in words_metas_occurences:
            word_metas_occurences_total = sum(word_metas_occurences)
            word_metas_occurences.sort(reverse=True)
            # The +1 prevents division by zero and also penalizes words
            # with very low frequencies.
            word_metas_occurences = [occ / (word_metas_occurences_total + 1)
                                     for occ in word_metas_occurences]
            words_metas_occurences_sorted.append(word_metas_occurences)
        # NOTE: words_metas_occurences_sorted is computed for inspection
        # only; it does not feed into the matrix returned below.
        # idf(w) = -log(df(w) / N); the +1 smoothing guards against log(0)
        # for vocabulary words that never occur in this dataset.
        idf = [-math.log((doc_occ + 1) / (N + 1)) for doc_occ in words_docs_occurences]
        # Helps inspect the idf weights, sorted from highest to lowest.
        sorted_indexed_idf = np.argsort(idf)[::-1]
        sorted_word_idf = [[self.words[i], idf[i]] for i in sorted_indexed_idf.tolist()]
        # from pandas import DataFrame
        # sorted_word_idf_frame = DataFrame(sorted_word_idf)
        # with open('./output/tfidf_values.txt', 'w') as f:
        #     f.write(sorted_word_idf_frame.to_string())
        row = []
        col = []
        data = []
        for i, j, tf in zip(coo_tf_trained.row, coo_tf_trained.col, coo_tf_trained.data):
            # print("row = %d, column = %d, value = %s" % (i, j, tf))
            row.append(i)
            col.append(j)
            # Binary term frequency: any nonzero count contributes exactly
            # the word's idf weight, however often the word occurs.
            word_tfidf = idf[j]
            data.append(word_tfidf)
        return csr_matrix((np.array(data), (np.array(row), np.array(col))),
                          shape=(N, self.nbr_of_words))

    def get_feature_names(self):
        return self.words


def train_model(data_samples, test_dataset, metas):
    vectorizer = BinaryTFIDFVectorizer(metas)
    vectorizer.fit(data_samples)
    tfidf_trained = vectorizer.transform(test_dataset, metas)
    return {
        'features': vectorizer.get_feature_names(),
        'transformations': tfidf_trained,
        '_model': vectorizer,
        '_name': 'mutual_info',
    }
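

# A minimal usage sketch, assuming scikit-learn < 1.2 (which this file
# targets) and documents that are already tokenized into lists of tokens.
# The toy corpus below is hypothetical; note that min_df=5 / max_df=0.90
# require every kept word to appear in at least 5 but in at most 90% of
# the training documents.
if __name__ == '__main__':
    train_docs = [['binary', 'tfidf', 'weights'] for _ in range(5)] + [['filler']]
    test_docs = [['binary', 'tfidf'], ['weights']]  # pre-tokenized test set
    test_metas = ['news', 'blog']                   # one meta label per test doc

    result = train_model(train_docs, test_docs, test_metas)
    print(result['features'])                    # learned vocabulary
    print(result['transformations'].toarray())   # binary TF-IDF matrix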