-
Notifications
You must be signed in to change notification settings - Fork 0
/
document_matrix.py
131 lines (120 loc) · 5 KB
/
document_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import sys
import numpy as np
from app.irsystem.models.inverted_index import InvertedIndex
from collections import Counter
from app.irsystem.models.get_data import data, trail_to_idx
from app.irsystem.models.tokens import *
from sklearn.feature_extraction.text import TfidfVectorizer
class DTMat:
"""
Inputs:
- [term_rep], a choice of representations:
- 'tfidf' (default)
- 'tf'
- 'binary'
This parameter determines the weight of each term in a document's
vector representation.
- [token_type], a choice of source for tokens:
- 'reviews and descriptions'
- 'reviews'
- 'attributes'
- 'descriptions'
This parameter determines which tokens the Document-Term Matrix
takes into account.
- [features] the number of features (aka columns) that the document
term matrix should have.
Outputs:
Creates a document-term matrix object [dtm] with attributes:
- A numpy array of size [(# trails in the data) x (# features)]
[mat] -> dtm.mat
- A string representing the term representation (or weights)
[term_rep] -> dtm.term_rep
- A string representing the source type for token retrieval
[token_type] -> dtm.token_type
- A number representing the number of tokens (aka features)
[features] -> dtm.features
- A list of length [features], the tokens that make up the columns in
[dtm]
[feature_names] -> dtm.feature_names
- The number of trails used to calculate the Tokens object
[num_trails] -> dtm.num_trails
- The inverted index used
[inv_idx] -> dtm.inv_idx
- Provided by the Tokens object, a dictionary where keys are indexes
for a particular trail, and values are lists containing the tokens
returned from calling [Tokens().tokens.per_trail[trail_idx]] for
some trail_idx
[toks_per_trials] -> dtm.toks_per_trail
Note: By default creates a tf-idf vector representation with [feature] features.
For a document-term matrix with binary weights for k-features, call
[DTMat("binary", k)].
"""
mat = None
term_rep = ''
token_type = ''
features = None
feature_names = []
num_trails = None
toks_per_trails = {}
inv_idx = None
def __init__(self, term_rep="tfidf", token_type="reviews and descriptions", features=100):
assert term_rep in ['tfidf', 'tf', 'binary']
assert token_type in ['reviews and descriptions',
'reviews',
'attributes',
'descriptions']
self.features = features
self.term_rep = term_rep
self.token_type = token_type
self.toks_per_trails = Tokens(token_type).tokens_per_trail
self.num_trails = len(self.toks_per_trails)
self.inv_idx = InvertedIndex(
token_type=self.token_type, vector_type='tfidf')
if term_rep == "tfidf":
self.mat = self._get_tfidf_mat(features)
if term_rep == "tf":
self.mat = self._get_tf_mat()
if term_rep == "binary":
self.mat = self._get_binary_mat()
# mat[i, j] := the tf-idf measure for the term j in document i
def _get_tfidf_mat(self, features):
vectorizer = TfidfVectorizer(
stop_words='english',
max_df=0.9,
min_df=10,
max_features=features)
mat = vectorizer.fit_transform(
[' '.join(self.toks_per_trails[trail])
for trail in self.toks_per_trails]).toarray()
self.feature_names = vectorizer.get_feature_names()
return mat
# mat[i, j] := the term frequency of the term j in document i
# Features are not implemented for this matrix, so all terms are
# weighed and diplayed
def _get_tf_mat(self):
toks = Tokens(self.token_type)
self.feature_names = toks.tokens
toks_to_idx = toks.tokens_to_idx
inv_idx = InvertedIndex(
token_type=self.token_type, vector_type='tf').inv_idx
mat = np.zeros([self.num_trails, len(toks.tokens)])
for tok in inv_idx:
for doc, frequency in inv_idx[tok]:
tok_idx = toks_to_idx[tok]
mat[doc, tok_idx] = frequency
return mat
# mat[i, j] := 1 if term j in document i | 0 otherwise
# Features are not implemented for this matrix, so all terms are
# weighed and displayed
def _get_binary_mat(self):
toks = Tokens(self.token_type)
self.feature_names = toks.tokens
toks_to_idx = toks.tokens_to_idx
inv_idx = InvertedIndex(
token_type=self.token_type, vector_type='binary').inv_idx
mat = np.zeros([self.num_trails, len(toks.tokens)])
for tok in inv_idx:
for doc in inv_idx[tok]:
tok_idx = toks_to_idx[tok]
mat[doc, tok_idx] = 1
return mat