-
Notifications
You must be signed in to change notification settings - Fork 4
/
method_clustering.py
182 lines (145 loc) · 5.91 KB
/
method_clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import decomposition
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
from pyhelpers import tools
#import matplotlib.pyplot as plt
import numpy as np
import _pickle as pkl
import config as cfg
def is_int_or_float(s):
    """Classify a value as an integer or float literal.

    Returns:
        1  if s parses as an integer,
        2  if s parses as a float (including exponent notation like '1e5'),
        -1 if s is not a number at all.
    """
    # Try the stricter integer parse first so '3' -> 1 but '3.5'/'1e5' -> 2.
    try:
        int(s)
        return 1
    except (ValueError, TypeError):
        pass
    try:
        float(s)
        return 2
    except (ValueError, TypeError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return -1
def hasNumbers(inputString):
    """Return True if inputString contains at least one decimal digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
def facet_embedding(db):
    """Build one 'document' string per qualifying paper from its method NEs.

    A paper qualifies when its booktitle appears in cfg.booktitles or its
    journal appears in cfg.journals. For each qualifying paper, the distinct
    named entities labelled 'method' (and not present in WordNet) are joined
    into a single space-separated string; digit-free entities have internal
    spaces removed so multi-word method names become one token.

    Args:
        db: MongoDB database handle exposing 'sentences_ner' and
            'publications' collections.

    Returns:
        list[str]: one concatenated NE string per qualifying paper
        (each with a trailing space, as consumed by the vectorizer).
    """
    docs = []
    for paper_id in db.sentences_ner.distinct('paper_id'):
        paper = db.publications.find_one({'_id': paper_id})
        if paper is None:
            # Paper referenced in sentences_ner but missing from
            # publications; previously this raised a TypeError.
            continue
        is_conf = 'booktitle' in paper and paper['booktitle'] in cfg.booktitles
        is_journal = 'journal' in paper and paper['journal'] in cfg.journals
        if not (is_conf or is_journal):
            continue
        ners = db.sentences_ner.distinct(
            'ner',
            {'paper_id': paper_id,
             'multiLabel_cls': {'$in': ['method']},
             'inWordnet': 0})
        parts = []
        for ne in ners:
            if not hasNumbers(ne):
                # Collapse multi-word method names into a single token.
                ne = ne.replace(' ', '')
            parts.append(str(ne) + ' ')
        # join() instead of repeated += (avoids quadratic concatenation).
        docs.append(''.join(parts))
    return docs
#The matplot give error on the server!!
def calculate_s_scores(X, min_k, max_k):
    """Fit KMeans for each k in [min_k, max_k) and log silhouette scores.

    Writes one 'k,score' line per k to a CSV under cfg.folder_culsters.
    Note: max_k is exclusive (Python range semantics).

    Args:
        X: feature matrix to cluster (e.g. the SVD-reduced term matrix).
        min_k: first cluster count to evaluate (inclusive).
        max_k: last cluster count bound (exclusive).
    """
    out_path = cfg.folder_culsters + \
        "silhouette_scores_between_multilabel_{}_{}.csv".format(min_k, max_k)
    with open(out_path, 'w', encoding="UTF-8") as f:
        for n_clusters in range(min_k, max_k):
            kmeans = KMeans(n_clusters=n_clusters, init='k-means++',
                            max_iter=100, n_init=1)
            kmeans.fit(X)
            score = silhouette_score(X, kmeans.labels_)
            f.write("{},{}\n".format(n_clusters, score))
def write_clusters(X, k_values, svd, vectorizer):
    """Cluster X with KMeans for each k and dump the top terms per cluster.

    For each k: fits KMeans, pickles the fitted model, the SVD decomposer
    and the vectorizer, then writes the top-40 terms of every cluster
    (ranked by the centroid mapped back into the original term space) to a
    CSV under cfg.folder_culsters.

    Args:
        X: matrix in the reduced (SVD) space to cluster.
        k_values: iterable of cluster counts to evaluate.
        svd: fitted TruncatedSVD used to invert centroids to term space.
        vectorizer: fitted vectorizer supplying the term vocabulary.
    """
    print()
    print("Write results in Methods_Clusters_DataPipelines.csv ")
    # The vocabulary is the same for every k; look it up once instead of
    # once per iteration.
    terms = vectorizer.get_feature_names()
    with open(cfg.folder_culsters + "Methods_Clusters_DataPipelines_multilabel.csv", 'w', encoding="UTF-8") as f:
        for k in k_values:
            km = KMeans(n_clusters=k, init='k-means++', max_iter=100,
                        n_init=1, verbose=False)
            km.fit(X)
            # Persist the fitted model so clusters can be reloaded later.
            with open(cfg.folder_pickle + 'k_means_methods_multilabel_DataPipelines_{}.pkl'.format(k), 'wb') as fid:
                pkl.dump(km, fid)
            # Rank terms per cluster by centroid weight in the original
            # (pre-SVD) term space.
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
            with open(cfg.folder_pickle + 'svd_multilabel_DataPipelines_{}.pkl'.format(k), 'wb') as decom:
                pkl.dump(svd, decom)
            with open(cfg.folder_pickle + 'vectorizer_multilabel_DataPipelines_{}.pkl'.format(k), 'wb') as vec:
                pkl.dump(vectorizer, vec)
            f.write("Top terms with {} clusters".format(k))
            f.write("\n")
            for i in range(len(order_centroids)):
                f.write("Annotate Here,Cluster {}:".format(i))
                for ind in order_centroids[i, :40]:
                    f.write(',{}'.format(terms[ind]))
                f.write("\n")
            f.write("\n")
def main():
    """Pipeline entry point: NE documents -> vectorize -> SVD -> KMeans.

    Steps: (1) collect method named entities per qualifying publication,
    (2) count-vectorize them, (3) build the term co-occurrence matrix,
    (4) reduce it with truncated SVD + normalization (LSA), (5) run KMeans
    and write each cluster's top-40 terms to a CSV, pickling intermediates
    along the way.
    """
    db = tools.connect_to_mongo()
    print("Collect list of NEE per publication")
    documents = facet_embedding(db)
    print()
    print("Create Count Vectorizer")
    vectorizer = CountVectorizer(ngram_range=(1,1))
    print()
    print("Fit documents tfidfVectorixaer")
    X = vectorizer.fit_transform(documents)
    # Term-term co-occurrence matrix (vocab x vocab): clustering operates on
    # terms' co-occurrence profiles rather than on the raw doc-term matrix.
    Xc = (X.T * X)
    #len(vectorizer.get_feature_names())
    print()
    print("Create SVD pipeline with {} components and normalization".format(900))
    svd = decomposition.TruncatedSVD(n_components=900, n_iter=5)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    print()
    print("Fit SVD+Normalization")
    X = lsa.fit_transform(Xc)
    # Persist the reduced matrix so later runs can skip the expensive SVD fit.
    with open(cfg.folder_pickle + 'PCA_fitted_Data_Robots.pkl', 'wb') as data:
        pkl.dump(X, data)
    with open(cfg.folder_culsters + "Methods_Clusters_Data_multilabel.csv", 'w', encoding="UTF-8") as f:
        # NOTE(review): only k=28 is evaluated here; widen the range to sweep k.
        for k in range(28,29):
            print("K-means: {}".format(k))
            km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, verbose=False)
            km.fit(X)
            # save the classifier
            with open(cfg.folder_pickle + 'k_means_methods_multilabel_Data_{}.pkl'.format(k), 'wb') as fid:
                pkl.dump(km, fid)
            # Map centroids back into the original term space so the top
            # indices correspond to vocabulary terms.
            original_space_centroids = svd.inverse_transform(km.cluster_centers_)
            order_centroids = original_space_centroids.argsort()[:, ::-1]
            #order_centroids = km.cluster_centers_.argsort()[:, ::-1]
            terms = vectorizer.get_feature_names()
            f.write("Top terms with {} clusters".format(k))
            f.write("\n")
            for i in range(len(order_centroids)):
                f.write("Annotate Here,Cluster {}:".format(i))
                for ind in order_centroids[i, :40]:
                    f.write(',{}'.format(terms[ind]))
                f.write("\n")
            f.write("\n")
    #explained_variance = svd.explained_variance_ratio_.sum()
    #print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))
if __name__ == '__main__':
    main()