-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclustering.py
40 lines (23 loc) · 888 Bytes
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
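'''
Cluster documents with k-means on their LSI representation.

Input: an array of strings (the documents) and the number of clusters
Output: a list of clusters, each one a list of the indices assigned to it
'''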
from sklearn.cluster import KMeans
import numpy as np
import lsa
def clusterthedocs(stringArray, clusterNo):
    """Group documents into clusters by running k-means on their LSI vectors.

    Args:
        stringArray: The documents to cluster; passed unchanged to
            ``lsa.createLsiCorpus``.
        clusterNo: Number of clusters to fit.

    Returns:
        A list of ``clusterNo`` lists. The list at position ``c`` holds the
        row indices of the LSI matrix that k-means assigned to cluster ``c``
        (presumably one row per input document -- confirm against
        ``lsa.createLsiCorpus``).
    """
    # Build the LSI representation and turn it into a matrix for scikit-learn.
    lsi_matrix = np.array(lsa.createLsiCorpus(stringArray))

    # NOTE: the former ``precompute_distances='auto'`` argument is omitted on
    # purpose: it was the default, was deprecated in scikit-learn 0.23 and
    # removed in 1.0, where passing it raises TypeError.
    # ``n_init=10`` pins the historical default so results do not change on
    # releases where the default became 'auto'.
    kmeans_model = KMeans(
        n_clusters=clusterNo,
        n_init=10,
        random_state=0,
    ).fit(lsi_matrix)

    # One bucket per cluster; each row index goes into its label's bucket.
    cluster_list = [[] for _ in range(clusterNo)]
    for row_index, label in enumerate(kmeans_model.labels_.tolist()):
        cluster_list[label].append(row_index)
    return cluster_list
# stringArray = ['this is designed for evaluating tfidf gibberish','run for tdidf','stuff is old', 'old stuff is great', 'tfidf sucks suckss sss yeydh']
# clusterthedocs(stringArray, 3)