# Clustering.py
# K-means clustering of the 'CongregantUsers' column of ACST_Cust_Sum.csv,
# followed by an elbow plot (SSE versus number of clusters).
import math
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
def get_partitions(km_obj: KMeans, feature: pd.Series):
    """Return the smallest feature value assigned to each cluster.

    Args:
        km_obj: A fitted estimator exposing ``labels_`` (one cluster label per
            fitted sample) and ``n_clusters``.
        feature: The values the estimator was fitted on, in the same order as
            ``km_obj.labels_``.

    Returns:
        A list of length ``km_obj.n_clusters`` whose entry ``i`` is the minimum
        of the values labelled ``i``. Entries are ordered by cluster label,
        not by value.

    Raises:
        ValueError: If ``feature`` and ``km_obj.labels_`` differ in length, or
            if at least one cluster received no values.
    """
    labels = km_obj.labels_
    if len(labels) != len(feature):
        raise ValueError(
            f"feature has {len(feature)} values but the model has "
            f"{len(labels)} labels; pass the same data the model was fitted on"
        )

    # Pair every value with its own label by position. Going through a
    # value -> label dict instead would let the last label win for repeated
    # values and would fail to look up NaN (NaN != NaN).
    clusters = [[] for _ in range(km_obj.n_clusters)]
    for val, cluster_label in zip(feature.to_numpy(), labels):
        clusters[cluster_label].append(val)

    empty_labels = [i for i, members in enumerate(clusters) if not members]
    if empty_labels:
        # min() of an empty list would raise an opaque ValueError; say which
        # clusters are the problem instead.
        raise ValueError(f"no values were assigned to cluster(s) {empty_labels}")

    return [min(members) for members in clusters]
# Load the customer summary and cluster the 'CongregantUsers' column.
df = pd.read_csv('ACST_Cust_Sum.csv')

# Discard missing entries. dropna() does this without calling math.isnan on
# every element, which raises TypeError for non-numeric values.
congregant_users = df.loc[:, 'CongregantUsers'].dropna()

# Reshape the single feature into one column (n_samples x 1) for KMeans.
feat = congregant_users.to_numpy().reshape(-1, 1)

kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    # No fixed seed: cluster assignments and SSE can differ between runs.
    "random_state": None,
}

# Fit a 4-cluster model and print the lowest value in each cluster.
kmeans = KMeans(n_clusters=4, **kmeans_kwargs)
kmeans.fit(feat)
print(get_partitions(kmeans, congregant_users))

# Elbow analysis: record the SSE (inertia) for each candidate cluster count.
k_values = range(1, 11)
sse = []
for k in k_values:
    # Separate name so the fitted 4-cluster model above is not overwritten.
    candidate = KMeans(n_clusters=k, **kmeans_kwargs)
    candidate.fit(feat)
    sse.append(candidate.inertia_)

plt.style.use("fivethirtyeight")
plt.plot(k_values, sse)
plt.xticks(k_values)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.title('CongregantUsers')
plt.show()