-
Notifications
You must be signed in to change notification settings - Fork 11
/
part_b_mllib.py
39 lines (30 loc) · 1.26 KB
/
part_b_mllib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from pyspark import SparkContext
from numpy import array
from pyspark.mllib.clustering import KMeans, KMeansModel
############################################
#### PLEASE USE THE GIVEN PARAMETERS ###
#### FOR TRAINING YOUR KMEANS CLUSTERING ###
#### MODEL ###
############################################
# Fixed assignment parameters for KMeans training (must not be changed,
# per the banner above).
NUM_CLUSTERS = 4  # number of clusters (k)
SEED = 0  # RNG seed so clustering is reproducible
MAX_ITERATIONS = 100  # upper bound on KMeans iterations
INITIALIZATION_MODE = "random"  # centroid initialization strategy
# NOTE(review): building the SparkContext at import time is a module-level
# side effect — merely importing this file starts Spark. Fine for a
# standalone script; confirm nothing else imports this module.
sc = SparkContext()
def get_clusters(data_rdd, num_clusters=NUM_CLUSTERS, max_iterations=MAX_ITERATIONS,
                 initialization_mode=INITIALIZATION_MODE, seed=SEED):
    """Cluster cars by their feature vectors and group their titles.

    Args:
        data_rdd: RDD of (title, features) pairs, where features is a
            numeric vector (e.g. a numpy array) describing one car.
            (Assumed shape of the records — confirm against the parser
            in __main__.)
        num_clusters: number of clusters (k) for KMeans.
        max_iterations: cap on KMeans iterations.
        initialization_mode: centroid initialization strategy
            ("random" or "k-means||").
        seed: RNG seed for reproducible clustering.

    Returns:
        A list of lists of titles; titles in the same inner list were
        assigned the same cluster id. For example,
        [["Mercedes", "Audi"], ["Honda", "Hyundai"]] means "Mercedes"
        and "Audi" share one cluster and "Honda" and "Hyundai" another.
    """
    # Train only on the feature vectors; titles are carried separately.
    model = KMeans.train(
        data_rdd.map(lambda pair: pair[1]),
        num_clusters,
        maxIterations=max_iterations,
        initializationMode=initialization_mode,
        seed=seed,
    )
    # Key each title by its predicted cluster id, then collect the
    # titles belonging to each cluster into one list per cluster.
    labeled = data_rdd.map(lambda pair: (model.predict(pair[1]), pair[0]))
    grouped = labeled.groupByKey().mapValues(list).collect()
    return [titles for _, titles in grouped]
if __name__ == "__main__":
    f = sc.textFile("dataset/cars.data")

    def parse_line(line):
        # Assumed file format: comma-separated, first field is the car
        # title and the remaining fields are numeric features — TODO
        # confirm against dataset/cars.data.
        fields = line.strip().split(',')
        return (fields[0], array([float(x) for x in fields[1:]]))

    # Skip blank lines, then parse each record into (title, features).
    data_rdd = f.filter(lambda line: line.strip()).map(parse_line)
    clusters = get_clusters(data_rdd)
    for cluster in clusters:
        print(','.join(cluster))