from pyspark import SparkContext, SparkConf
from numpy import array
from math import sqrt

# https://spark.apache.org/docs/latest/mllib-clustering.html#k-means
from pyspark.mllib.clustering import KMeans, KMeansModel

sconf = SparkConf().setAppName("KMeansApp").setMaster("local[4]")
sc = SparkContext(conf=sconf)

# Load and parse the data: each line holds space-separated feature values
data = sc.textFile("/usr/local/spark/data/mllib/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# If you want to examine the contents of the parsedData RDD,
# you need to collect it from the cluster into a local list
print(parsedData.collect())

# Build the model (cluster the data) --> the model will contain the cluster centers
# train(rdd, k, maxIterations=100, runs=1, initializationMode='k-means||', seed=None,
#       initializationSteps=2, epsilon=0.0001, initialModel=None)
model = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random")

# Print the centers of the 2 clusters after training
print(model.centers)
# OR
print(model.clusterCenters)
# It will print: [array([ 0.1, 0.1, 0.1]), array([ 9.1, 9.1, 9.1])]

# Evaluate clustering by computing the Within Set Sum of Squared Errors
def error(point):
    center = model.centers[model.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
#model.save(sc, "KMeansModel")
#sameModel = KMeansModel.load(sc, "KMeansModel")
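
# A brief, optional usage sketch (an addition, not part of the original example):
# predict the cluster index for a new point, then shut down the SparkContext.
# The sample vector below is hypothetical; model.predict also accepts an RDD of vectors.
newPoint = array([9.0, 9.0, 9.0])
print("Cluster for new point: " + str(model.predict(newPoint)))

sc.stop()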