1. 程式人生 > >python spark kmeans demo

python spark kmeans demo

app import urn lib tex oop return clas lin

官方的demo

from numpy import array
from math import sqrt

from pyspark import SparkContext

from pyspark.mllib.clustering import KMeans, KMeansModel

sc = SparkContext(appName="clusteringExample")
# Load and parse the data
data = sc.textFile("/root/spark-2.1.1-bin-hadoop2.6/data/mllib/kmeans_data.txt")
parsedData 
= data.map(lambda line: array([float(x) for x in line.split( )])) # Build the model (cluster the data) clusters = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="random") # Evaluate clustering by computing Within Set Sum of Squared Errors def error(point): center = clusters.centers[clusters.predict(point)]
return sqrt(sum([x**2 for x in (point - center)])) WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y) print("Within Set Sum of Squared Error = " + str(WSSSE)) # Save and load model #clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel") #sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")

python spark kmeans demo