0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 An example demonstrating k-means clustering.
0020 Run with:
0021 bin/spark-submit examples/src/main/python/ml/kmeans_example.py
0022
0023 This example requires NumPy (http://www.numpy.org/).
0024 """
0025 from __future__ import print_function
0026
0027
0028 from pyspark.ml.clustering import KMeans
0029 from pyspark.ml.evaluation import ClusteringEvaluator
0030
0031
0032 from pyspark.sql import SparkSession
0033
if __name__ == "__main__":
    # Obtain the Spark session for this example (getOrCreate reuses an
    # existing session if one is already active).
    spark = SparkSession\
        .builder\
        .appName("KMeansExample")\
        .getOrCreate()

    # Everything that uses the session runs inside try/finally so that the
    # session is stopped even when loading, fitting or evaluation fails.
    try:
        # Load the sample data set, stored in libsvm format.
        dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")

        # Train a k-means model with k=2; the fixed seed keeps runs repeatable.
        kmeans = KMeans().setK(2).setSeed(1)
        model = kmeans.fit(dataset)

        # Assign each input row to a cluster.
        predictions = model.transform(dataset)

        # Score the clustering with an evaluator left at its default settings.
        evaluator = ClusteringEvaluator()

        silhouette = evaluator.evaluate(predictions)
        print("Silhouette with squared euclidean distance = " + str(silhouette))

        # Print the center of every cluster found by the model.
        centers = model.clusterCenters()
        print("Cluster Centers: ")
        for center in centers:
            print(center)
    finally:
        # Always release the session, on success and on failure alike.
        spark.stop()