0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 An example of computing a correlation matrix.
0020 Run with:
0021 bin/spark-submit examples/src/main/python/ml/correlation_example.py
0022 """
0023 from __future__ import print_function
0024
0025
0026 from pyspark.ml.linalg import Vectors
0027 from pyspark.ml.stat import Correlation
0028
0029 from pyspark.sql import SparkSession
0030
if __name__ == "__main__":
    # Script entry point: build a tiny DataFrame of feature vectors and print
    # its correlation matrix computed two ways (default method, then
    # "spearman").
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # try/finally guarantees the session is stopped even when building the
    # DataFrame or computing a correlation raises; previously stop() was only
    # reached on the success path.
    try:
        # Four 4-dimensional feature vectors, mixing sparse and dense forms.
        # Each row is a 1-tuple so the DataFrame gets a single "features"
        # column.
        data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
                (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
                (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
                (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
        df = spark.createDataFrame(data, ["features"])

        # Method argument omitted: per the PySpark docs the default is
        # Pearson. head() returns the first row; its first field is the
        # matrix that gets printed.
        r1 = Correlation.corr(df, "features").head()
        print("Pearson correlation matrix:\n" + str(r1[0]))

        # Same computation with the method passed explicitly as "spearman".
        r2 = Correlation.corr(df, "features", "spearman").head()
        print("Spearman correlation matrix:\n" + str(r2[0]))
    finally:
        spark.stop()