0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 An example for Chi-square hypothesis testing.
0020 Run with:
0021 bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
0022 """
0023 from __future__ import print_function
0024
0025 from pyspark.sql import SparkSession
0026
0027 from pyspark.ml.linalg import Vectors
0028 from pyspark.ml.stat import ChiSquareTest
0029
0030
if __name__ == "__main__":
    # Script entry point: obtain (or reuse) a session named after this example.
    spark = SparkSession \
        .builder \
        .appName("ChiSquareTestExample") \
        .getOrCreate()

    # Everything that uses the session lives inside try/finally so the
    # session is stopped even when building the DataFrame or running the
    # test raises.
    try:
        # Each tuple is one row: (label, dense feature vector with two entries).
        data = [(0.0, Vectors.dense(0.5, 10.0)),
                (0.0, Vectors.dense(1.5, 20.0)),
                (1.0, Vectors.dense(1.5, 30.0)),
                (0.0, Vectors.dense(3.5, 30.0)),
                (0.0, Vectors.dense(3.5, 40.0)),
                (1.0, Vectors.dense(3.5, 40.0))]
        df = spark.createDataFrame(data, ["label", "features"])

        # Run the test on the "features" column against the "label" column
        # and take the first row of the result; its pValues,
        # degreesOfFreedom and statistics fields are printed below.
        r = ChiSquareTest.test(df, "features", "label").head()
        print("pValues: " + str(r.pValues))
        print("degreesOfFreedom: " + str(r.degreesOfFreedom))
        print("statistics: " + str(r.statistics))
    finally:
        # Always release the session, on success and on failure alike.
        spark.stop()