0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 Correlations using MLlib.
0020 """
0021 from __future__ import print_function
0022
0023 import sys
0024
0025 from pyspark import SparkContext
0026 from pyspark.mllib.regression import LabeledPoint
0027 from pyspark.mllib.stat import Statistics
0028 from pyspark.mllib.util import MLUtils
0029
0030
if __name__ == "__main__":
    # Accept zero or one CLI argument (an optional LibSVM data file).
    if len(sys.argv) not in [1, 2]:
        print("Usage: correlations (<file>)", file=sys.stderr)
        sys.exit(-1)
    sc = SparkContext(appName="PythonCorrelations")
    if len(sys.argv) == 2:
        filepath = sys.argv[1]
    else:
        filepath = 'data/mllib/sample_linear_regression_data.txt'
    corrType = 'pearson'

    # Load LibSVM-formatted points and densify each feature vector so the
    # per-index access in the loop below works for sparse inputs too.
    points = MLUtils.loadLibSVMFile(sc, filepath)\
        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

    print()
    print('Summary of data file: ' + filepath)
    print('%d data points' % points.count())

    # Pearson correlation between the label and each individual feature.
    print()
    print('Correlation (%s) between label and each feature' % corrType)
    print('Feature\tCorrelation')
    numFeatures = points.take(1)[0].features.size
    labelRDD = points.map(lambda lp: lp.label)
    for i in range(numFeatures):
        # Bind i as a default argument: Python closures capture variables by
        # reference and RDD transformations are lazy, so without `idx=i` a
        # deferred evaluation of this lambda could observe a later value of i
        # (the classic late-binding closure pitfall).
        featureRDD = points.map(lambda lp, idx=i: lp.features[idx])
        corr = Statistics.corr(labelRDD, featureRDD, corrType)
        print('%d\t%g' % (i, corr))
    print()

    sc.stop()