0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
0019 from pyspark.mllib.evaluation import RegressionMetrics
0020 from pyspark.mllib.linalg import DenseVector
0021
0022
0023 from pyspark import SparkContext
0024
if __name__ == "__main__":
    sc = SparkContext(appName="Regression Metrics Example")

    def parsePoint(line):
        """Convert one whitespace-separated text line into a LabeledPoint.

        The first token is parsed as the label. Every remaining token is
        split on ':' and only the part after the colon is kept as a feature
        value; the part before the colon is discarded, so the feature order
        in the resulting vector follows the token order in the line.
        """
        values = line.split()
        features = DenseVector([float(x.split(':')[1]) for x in values[1:]])
        return LabeledPoint(float(values[0]), features)

    # Everything that uses the context runs inside try/finally so the
    # SparkContext is always stopped, even when loading, training or
    # evaluation raises.
    try:
        # Load and parse the data
        data = sc.textFile("data/mllib/sample_linear_regression_data.txt")
        parsedData = data.map(parsePoint)

        # Fit a linear regression model on the parsed points
        model = LinearRegressionWithSGD.train(parsedData)

        # Pair each prediction with its true label, prediction first
        valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label))

        # Instantiate the metrics object from the (prediction, label) pairs
        metrics = RegressionMetrics(valuesAndPreds)

        # Squared error
        print("MSE = %s" % metrics.meanSquaredError)
        print("RMSE = %s" % metrics.rootMeanSquaredError)

        # R-squared
        print("R-squared = %s" % metrics.r2)

        # Mean absolute error
        print("MAE = %s" % metrics.meanAbsoluteError)

        # Explained variance
        print("Explained variance = %s" % metrics.explainedVariance)
    finally:
        sc.stop()
0059