0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 Linear Regression With SGD Example.
0020 """
0021 from __future__ import print_function
0022
0023 from pyspark import SparkContext
0024
0025 from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
0026
0027
if __name__ == "__main__":

    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")

    def parsePoint(line):
        """Convert one text line into a ``LabeledPoint``.

        Commas are treated as whitespace, the line is split into tokens, and
        every token is parsed as a float.  The first value becomes the label
        and the remaining values become the feature vector.

        Raises ``ValueError`` if a token is not a valid float and
        ``IndexError`` if the line contains no tokens at all.
        """
        # split() with no argument collapses runs of whitespace and ignores
        # leading/trailing blanks, so stray spaces never yield an empty token
        # that float() would reject.
        values = [float(x) for x in line.replace(',', ' ').split()]
        return LabeledPoint(values[0], values[1:])

    # Always release the SparkContext, even when a step below fails.
    try:
        # Load and parse the data.
        data = sc.textFile("data/mllib/ridge-data/lpsa.data")
        parsedData = data.map(parsePoint)

        # Build the model.
        model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)

        # Evaluate the model on the training data: pair each true label with
        # the model's prediction, then average the squared differences.
        valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
        # mean() aggregates in a single pass instead of evaluating the
        # prediction RDD once for the sum and again for the count.
        MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).mean()
        print("Mean Squared Error = " + str(MSE))

        # Save the model, then load it back from the same location.
        modelPath = "target/tmp/pythonLinearRegressionWithSGDModel"
        model.save(sc, modelPath)
        sameModel = LinearRegressionModel.load(sc, modelPath)
    finally:
        sc.stop()
0054