0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 Random Forest Regressor Example.
0020 """
0021 from __future__ import print_function
0022
0023
0024 from pyspark.ml import Pipeline
0025 from pyspark.ml.regression import RandomForestRegressor
0026 from pyspark.ml.feature import VectorIndexer
0027 from pyspark.ml.evaluation import RegressionEvaluator
0028
0029 from pyspark.sql import SparkSession
0030
if __name__ == "__main__":
    # Create (or reuse) the Spark session for this example.
    spark = (
        SparkSession
        .builder
        .appName("RandomForestRegressorExample")
        .getOrCreate()
    )

    # Everything after session creation runs under try/finally so that the
    # session is stopped even when one of the steps below raises.
    try:
        # Load the LIBSVM-format sample data into a DataFrame.
        data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

        # Fit the indexer on the full dataset. With maxCategories=4, features
        # having more than 4 distinct values are treated as continuous.
        featureIndexer = VectorIndexer(
            inputCol="features",
            outputCol="indexedFeatures",
            maxCategories=4,
        ).fit(data)

        # Randomly split with weights 0.7 / 0.3. No seed is passed, so the
        # split (and therefore the reported RMSE) varies between runs.
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # The regressor consumes the indexed feature column produced above.
        rf = RandomForestRegressor(featuresCol="indexedFeatures")

        # Chain the fitted indexer and the forest into one pipeline.
        pipeline = Pipeline(stages=[featureIndexer, rf])

        # Train on the training split only.
        model = pipeline.fit(trainingData)

        # Score the held-out split.
        predictions = model.transform(testData)

        # Display five example rows.
        predictions.select("prediction", "label", "features").show(5)

        # Compute the test error using the RMSE metric.
        evaluator = RegressionEvaluator(
            labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

        # Stage index 1 of the fitted pipeline corresponds to `rf`
        # (stages=[featureIndexer, rf]); print the fitted forest model.
        rfModel = model.stages[1]
        print(rfModel)
    finally:
        spark.stop()