python/ml/fm_regressor_example.py

0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
0018 """
0019 FMRegressor Example.
0020 """
0021 from __future__ import print_function
0022
0023 # $example on$
0024 from pyspark.ml import Pipeline
0025 from pyspark.ml.regression import FMRegressor
0026 from pyspark.ml.feature import MinMaxScaler
0027 from pyspark.ml.evaluation import RegressionEvaluator
0028 # $example off$
0029 from pyspark.sql import SparkSession
0030
if __name__ == "__main__":
    # Obtain (or reuse) the SparkSession that drives this example.
    spark = SparkSession \
        .builder \
        .appName("FMRegressorExample") \
        .getOrCreate()

    # Run the whole example under try/finally so the session is stopped even
    # when loading, training or evaluation raises.
    try:
        # $example on$
        # Load and parse the data file, converting it to a DataFrame.
        data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

        # Scale features.
        # NOTE(review): the scaler is fit on the full dataset, i.e. before the
        # train/test split below, so test rows contribute to the scaling
        # range. Pass the unfitted MinMaxScaler to the Pipeline instead if
        # strict train/test separation is wanted -- confirm intent.
        featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(data)

        # Split the data into training and test sets (30% held out for testing).
        # No seed is passed to randomSplit here.
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Configure the FM regressor to read the scaled feature column.
        fm = FMRegressor(featuresCol="scaledFeatures", stepSize=0.001)

        # Create a Pipeline: stage 0 is the fitted scaler, stage 1 is the regressor.
        pipeline = Pipeline(stages=[featureScaler, fm])

        # Train model.
        model = pipeline.fit(trainingData)

        # Make predictions.
        predictions = model.transform(testData)

        # Select example rows to display.
        predictions.select("prediction", "label", "features").show(5)

        # Select (prediction, true label) and compute test error.
        evaluator = RegressionEvaluator(
            labelCol="label", predictionCol="prediction", metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

        # Index 1 matches the position of `fm` in the Pipeline stages above.
        fmModel = model.stages[1]
        print("Factors: " + str(fmModel.factors))
        print("Linear: " + str(fmModel.linear))
        print("Intercept: " + str(fmModel.intercept))
        # $example off$
    finally:
        # Always stop the session, on success and on failure alike.
        spark.stop()