python/ml/fm_classifier_example.py

0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
0018 """
0019 FMClassifier Example.
0020 """
0021 from __future__ import print_function
0022
0023 # $example on$
0024 from pyspark.ml import Pipeline
0025 from pyspark.ml.classification import FMClassifier
0026 from pyspark.ml.feature import MinMaxScaler, StringIndexer
0027 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
0028 # $example off$
0029 from pyspark.sql import SparkSession
0030
if __name__ == "__main__":
    # Get (or create) the Spark session used by this example.
    spark = (
        SparkSession.builder
        .appName("FMClassifierExample")
        .getOrCreate()
    )

    # $example on$
    # Read the libsvm-formatted sample file into a DataFrame.
    data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    # Label indexing: "label" -> "indexedLabel".
    # The indexer is fitted on the complete dataset so that every label
    # value is present in the resulting index.
    labelIndexerEstimator = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndexerModel = labelIndexerEstimator.fit(data)

    # Feature scaling: "features" -> "scaledFeatures" (also fitted on the complete dataset).
    featureScalerEstimator = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    featureScalerModel = featureScalerEstimator.fit(data)

    # Hold out 30% of the rows for testing; the remaining 70% are used for training.
    splits = data.randomSplit([0.7, 0.3])
    trainDF, testDF = splits

    # Factorization machine classifier working on the indexed labels and scaled features.
    fmClassifier = FMClassifier(
        labelCol="indexedLabel",
        featuresCol="scaledFeatures",
        stepSize=0.001,
    )

    # Chain the fitted indexer, the fitted scaler and the classifier into one Pipeline.
    fmPipeline = Pipeline(stages=[labelIndexerModel, featureScalerModel, fmClassifier])

    # Fit the pipeline on the training split.
    pipelineModel = fmPipeline.fit(trainDF)

    # Apply the fitted pipeline to the held-out split.
    predicted = pipelineModel.transform(testDF)

    # Show a handful of rows: predicted value, indexed label and the raw features.
    predicted.select("prediction", "indexedLabel", "features").show(5)

    # Compare predictions against the indexed labels and report accuracy on the test split.
    accuracyEvaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel",
        predictionCol="prediction",
        metricName="accuracy",
    )
    testAccuracy = accuracyEvaluator.evaluate(predicted)
    print("Test set accuracy = %g" % testAccuracy)

    # The classifier is the final (third) pipeline stage; print its fitted parameters.
    fittedFM = pipelineModel.stages[-1]
    print("Factors: " + str(fittedFM.factors))
    print("Linear: " + str(fittedFM.linear))
    print("Intercept: " + str(fittedFM.intercept))
    # $example off$

    spark.stop()