examples/ml/JavaIndexToStringExample.java

0001 /*
0002  * Licensed to the Apache Software Foundation (ASF) under one or more
0003  * contributor license agreements.  See the NOTICE file distributed with
0004  * this work for additional information regarding copyright ownership.
0005  * The ASF licenses this file to You under the Apache License, Version 2.0
0006  * (the "License"); you may not use this file except in compliance with
0007  * the License.  You may obtain a copy of the License at
0008  *
0009  *    http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  */
0017
0018 package org.apache.spark.examples.ml;
0019
0020 import org.apache.spark.sql.Dataset;
0021 import org.apache.spark.sql.SparkSession;
0022
0023 // $example on$
0024 import java.util.Arrays;
0025 import java.util.List;
0026
0027 import org.apache.spark.ml.attribute.Attribute;
0028 import org.apache.spark.ml.feature.IndexToString;
0029 import org.apache.spark.ml.feature.StringIndexer;
0030 import org.apache.spark.ml.feature.StringIndexerModel;
0031 import org.apache.spark.sql.Row;
0032 import org.apache.spark.sql.RowFactory;
0033 import org.apache.spark.sql.types.DataTypes;
0034 import org.apache.spark.sql.types.Metadata;
0035 import org.apache.spark.sql.types.StructField;
0036 import org.apache.spark.sql.types.StructType;
0037 // $example off$
0038
0039 public class JavaIndexToStringExample {
0040   public static void main(String[] args) {
0041     SparkSession spark = SparkSession
0042       .builder()
0043       .appName("JavaIndexToStringExample")
0044       .getOrCreate();
0045
0046     // $example on$
0047     List<Row> data = Arrays.asList(
0048       RowFactory.create(0, "a"),
0049       RowFactory.create(1, "b"),
0050       RowFactory.create(2, "c"),
0051       RowFactory.create(3, "a"),
0052       RowFactory.create(4, "a"),
0053       RowFactory.create(5, "c")
0054     );
0055     StructType schema = new StructType(new StructField[]{
0056       new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
0057       new StructField("category", DataTypes.StringType, false, Metadata.empty())
0058     });
0059     Dataset<Row> df = spark.createDataFrame(data, schema);
0060
0061     StringIndexerModel indexer = new StringIndexer()
0062       .setInputCol("category")
0063       .setOutputCol("categoryIndex")
0064       .fit(df);
0065     Dataset<Row> indexed = indexer.transform(df);
0066
0067     System.out.println("Transformed string column '" + indexer.getInputCol() + "' " +
0068         "to indexed column '" + indexer.getOutputCol() + "'");
0069     indexed.show();
0070
0071     StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
0072     System.out.println("StringIndexer will store labels in output column metadata: " +
0073         Attribute.fromStructField(inputColSchema).toString() + "\n");
0074
0075     IndexToString converter = new IndexToString()
0076       .setInputCol("categoryIndex")
0077       .setOutputCol("originalCategory");
0078     Dataset<Row> converted = converter.transform(indexed);
0079
0080     System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " +
0081         "original string column '" + converter.getOutputCol() + "' using labels in metadata");
0082     converted.select("id", "categoryIndex", "originalCategory").show();
0083
0084     // $example off$
0085     spark.stop();
0086   }
0087 }