0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.examples.ml;
0019
0020 import org.apache.spark.sql.Dataset;
0021 import org.apache.spark.sql.SparkSession;
0022
0023
0024 import java.util.Arrays;
0025 import java.util.List;
0026
0027 import org.apache.spark.ml.attribute.Attribute;
0028 import org.apache.spark.ml.feature.IndexToString;
0029 import org.apache.spark.ml.feature.StringIndexer;
0030 import org.apache.spark.ml.feature.StringIndexerModel;
0031 import org.apache.spark.sql.Row;
0032 import org.apache.spark.sql.RowFactory;
0033 import org.apache.spark.sql.types.DataTypes;
0034 import org.apache.spark.sql.types.Metadata;
0035 import org.apache.spark.sql.types.StructField;
0036 import org.apache.spark.sql.types.StructType;
0037
0038
0039 public class JavaIndexToStringExample {
0040 public static void main(String[] args) {
0041 SparkSession spark = SparkSession
0042 .builder()
0043 .appName("JavaIndexToStringExample")
0044 .getOrCreate();
0045
0046
0047 List<Row> data = Arrays.asList(
0048 RowFactory.create(0, "a"),
0049 RowFactory.create(1, "b"),
0050 RowFactory.create(2, "c"),
0051 RowFactory.create(3, "a"),
0052 RowFactory.create(4, "a"),
0053 RowFactory.create(5, "c")
0054 );
0055 StructType schema = new StructType(new StructField[]{
0056 new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
0057 new StructField("category", DataTypes.StringType, false, Metadata.empty())
0058 });
0059 Dataset<Row> df = spark.createDataFrame(data, schema);
0060
0061 StringIndexerModel indexer = new StringIndexer()
0062 .setInputCol("category")
0063 .setOutputCol("categoryIndex")
0064 .fit(df);
0065 Dataset<Row> indexed = indexer.transform(df);
0066
0067 System.out.println("Transformed string column '" + indexer.getInputCol() + "' " +
0068 "to indexed column '" + indexer.getOutputCol() + "'");
0069 indexed.show();
0070
0071 StructField inputColSchema = indexed.schema().apply(indexer.getOutputCol());
0072 System.out.println("StringIndexer will store labels in output column metadata: " +
0073 Attribute.fromStructField(inputColSchema).toString() + "\n");
0074
0075 IndexToString converter = new IndexToString()
0076 .setInputCol("categoryIndex")
0077 .setOutputCol("originalCategory");
0078 Dataset<Row> converted = converter.transform(indexed);
0079
0080 System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " +
0081 "original string column '" + converter.getOutputCol() + "' using labels in metadata");
0082 converted.select("id", "categoryIndex", "originalCategory").show();
0083
0084
0085 spark.stop();
0086 }
0087 }