0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.examples.ml;
0019
0020 import org.apache.spark.sql.SparkSession;
0021
0022
0023 import java.util.Map;
0024
0025 import org.apache.spark.ml.feature.VectorIndexer;
0026 import org.apache.spark.ml.feature.VectorIndexerModel;
0027 import org.apache.spark.sql.Dataset;
0028 import org.apache.spark.sql.Row;
0029
0030
0031 public class JavaVectorIndexerExample {
0032 public static void main(String[] args) {
0033 SparkSession spark = SparkSession
0034 .builder()
0035 .appName("JavaVectorIndexerExample")
0036 .getOrCreate();
0037
0038
0039 Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
0040
0041 VectorIndexer indexer = new VectorIndexer()
0042 .setInputCol("features")
0043 .setOutputCol("indexed")
0044 .setMaxCategories(10);
0045 VectorIndexerModel indexerModel = indexer.fit(data);
0046
0047 Map<Integer, Map<Double, Integer>> categoryMaps = indexerModel.javaCategoryMaps();
0048 System.out.print("Chose " + categoryMaps.size() + " categorical features:");
0049
0050 for (Integer feature : categoryMaps.keySet()) {
0051 System.out.print(" " + feature);
0052 }
0053 System.out.println();
0054
0055
0056 Dataset<Row> indexedData = indexerModel.transform(data);
0057 indexedData.show();
0058
0059 spark.stop();
0060 }
0061 }