0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.examples.ml;
0019
0020 import org.apache.spark.sql.SparkSession;
0021
0022
0023 import java.util.Arrays;
0024
0025 import org.apache.spark.ml.feature.VectorAssembler;
0026 import org.apache.spark.ml.linalg.VectorUDT;
0027 import org.apache.spark.ml.linalg.Vectors;
0028 import org.apache.spark.sql.Dataset;
0029 import org.apache.spark.sql.Row;
0030 import org.apache.spark.sql.RowFactory;
0031 import org.apache.spark.sql.types.*;
0032 import static org.apache.spark.sql.types.DataTypes.*;
0033
0034
0035 public class JavaVectorAssemblerExample {
0036 public static void main(String[] args) {
0037 SparkSession spark = SparkSession
0038 .builder()
0039 .appName("JavaVectorAssemblerExample")
0040 .getOrCreate();
0041
0042
0043 StructType schema = createStructType(new StructField[]{
0044 createStructField("id", IntegerType, false),
0045 createStructField("hour", IntegerType, false),
0046 createStructField("mobile", DoubleType, false),
0047 createStructField("userFeatures", new VectorUDT(), false),
0048 createStructField("clicked", DoubleType, false)
0049 });
0050 Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
0051 Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);
0052
0053 VectorAssembler assembler = new VectorAssembler()
0054 .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
0055 .setOutputCol("features");
0056
0057 Dataset<Row> output = assembler.transform(dataset);
0058 System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
0059 "'features'");
0060 output.select("features", "clicked").show(false);
0061
0062
0063 spark.stop();
0064 }
0065 }
0066