examples/ml/JavaVectorAssemblerExample.java

0001 /*
0002  * Licensed to the Apache Software Foundation (ASF) under one or more
0003  * contributor license agreements.  See the NOTICE file distributed with
0004  * this work for additional information regarding copyright ownership.
0005  * The ASF licenses this file to You under the Apache License, Version 2.0
0006  * (the "License"); you may not use this file except in compliance with
0007  * the License.  You may obtain a copy of the License at
0008  *
0009  *    http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  */
0017
0018 package org.apache.spark.examples.ml;
0019
0020 import org.apache.spark.sql.SparkSession;
0021
0022 // $example on$
0023 import java.util.Arrays;
0024
0025 import org.apache.spark.ml.feature.VectorAssembler;
0026 import org.apache.spark.ml.linalg.VectorUDT;
0027 import org.apache.spark.ml.linalg.Vectors;
0028 import org.apache.spark.sql.Dataset;
0029 import org.apache.spark.sql.Row;
0030 import org.apache.spark.sql.RowFactory;
0031 import org.apache.spark.sql.types.*;
0032 import static org.apache.spark.sql.types.DataTypes.*;
0033 // $example off$
0034
0035 public class JavaVectorAssemblerExample {
0036   public static void main(String[] args) {
0037     SparkSession spark = SparkSession
0038       .builder()
0039       .appName("JavaVectorAssemblerExample")
0040       .getOrCreate();
0041
0042     // $example on$
0043     StructType schema = createStructType(new StructField[]{
0044       createStructField("id", IntegerType, false),
0045       createStructField("hour", IntegerType, false),
0046       createStructField("mobile", DoubleType, false),
0047       createStructField("userFeatures", new VectorUDT(), false),
0048       createStructField("clicked", DoubleType, false)
0049     });
0050     Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
0051     Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);
0052
0053     VectorAssembler assembler = new VectorAssembler()
0054       .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
0055       .setOutputCol("features");
0056
0057     Dataset<Row> output = assembler.transform(dataset);
0058     System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
0059         "'features'");
0060     output.select("features", "clicked").show(false);
0061     // $example off$
0062
0063     spark.stop();
0064   }
0065 }
0066