0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.examples.ml;
0019
0020
0021 import java.util.Arrays;
0022 import java.util.List;
0023
0024 import org.apache.spark.ml.feature.CountVectorizer;
0025 import org.apache.spark.ml.feature.CountVectorizerModel;
0026 import org.apache.spark.sql.Dataset;
0027 import org.apache.spark.sql.Row;
0028 import org.apache.spark.sql.RowFactory;
0029 import org.apache.spark.sql.SparkSession;
0030 import org.apache.spark.sql.types.*;
0031
0032
0033 public class JavaCountVectorizerExample {
0034 public static void main(String[] args) {
0035 SparkSession spark = SparkSession
0036 .builder()
0037 .appName("JavaCountVectorizerExample")
0038 .getOrCreate();
0039
0040
0041
0042 List<Row> data = Arrays.asList(
0043 RowFactory.create(Arrays.asList("a", "b", "c")),
0044 RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
0045 );
0046 StructType schema = new StructType(new StructField [] {
0047 new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
0048 });
0049 Dataset<Row> df = spark.createDataFrame(data, schema);
0050
0051
0052 CountVectorizerModel cvModel = new CountVectorizer()
0053 .setInputCol("text")
0054 .setOutputCol("feature")
0055 .setVocabSize(3)
0056 .setMinDF(2)
0057 .fit(df);
0058
0059
0060 CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
0061 .setInputCol("text")
0062 .setOutputCol("feature");
0063
0064 cvModel.transform(df).show(false);
0065
0066
0067 spark.stop();
0068 }
0069 }