examples/ml/JavaTokenizerExample.java

0001 /*
0002  * Licensed to the Apache Software Foundation (ASF) under one or more
0003  * contributor license agreements.  See the NOTICE file distributed with
0004  * this work for additional information regarding copyright ownership.
0005  * The ASF licenses this file to You under the Apache License, Version 2.0
0006  * (the "License"); you may not use this file except in compliance with
0007  * the License.  You may obtain a copy of the License at
0008  *
0009  *    http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  */
0017
0018 package org.apache.spark.examples.ml;
0019
0020 import org.apache.spark.sql.SparkSession;
0021
0022 // $example on$
0023 import java.util.Arrays;
0024 import java.util.List;
0025
0026 import scala.collection.mutable.WrappedArray;
0027
0028 import org.apache.spark.ml.feature.RegexTokenizer;
0029 import org.apache.spark.ml.feature.Tokenizer;
0030 import org.apache.spark.sql.Dataset;
0031 import org.apache.spark.sql.Row;
0032 import org.apache.spark.sql.RowFactory;
0033 import org.apache.spark.sql.types.DataTypes;
0034 import org.apache.spark.sql.types.Metadata;
0035 import org.apache.spark.sql.types.StructField;
0036 import org.apache.spark.sql.types.StructType;
0037
0038 // col("...") is preferable to df.col("...")
0039 import static org.apache.spark.sql.functions.callUDF;
0040 import static org.apache.spark.sql.functions.col;
0041 // $example off$
0042
/**
 * Example program that splits a small, in-memory set of sentences into words
 * with both {@link Tokenizer} and {@link RegexTokenizer}, and prints each
 * sentence next to its words and the number of words produced.
 *
 * NOTE(review): the "$example on$" / "$example off$" comment markers are
 * presumably consumed by documentation tooling - confirm before moving them.
 */
0043 public class JavaTokenizerExample {
       /**
        * Obtains a SparkSession, runs both tokenizers over the sample sentences,
        * prints the two result tables and stops the session.
        *
        * @param args command-line arguments; not read by this example
        */
0044   public static void main(String[] args) {
0045     SparkSession spark = SparkSession
0046       .builder()
0047       .appName("JavaTokenizerExample")
0048       .getOrCreate();
0049
0050     // $example on$
         // Three sample (id, sentence) rows. The third sentence separates its words
         // with commas only, so the two tokenizers below treat it differently.
0051     List<Row> data = Arrays.asList(
0052       RowFactory.create(0, "Hi I heard about Spark"),
0053       RowFactory.create(1, "I wish Java could use case classes"),
0054       RowFactory.create(2, "Logistic,regression,models,are,neat")
0055     );
0056
         // Schema for the rows above: an integer "id" and a string "sentence",
         // both declared with the nullable flag set to false.
0057     StructType schema = new StructType(new StructField[]{
0058       new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
0059       new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
0060     });
0061
0062     Dataset<Row> sentenceDataFrame = spark.createDataFrame(data, schema);
0063
         // Both tokenizers read "sentence" and write "words". Each one is applied to
         // sentenceDataFrame separately, so the shared output name does not clash.
0064     Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
0065
         // "\\W" is the regex \W (any non-word character).
         // NOTE(review): judging by the alternative on the setPattern line, the
         // pattern is presumably used as the separator between tokens unless
         // setGaps(false) is given - confirm against the RegexTokenizer docs.
0066     RegexTokenizer regexTokenizer = new RegexTokenizer()
0067         .setInputCol("sentence")
0068         .setOutputCol("words")
0069         .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);
0070
         // Register a UDF named "countTokens" that returns the number of elements in
         // the array it is given, declared to Spark as an integer result.
         // NOTE(review): WrappedArray is assumed to be the runtime type Spark passes
         // for array columns - verify against the Scala/Spark version in use.
0071     spark.udf().register(
0072       "countTokens", (WrappedArray<?> words) -> words.size(), DataTypes.IntegerType);
0073
         // Plain tokenizer: keep the sentence and its words, append the word count
         // as "tokens", and print the table (false is passed as the argument to show).
0074     Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
0075     tokenized.select("sentence", "words")
0076         .withColumn("tokens", callUDF("countTokens", col("words")))
0077         .show(false);
0078
         // Same projection and count for the regex-based tokenizer, for comparison.
0079     Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
0080     regexTokenized.select("sentence", "words")
0081         .withColumn("tokens", callUDF("countTokens", col("words")))
0082         .show(false);
0083     // $example off$
0084
         // Stop the session once both tables have been printed.
0085     spark.stop();
0086   }
0087 }