ml/feature/JavaTokenizerSuite.java

0001 /*
0002  * Licensed to the Apache Software Foundation (ASF) under one or more
0003  * contributor license agreements.  See the NOTICE file distributed with
0004  * this work for additional information regarding copyright ownership.
0005  * The ASF licenses this file to You under the Apache License, Version 2.0
0006  * (the "License"); you may not use this file except in compliance with
0007  * the License.  You may obtain a copy of the License at
0008  *
0009  *    http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  */
0017
0018 package org.apache.spark.ml.feature;
0019
0020 import java.util.Arrays;
0021 import java.util.List;
0022
0023 import org.junit.Assert;
0024 import org.junit.Test;
0025
0026 import org.apache.spark.SharedSparkSession;
0027 import org.apache.spark.api.java.JavaRDD;
0028 import org.apache.spark.sql.Dataset;
0029 import org.apache.spark.sql.Row;
0030
0031 public class JavaTokenizerSuite extends SharedSparkSession {
0032
0033   @Test
0034   public void regexTokenizer() {
0035     RegexTokenizer myRegExTokenizer = new RegexTokenizer()
0036       .setInputCol("rawText")
0037       .setOutputCol("tokens")
0038       .setPattern("\\s")
0039       .setGaps(true)
0040       .setToLowercase(false)
0041       .setMinTokenLength(3);
0042
0043
0044     JavaRDD<TokenizerTestData> rdd = jsc.parallelize(Arrays.asList(
0045       new TokenizerTestData("Test of tok.", new String[]{"Test", "tok."}),
0046       new TokenizerTestData("Te,st.  punct", new String[]{"Te,st.", "punct"})
0047     ));
0048     Dataset<Row> dataset = spark.createDataFrame(rdd, TokenizerTestData.class);
0049
0050     List<Row> pairs = myRegExTokenizer.transform(dataset)
0051       .select("tokens", "wantedTokens")
0052       .collectAsList();
0053
0054     for (Row r : pairs) {
0055       Assert.assertEquals(r.get(0), r.get(1));
0056     }
0057   }
0058 }