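# An example demonstrating Tokenizer and RegexTokenizer from pyspark.ml.feature:
# both split sentences into arrays of words, and a small UDF then counts the
# tokens produced by each approach.
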
from __future__ import print_function

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TokenizerExample")\
        .getOrCreate()

    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["id", "sentence"])

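    # Tokenizer splits each sentence on whitespace and lowercases the tokens.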
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

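    # RegexTokenizer splits on matches of the given pattern; "\\W" treats any
    # non-word character as a delimiter. Alternatively, use pattern="\\w+" with
    # gaps=False to match the tokens themselves rather than the gaps between them.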
    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")

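    # UDF that reports how many tokens each tokenizer produced per row.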
    countTokens = udf(lambda words: len(words), IntegerType())

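    # Whitespace tokenization: the comma-separated sentence contains no spaces,
    # so it remains a single token (token counts 5, 7, 1).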
    tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")\
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

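    # Regex tokenization on "\\W" splits the comma-separated sentence as well
    # (token counts 5, 7, 5).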
    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words") \
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

    spark.stop()