from __future__ import print_function

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TfIdfExample")\
        .getOrCreate()

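    # Sample corpus: each row pairs a numeric label with a raw text sentence.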
    sentenceData = spark.createDataFrame([
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat")
    ], ["label", "sentence"])

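    # Split each sentence into lowercase words.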
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)

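    # Hash each word into a fixed-size term-frequency vector (the "hashing trick");
    # numFeatures=20 is deliberately small here, so hash collisions are possible.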
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)

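    # Fit IDF on the corpus, then rescale the raw term frequencies to TF-IDF,
    # downweighting terms that appear in many documents.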
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

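    # Show the resulting TF-IDF feature vectors alongside their labels.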
    rescaledData.select("label", "features").show()

    spark.stop()