0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 An example demonstrating LDA.
0020 Run with:
0021 bin/spark-submit examples/src/main/python/ml/lda_example.py
0022 """
0023 from __future__ import print_function
0024
0025
0026 from pyspark.ml.clustering import LDA
0027
0028 from pyspark.sql import SparkSession
0029
if __name__ == "__main__":
    # Script entry point: fit an LDA model on the sample libsvm dataset and
    # print its likelihood/perplexity bounds, its topics, and the transformed
    # dataset.
    spark = SparkSession \
        .builder \
        .appName("LDAExample") \
        .getOrCreate()

    # Everything after session creation runs under try/finally so the session
    # is stopped even if loading, fitting, or scoring raises.
    try:
        # Load the sample data in libsvm format (the path is relative).
        dataset = spark.read.format("libsvm").load("data/mllib/sample_lda_libsvm_data.txt")

        # Fit an LDA model with k=10 and at most 10 iterations.
        lda = LDA(k=10, maxIter=10)
        model = lda.fit(dataset)

        # Score the fitted model against the same dataset it was trained on.
        ll = model.logLikelihood(dataset)
        lp = model.logPerplexity(dataset)
        print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
        print("The upper bound on perplexity: " + str(lp))

        # Describe each topic, passing 3 as the argument to describeTopics.
        topics = model.describeTopics(3)
        print("The topics described by their top-weighted terms:")
        topics.show(truncate=False)

        # Run the fitted model over the dataset and show the untruncated result.
        transformed = model.transform(dataset)
        transformed.show(truncate=False)
    finally:
        spark.stop()