0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 from __future__ import print_function
0019
0020 from pyspark import SparkContext
0021
0022 from pyspark.mllib.clustering import LDA, LDAModel
0023 from pyspark.mllib.linalg import Vectors
0024
0025
if __name__ == "__main__":
    # Example: fit a Latent Dirichlet Allocation (LDA) topic model with
    # Spark MLlib, print the learned topics, then save and reload the model.
    sc = SparkContext(appName="LatentDirichletAllocationExample")

    # Load and parse the data: each input line is a space-separated vector
    # of word counts for one document.
    data = sc.textFile("data/mllib/sample_lda_data.txt")
    parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

    # LDA.train expects an RDD of [document_id, term_count_vector];
    # zipWithIndex yields (vector, index), so swap the pair. Cached because
    # the iterative trainer passes over the corpus repeatedly.
    corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

    # Single source of truth for the topic count (was hard-coded both in
    # LDA.train(k=3) and in the printing loop's range(3)).
    k = 3

    # Cluster the documents into k topics using LDA.
    ldaModel = LDA.train(corpus, k=k)

    # Hoisted out of the loops: vocabSize() is loop-invariant, so avoid
    # re-invoking it on every topic iteration.
    vocab_size = ldaModel.vocabSize()

    # topicsMatrix() is vocab_size x k: column `topic` holds that topic's
    # (unnormalized) distribution over the vocabulary.
    print("Learned topics (as distributions over vocab of " + str(vocab_size)
          + " words):")
    topics = ldaModel.topicsMatrix()
    for topic in range(k):
        print("Topic " + str(topic) + ":")
        for word in range(vocab_size):
            print(" " + str(topics[word][topic]))

    # Save the fitted model, then load it back to demonstrate persistence.
    ldaModel.save(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")
    sameModel = LDAModel\
        .load(sc, "target/org/apache/spark/PythonLatentDirichletAllocationExample/LDAModel")

    sc.stop()