0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 """
0019 Counts words in new text files created in the given directory
0020 Usage: hdfs_wordcount.py <directory>
0021 <directory> is the directory that Spark Streaming will use to find and read new text files.
0022
0023 To run this on your local machine on directory `localdir`, run this example
0024 $ bin/spark-submit examples/src/main/python/streaming/hdfs_wordcount.py localdir
0025
0026 Then create a text file in `localdir` and the words in the file will get counted.
0027 """
0028 from __future__ import print_function
0029
0030 import sys
0031
0032 from pyspark import SparkContext
0033 from pyspark.streaming import StreamingContext
0034
def split_words(line):
    """Split a line of text into words on any run of whitespace.

    Splitting on a single literal space (``line.split(" ")``) produces
    empty-string tokens for blank lines and for leading, trailing or
    repeated spaces, and it does not separate tab-delimited words.
    Using ``str.split()`` with no argument avoids all of that, so the
    empty string is never counted as a word.

    :param line: one line of input text.
    :return: list of the non-empty, whitespace-delimited words in ``line``.
    """
    return line.split()


if __name__ == "__main__":
    # Exactly one argument is expected: the directory to monitor.
    if len(sys.argv) != 2:
        print("Usage: hdfs_wordcount.py <directory>", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingHDFSWordCount")
    # The second argument to StreamingContext is the batch interval (seconds).
    ssc = StreamingContext(sc, 1)

    # Stream of lines read from new text files appearing in the directory.
    lines = ssc.textFileStream(sys.argv[1])
    # Per batch: tokenize -> (word, 1) pairs -> sum the ones per word.
    counts = lines.flatMap(split_words)\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    # Nothing is processed until the context is started; then block until
    # the streaming job is stopped or fails.
    ssc.start()
    ssc.awaitTermination()