0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 r"""
0019 Counts words in UTF8 encoded, '\n' delimited text received from the network every second.
0020 Usage: network_wordcount.py <hostname> <port>
0021 <hostname> and <port> describe the TCP server that Spark Streaming would connect to receive data.
0022
0023 To run this on your local machine, you need to first run a Netcat server
0024 `$ nc -lk 9999`
0025 and then run the example
0026 `$ bin/spark-submit examples/src/main/python/streaming/network_wordcount.py localhost 9999`
0027 """
0028 from __future__ import print_function
0029
0030 import sys
0031
0032 from pyspark import SparkContext
0033 from pyspark.streaming import StreamingContext
0034
def split_words(line):
    """Split one line of input text into its whitespace-separated words.

    Uses ``str.split()`` with no separator, which collapses runs of
    whitespace and ignores leading/trailing whitespace. Splitting on a
    literal ``" "`` instead would emit empty strings for blank lines and
    repeated spaces, and those would then be counted as a word.

    :param line: a single line of text.
    :return: list of the words in ``line``; empty if the line is blank.
    """
    return line.split()


if __name__ == "__main__":
    # Expect exactly two arguments: host and port of the TCP text source.
    if len(sys.argv) != 3:
        print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)

    hostname = sys.argv[1]
    # Parse the port before starting Spark so a bad value fails fast with the
    # usage message rather than a traceback after the context is created.
    try:
        port = int(sys.argv[2])
    except ValueError:
        print("Usage: network_wordcount.py <hostname> <port>", file=sys.stderr)
        sys.exit(-1)

    sc = SparkContext(appName="PythonStreamingNetworkWordCount")
    ssc = StreamingContext(sc, 1)  # second argument is the batch duration

    # Each batch: lines -> words -> (word, 1) pairs -> per-word totals.
    lines = ssc.socketTextStream(hostname, port)
    counts = lines.flatMap(split_words)\
                  .map(lambda word: (word, 1))\
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    # Start the streaming computation and block until it is terminated.
    ssc.start()
    ssc.awaitTermination()