0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.examples;
0019
0020 import scala.Tuple2;
0021
0022 import org.apache.spark.api.java.JavaPairRDD;
0023 import org.apache.spark.api.java.JavaRDD;
0024 import org.apache.spark.sql.SparkSession;
0025
0026 import java.util.Arrays;
0027 import java.util.List;
0028 import java.util.regex.Pattern;
0029
0030 public final class JavaWordCount {
0031 private static final Pattern SPACE = Pattern.compile(" ");
0032
0033 public static void main(String[] args) throws Exception {
0034
0035 if (args.length < 1) {
0036 System.err.println("Usage: JavaWordCount <file>");
0037 System.exit(1);
0038 }
0039
0040 SparkSession spark = SparkSession
0041 .builder()
0042 .appName("JavaWordCount")
0043 .getOrCreate();
0044
0045 JavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();
0046
0047 JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
0048
0049 JavaPairRDD<String, Integer> ones = words.mapToPair(s -> new Tuple2<>(s, 1));
0050
0051 JavaPairRDD<String, Integer> counts = ones.reduceByKey((i1, i2) -> i1 + i2);
0052
0053 List<Tuple2<String, Integer>> output = counts.collect();
0054 for (Tuple2<?,?> tuple : output) {
0055 System.out.println(tuple._1() + ": " + tuple._2());
0056 }
0057 spark.stop();
0058 }
0059 }