0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.examples.mllib;
0019
0020 import com.google.common.collect.ImmutableMap;
0021 import org.apache.spark.SparkConf;
0022 import org.apache.spark.api.java.JavaSparkContext;
0023
0024
0025 import java.util.*;
0026
0027 import scala.Tuple2;
0028
0029 import org.apache.spark.api.java.JavaPairRDD;
0030
0031
0032 public class JavaStratifiedSamplingExample {
0033 public static void main(String[] args) {
0034
0035 SparkConf conf = new SparkConf().setAppName("JavaStratifiedSamplingExample");
0036 JavaSparkContext jsc = new JavaSparkContext(conf);
0037
0038 @SuppressWarnings("unchecked")
0039
0040 List<Tuple2<Integer, Character>> list = Arrays.asList(
0041 new Tuple2<>(1, 'a'),
0042 new Tuple2<>(1, 'b'),
0043 new Tuple2<>(2, 'c'),
0044 new Tuple2<>(2, 'd'),
0045 new Tuple2<>(2, 'e'),
0046 new Tuple2<>(3, 'f')
0047 );
0048
0049 JavaPairRDD<Integer, Character> data = jsc.parallelizePairs(list);
0050
0051
0052 ImmutableMap<Integer, Double> fractions = ImmutableMap.of(1, 0.1, 2, 0.6, 3, 0.3);
0053
0054
0055 JavaPairRDD<Integer, Character> approxSample = data.sampleByKey(false, fractions);
0056
0057 JavaPairRDD<Integer, Character> exactSample = data.sampleByKeyExact(false, fractions);
0058
0059
0060 System.out.println("approxSample size is " + approxSample.collect().size());
0061 for (Tuple2<Integer, Character> t : approxSample.collect()) {
0062 System.out.println(t._1() + " " + t._2());
0063 }
0064
0065 System.out.println("exactSample size is " + exactSample.collect().size());
0066 for (Tuple2<Integer, Character> t : exactSample.collect()) {
0067 System.out.println(t._1() + " " + t._2());
0068 }
0069
0070 jsc.stop();
0071 }
0072 }