0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.examples.mllib;
0019
0020 import org.apache.spark.SparkConf;
0021 import org.apache.spark.api.java.JavaSparkContext;
0022
0023 import org.apache.spark.api.java.JavaRDD;
0024 import org.apache.spark.mllib.feature.ChiSqSelector;
0025 import org.apache.spark.mllib.feature.ChiSqSelectorModel;
0026 import org.apache.spark.mllib.linalg.Vectors;
0027 import org.apache.spark.mllib.regression.LabeledPoint;
0028 import org.apache.spark.mllib.util.MLUtils;
0029
0030
0031 public class JavaChiSqSelectorExample {
0032 public static void main(String[] args) {
0033
0034 SparkConf conf = new SparkConf().setAppName("JavaChiSqSelectorExample");
0035 JavaSparkContext jsc = new JavaSparkContext(conf);
0036
0037
0038 JavaRDD<LabeledPoint> points = MLUtils.loadLibSVMFile(jsc.sc(),
0039 "data/mllib/sample_libsvm_data.txt").toJavaRDD().cache();
0040
0041
0042
0043 JavaRDD<LabeledPoint> discretizedData = points.map(lp -> {
0044 double[] discretizedFeatures = new double[lp.features().size()];
0045 for (int i = 0; i < lp.features().size(); ++i) {
0046 discretizedFeatures[i] = Math.floor(lp.features().apply(i) / 16);
0047 }
0048 return new LabeledPoint(lp.label(), Vectors.dense(discretizedFeatures));
0049 });
0050
0051
0052 ChiSqSelector selector = new ChiSqSelector(50);
0053
0054 ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd());
0055
0056 JavaRDD<LabeledPoint> filteredData = discretizedData.map(lp ->
0057 new LabeledPoint(lp.label(), transformer.transform(lp.features())));
0058
0059
0060 System.out.println("filtered data: ");
0061 filteredData.foreach(System.out::println);
0062
0063 jsc.stop();
0064 }
0065 }