0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.examples.mllib;
0019
0020
0021 import java.util.Arrays;
0022 import java.util.List;
0023
0024
0025 import org.apache.spark.SparkConf;
0026 import org.apache.spark.SparkContext;
0027
0028 import org.apache.spark.api.java.JavaRDD;
0029 import org.apache.spark.api.java.JavaSparkContext;
0030 import org.apache.spark.mllib.linalg.Matrix;
0031 import org.apache.spark.mllib.linalg.Vector;
0032 import org.apache.spark.mllib.linalg.Vectors;
0033 import org.apache.spark.mllib.linalg.distributed.RowMatrix;
0034
0035
0036
0037
0038
0039 public class JavaPCAExample {
0040 public static void main(String[] args) {
0041 SparkConf conf = new SparkConf().setAppName("PCA Example");
0042 SparkContext sc = new SparkContext(conf);
0043 JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
0044
0045
0046 List<Vector> data = Arrays.asList(
0047 Vectors.sparse(5, new int[] {1, 3}, new double[] {1.0, 7.0}),
0048 Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
0049 Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
0050 );
0051
0052 JavaRDD<Vector> rows = jsc.parallelize(data);
0053
0054
0055 RowMatrix mat = new RowMatrix(rows.rdd());
0056
0057
0058
0059 Matrix pc = mat.computePrincipalComponents(4);
0060
0061
0062 RowMatrix projected = mat.multiply(pc);
0063
0064 Vector[] collectPartitions = (Vector[])projected.rows().collect();
0065 System.out.println("Projected vector of principal component:");
0066 for (Vector vector : collectPartitions) {
0067 System.out.println("\t" + vector);
0068 }
0069 jsc.stop();
0070 }
0071 }