Back to home page

OSCL-LXR

 
 

    


0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017 
0018 from __future__ import print_function
0019 
0020 import numpy as np
0021 
0022 from pyspark import SparkContext
0023 # $example on$
0024 from pyspark.mllib.stat import Statistics
0025 # $example off$
0026 
if __name__ == "__main__":
    # Create the SparkContext that drives this example application.
    sc = SparkContext(appName="CorrelationsExample")

    # $example on$
    # Two series of doubles, each distributed as its own RDD.
    x_values = [1.0, 2.0, 3.0, 3.0, 5.0]
    y_values = [11.0, 22.0, 33.0, 33.0, 555.0]
    seriesX = sc.parallelize(x_values)
    # seriesY must match seriesX in both partition count and cardinality.
    seriesY = sc.parallelize(y_values)

    # Correlation between the two series using Pearson's method, which is
    # also the default when no method is given. Pass "spearman" to use
    # Spearman's method instead.
    series_correlation = Statistics.corr(seriesX, seriesY, method="pearson")
    print("Correlation is: " + str(series_correlation))

    # An RDD of Vectors: each numpy array is one row of observations.
    rows = [
        np.array([1.0, 10.0, 100.0]),
        np.array([2.0, 20.0, 200.0]),
        np.array([5.0, 33.0, 366.0]),
    ]
    data = sc.parallelize(rows)

    # Correlation matrix of the RDD of Vectors, again with Pearson's method
    # (the default when no method is given); "spearman" selects Spearman's
    # method.
    correlation_matrix = Statistics.corr(data, method="pearson")
    print(correlation_matrix)
    # $example off$

    sc.stop()