|
||||
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Run ``Statistics.chiSqTest`` on a vector, a matrix and an RDD of LabeledPoint.

Each result is printed to standard output.
"""

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="HypothesisTestingExample")

    # Stop the SparkContext even if one of the tests below raises.
    try:
        # $example on$
        # a vector composed of the frequencies of events
        vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)

        # compute the goodness of fit. If a second vector to test against
        # is not supplied as a parameter, the test runs against a uniform distribution.
        goodnessOfFitTestResult = Statistics.chiSqTest(vec)

        # summary of the test including the p-value, degrees of freedom,
        # test statistic, the method used, and the null hypothesis.
        print("%s\n" % goodnessOfFitTestResult)

        # a contingency matrix
        mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])

        # conduct Pearson's independence test on the input contingency matrix
        independenceTestResult = Statistics.chiSqTest(mat)

        # summary of the test including the p-value, degrees of freedom,
        # test statistic, the method used, and the null hypothesis.
        print("%s\n" % independenceTestResult)

        obs = sc.parallelize(
            [LabeledPoint(1.0, [1.0, 0.0, 3.0]),
             LabeledPoint(1.0, [1.0, 2.0, 0.0]),
             LabeledPoint(1.0, [-1.0, 0.0, -0.5])]
        )  # LabeledPoint(label, feature)

        # The contingency table is constructed from an RDD of LabeledPoint and used to
        # conduct the independence test. Returns an array containing the
        # ChiSquaredTestResult for every feature against the label.
        featureTestResults = Statistics.chiSqTest(obs)

        for i, result in enumerate(featureTestResults):
            print("Column %d:\n%s" % (i + 1, result))
        # $example off$
    finally:
        sc.stop()
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.1.0 LXR engine. The LXR team |