Back to home page

OSCL-LXR

 
 

    


0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017 
0018 # mllib_stat.R: Provides methods for MLlib statistics algorithms integration
0019 
#' S4 class that represents a KSTest
#'
#' @param jobj a Java object reference to the backing Scala KSTestWrapper
#' @note KSTest since 2.1.0
setClass("KSTest", slots = c(jobj = "jobj"))
0025 
#' (One-Sample) Kolmogorov-Smirnov Test
#'
#' @description
#' \code{spark.kstest} Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
#' continuous distribution.
#'
#' By comparing the largest difference between the empirical cumulative
#' distribution of the sample data and the theoretical distribution we can provide a test for
#' the null hypothesis that the sample data comes from that theoretical distribution.
#'
#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest}
#' to print out a summary result.
#'
#' @param data a SparkDataFrame of user data.
#' @param testCol column name where the test data is from. It should be a column of double type.
#' @param nullHypothesis name of the theoretical distribution tested against. Currently only
#'                       \code{"norm"} for normal distribution is supported.
#' @param distParams parameter(s) of the distribution. For \code{nullHypothesis = "norm"},
#'                   we can provide as a vector the mean and standard deviation of
#'                   the distribution. If none is provided, then standard normal will be used.
#'                   If only one is provided, then the standard deviation will be set to be one.
#' @param ... additional argument(s) passed to the method.
#' @return \code{spark.kstest} returns a test result object.
#' @rdname spark.kstest
#' @aliases spark.kstest,SparkDataFrame-method
#' @name spark.kstest
#' @seealso \href{http://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{
#'          MLlib: Hypothesis Testing}
#' @examples
#' \dontrun{
#' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25))
#' df <- createDataFrame(data)
#' test <- spark.kstest(df, "test", "norm", c(0, 1))
#'
#' # get a summary of the test result
#' testSummary <- summary(test)
#' testSummary
#'
#' # print out the summary in an organized way
#' print.summary.KSTest(testSummary)
#' }
#' @note spark.kstest since 2.1.0
setMethod("spark.kstest", signature(data = "SparkDataFrame"),
          function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) {
            # Keep the value returned by match.arg: it is the validated, fully spelled-out
            # distribution name. Discarding it would let an abbreviated name pass validation
            # and then fail the comparison below, silently returning NULL.
            nullHypothesis <- tryCatch(match.arg(nullHypothesis),
                                       error = function(e) {
                                         stop("Distribution ",
                                              paste(nullHypothesis, collapse = ", "),
                                              " is not supported.")
                                       })
            if (nullHypothesis == "norm") {
              distParams <- as.numeric(distParams)
              # Missing parameters fall back to the standard normal: mean 0, sd 1.
              mu <- if (length(distParams) < 1) 0 else distParams[1]
              sigma <- if (length(distParams) < 2) 1 else distParams[2]
              jobj <- callJStatic("org.apache.spark.ml.r.KSTestWrapper",
                                  "test", data@sdf, testCol, nullHypothesis,
                                  as.array(c(mu, sigma)))
              new("KSTest", jobj = jobj)
            }
})
0084 
#  Get the summary of Kolmogorov-Smirnov (KS) Test.

#' @param object test result object of KSTest by \code{spark.kstest}.
#' @return \code{summary} returns summary information of KSTest object, which is a list.
#'         The list includes the \code{p.value} (p-value), \code{statistic} (test statistic
#'         computed for the test), \code{nullHypothesis} (the null hypothesis with its
#'         parameters tested against), \code{nullHypothesis.name} (name of the distribution),
#'         \code{nullHypothesis.parameters} (parameters of the distribution),
#'         \code{degreesOfFreedom} (degrees of freedom of the test) and \code{jobj}
#'         (the backing Java object reference).
#' @rdname spark.kstest
#' @aliases summary,KSTest-method
#' @note summary(KSTest) since 2.1.0
setMethod("summary", signature(object = "KSTest"),
          function(object) {
            backing <- object@jobj
            # list() evaluates its arguments left to right, so the wrapper is queried
            # in the order the fields are written here.
            structure(
              list(p.value = callJMethod(backing, "pValue"),
                   statistic = callJMethod(backing, "statistic"),
                   nullHypothesis = callJMethod(backing, "nullHypothesis"),
                   nullHypothesis.name = callJMethod(backing, "distName"),
                   nullHypothesis.parameters = unlist(callJMethod(backing, "distParams")),
                   degreesOfFreedom = callJMethod(backing, "degreesOfFreedom"),
                   jobj = backing),
              class = "summary.KSTest")
          })
0111 
#  Prints the summary of KSTest

#' @rdname spark.kstest
#' @param x summary object of KSTest returned by \code{summary}.
#' @note print.summary.KSTest since 2.1.0
print.summary.KSTest <- function(x, ...) {
  # The summary text comes from the Java object stored in the summary list.
  cat(callJMethod(x$jobj, "summary"), "\n")
  # Return the input invisibly, as print methods conventionally do.
  invisible(x)
}