0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements. See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License. You may obtain a copy of the License at
0008 #
0009 # http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
0018 # mllib_stat.R: Provides methods for MLlib statistics algorithms integration
0019
#' S4 class that represents a KSTest
#'
#' @param jobj a Java object reference to the backing Scala KSTestWrapper
#' @note KSTest since 2.1.0
setClass("KSTest", slots = c(jobj = "jobj"))
0025
#' (One-Sample) Kolmogorov-Smirnov Test
#'
#' @description
#' \code{spark.kstest} Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
#' continuous distribution.
#'
#' By comparing the largest difference between the empirical cumulative
#' distribution of the sample data and the theoretical distribution we can provide a test for
#' the null hypothesis that the sample data comes from that theoretical distribution.
#'
#' Users can call \code{summary} to obtain a summary of the test, and \code{print.summary.KSTest}
#' to print out a summary result.
#'
#' @param data a SparkDataFrame of user data.
#' @param testCol column name where the test data is from. It should be a column of double type.
#' @param nullHypothesis name of the theoretical distribution tested against. Currently only
#'                       \code{"norm"} for normal distribution is supported.
#' @param distParams parameter(s) of the distribution. For \code{nullHypothesis = "norm"},
#'                   we can provide as a vector the mean and standard deviation of
#'                   the distribution. If none is provided, then standard normal will be used.
#'                   If only one is provided, then the standard deviation will be set to be one.
#' @param ... additional argument(s) passed to the method.
#' @return \code{spark.kstest} returns a test result object.
#' @rdname spark.kstest
#' @aliases spark.kstest,SparkDataFrame-method
#' @name spark.kstest
#' @seealso \href{http://spark.apache.org/docs/latest/mllib-statistics.html#hypothesis-testing}{
#'          MLlib: Hypothesis Testing}
#' @examples
#' \dontrun{
#' data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25))
#' df <- createDataFrame(data)
#' test <- spark.kstest(df, "test", "norm", c(0, 1))
#'
#' # get a summary of the test result
#' testSummary <- summary(test)
#' testSummary
#'
#' # print out the summary in an organized way
#' print.summary.KSTest(testSummary)
#' }
#' @note spark.kstest since 2.1.0
setMethod("spark.kstest", signature(data = "SparkDataFrame"),
          function(data, testCol = "test", nullHypothesis = c("norm"), distParams = c(0, 1)) {
            # Keep the value returned by match.arg(): it resolves the default vector and any
            # abbreviated name to the full distribution name. Discarding it would let an
            # abbreviation pass validation and then fall through without running any test.
            nullHypothesis <- tryCatch(match.arg(nullHypothesis),
                                       error = function(e) {
                                         stop("Distribution ", nullHypothesis,
                                              " is not supported.")
                                       })

            # match.arg() only accepts "norm", so the normal-distribution parameters apply here.
            distParams <- as.numeric(distParams)
            # Missing parameters fall back to the standard normal: mean 0, standard deviation 1.
            mu <- if (length(distParams) < 1) 0 else distParams[1]
            sigma <- if (length(distParams) < 2) 1 else distParams[2]

            jobj <- callJStatic("org.apache.spark.ml.r.KSTestWrapper",
                                "test", data@sdf, testCol, nullHypothesis,
                                as.array(c(mu, sigma)))
            new("KSTest", jobj = jobj)
          })
0084
# Summarize the result of a Kolmogorov-Smirnov (KS) Test.

#' @param object test result object of KSTest by \code{spark.kstest}.
#' @return \code{summary} returns summary information of KSTest object, which is a list.
#'         The list includes the \code{p.value} (p-value), \code{statistic} (test statistic
#'         computed for the test), \code{nullHypothesis} (the null hypothesis with its
#'         parameters tested against), \code{nullHypothesis.name} and
#'         \code{nullHypothesis.parameters} (the distribution name and the unlisted distribution
#'         parameters reported by the backing object), \code{degreesOfFreedom} (degrees of
#'         freedom of the test) and \code{jobj} (the backing Java object reference).
#' @rdname spark.kstest
#' @aliases summary,KSTest-method
#' @note summary(KSTest) since 2.1.0
setMethod("summary", signature(object = "KSTest"),
          function(object) {
            wrapper <- object@jobj
            # Small accessor so each field is read from the same backing object.
            fetch <- function(field) {
              callJMethod(wrapper, field)
            }
            # list() evaluates its arguments left to right, so the wrapper is queried in
            # the order the fields are listed here.
            structure(
              list(p.value = fetch("pValue"),
                   statistic = fetch("statistic"),
                   nullHypothesis = fetch("nullHypothesis"),
                   nullHypothesis.name = fetch("distName"),
                   nullHypothesis.parameters = unlist(fetch("distParams")),
                   degreesOfFreedom = fetch("degreesOfFreedom"),
                   jobj = wrapper),
              class = "summary.KSTest")
          })
0111
# Print method for the summary of a KSTest

#' @rdname spark.kstest
#' @param x summary object of KSTest returned by \code{summary}.
#' @note print.summary.KSTest since 2.1.0
print.summary.KSTest <- function(x, ...) {
  # The text comes from the "summary" method of the Java object stored in the summary list.
  cat(callJMethod(x$jobj, "summary"), "\n")
  # Return the argument invisibly, as print methods conventionally do.
  invisible(x)
}