0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements. See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License. You may obtain a copy of the License at
0008 #
0009 # http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
# mllib_classification.R: Provides methods for MLlib classification algorithms
# (except for tree-based algorithms) integration
0020
#' S4 class that represents a LinearSVCModel
#'
#' @param jobj a Java object reference to the backing Scala LinearSVCModel
#' @note LinearSVCModel since 2.2.0
setClass("LinearSVCModel", slots = c(jobj = "jobj"))
0026
#' S4 class that represents a LogisticRegressionModel
#'
#' @param jobj a Java object reference to the backing Scala LogisticRegressionModel
#' @note LogisticRegressionModel since 2.1.0
setClass("LogisticRegressionModel", slots = c(jobj = "jobj"))
0032
#' S4 class that represents a MultilayerPerceptronClassificationModel
#'
#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper
#' @note MultilayerPerceptronClassificationModel since 2.1.0
setClass("MultilayerPerceptronClassificationModel", slots = c(jobj = "jobj"))
0038
#' S4 class that represents a NaiveBayesModel
#'
#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
#' @note NaiveBayesModel since 2.0.0
setClass("NaiveBayesModel", slots = c(jobj = "jobj"))
0044
0045 #' Linear SVM Model
0046 #'
0047 #' Fits a linear SVM model against a SparkDataFrame, similar to svm in e1071 package.
0048 #' Currently only supports binary classification model with linear kernel.
0049 #' Users can print, make predictions on the produced model and save the model to the input path.
0050 #'
0051 #' @param data SparkDataFrame for training.
0052 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
0053 #' operators are supported, including '~', '.', ':', '+', '-', '*', and '^'.
0054 #' @param regParam The regularization parameter. Only supports L2 regularization currently.
0055 #' @param maxIter Maximum iteration number.
0056 #' @param tol Convergence tolerance of iterations.
0057 #' @param standardization Whether to standardize the training features before fitting the model.
0058 #' The coefficients of models will be always returned on the original scale,
0059 #' so it will be transparent for users. Note that with/without
0060 #' standardization, the models should be always converged to the same
0061 #' solution when no regularization is applied.
0062 #' @param threshold The threshold in binary classification applied to the linear model prediction.
0063 #' This threshold can be any real number, where Inf will make all predictions 0.0
0064 #' and -Inf will make all predictions 1.0.
0065 #' @param weightCol The weight column name.
0066 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the
0067 #' dimensions of features or the number of partitions are large, this param
0068 #' could be adjusted to a larger size.
0069 #' This is an expert parameter. Default value should be good for most cases.
0070 #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
0071 #' label column of string type.
0072 #' Supported options: "skip" (filter out rows with invalid data),
0073 #' "error" (throw an error), "keep" (put invalid data in
0074 #' a special additional bucket, at index numLabels). Default
0075 #' is "error".
0076 #' @param ... additional arguments passed to the method.
0077 #' @return \code{spark.svmLinear} returns a fitted linear SVM model.
0078 #' @rdname spark.svmLinear
0079 #' @aliases spark.svmLinear,SparkDataFrame,formula-method
0080 #' @name spark.svmLinear
0081 #' @examples
0082 #' \dontrun{
0083 #' sparkR.session()
0084 #' t <- as.data.frame(Titanic)
0085 #' training <- createDataFrame(t)
0086 #' model <- spark.svmLinear(training, Survived ~ ., regParam = 0.5)
0087 #' summary <- summary(model)
0088 #'
0089 #' # fitted values on training data
0090 #' fitted <- predict(model, training)
0091 #'
0092 #' # save fitted model to input path
0093 #' path <- "path/to/model"
0094 #' write.ml(model, path)
0095 #'
0096 #' # can also read back the saved model and predict
#' # Note that summary does not work on loaded model
0098 #' savedModel <- read.ml(path)
0099 #' summary(savedModel)
0100 #' }
0101 #' @note spark.svmLinear since 2.2.0
setMethod(
  "spark.svmLinear",
  signature(data = "SparkDataFrame", formula = "formula"),
  function(data, formula, regParam = 0.0, maxIter = 100, tol = 1E-6, standardization = TRUE,
           threshold = 0.0, weightCol = NULL, aggregationDepth = 2,
           handleInvalid = c("error", "keep", "skip")) {
    # deparse() may split a long formula over several strings; the wrapper is
    # handed a single string, so the pieces are concatenated.
    formulaString <- paste(deparse(formula), collapse = "")

    # An empty weight column name is treated the same as "no weight column".
    if (!is.null(weightCol)) {
      weightCol <- if (weightCol == "") NULL else as.character(weightCol)
    }

    # Resolve the choice vector default to a single validated option.
    handleInvalid <- match.arg(handleInvalid)

    # Every tuning parameter is coerced to an explicit R type before it is
    # passed to the JVM-side fit method.
    jobj <- callJStatic(
      "org.apache.spark.ml.r.LinearSVCWrapper", "fit",
      data@sdf,
      formulaString,
      as.numeric(regParam),
      as.integer(maxIter),
      as.numeric(tol),
      as.logical(standardization),
      as.numeric(threshold),
      weightCol,
      as.integer(aggregationDepth),
      handleInvalid
    )
    new("LinearSVCModel", jobj = jobj)
  }
)
0122
0123 # Predicted values based on a LinearSVCModel model
0124
0125 #' @param newData a SparkDataFrame for testing.
0126 #' @return \code{predict} returns the predicted values based on a LinearSVCModel.
0127 #' @rdname spark.svmLinear
0128 #' @aliases predict,LinearSVCModel,SparkDataFrame-method
0129 #' @note predict(LinearSVCModel) since 2.2.0
setMethod(
  f = "predict",
  signature = signature(object = "LinearSVCModel"),
  definition = function(object, newData) {
    # Scoring is delegated unchanged to the shared prediction helper.
    predict_internal(object, newData)
  }
)
0134
0135 # Get the summary of a LinearSVCModel
0136
0137 #' @param object a LinearSVCModel fitted by \code{spark.svmLinear}.
0138 #' @return \code{summary} returns summary information of the fitted model, which is a list.
0139 #' The list includes \code{coefficients} (coefficients of the fitted model),
0140 #' \code{numClasses} (number of classes), \code{numFeatures} (number of features).
0141 #' @rdname spark.svmLinear
0142 #' @aliases summary,LinearSVCModel-method
0143 #' @note summary(LinearSVCModel) since 2.2.0
setMethod(
  "summary",
  signature(object = "LinearSVCModel"),
  function(object) {
    model <- object@jobj
    featureNames <- callJMethod(model, "rFeatures")

    # One-column matrix of estimates, one row per feature name.
    estimates <- as.matrix(unlist(callJMethod(model, "rCoefficients")))
    colnames(estimates) <- c("Estimate")
    rownames(estimates) <- unlist(featureNames)

    list(
      coefficients = estimates,
      numClasses = callJMethod(model, "numClasses"),
      numFeatures = callJMethod(model, "numFeatures")
    )
  }
)
0156
0157 # Save fitted LinearSVCModel to the input path
0158
0159 #' @param path The directory where the model is saved.
0160 #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE
0161 #' which means throw exception if the output path exists.
0162 #'
0163 #' @rdname spark.svmLinear
0164 #' @aliases write.ml,LinearSVCModel,character-method
#' @note write.ml(LinearSVCModel, character) since 2.2.0
setMethod(
  f = "write.ml",
  signature = signature(object = "LinearSVCModel", path = "character"),
  definition = function(object, path, overwrite = FALSE) {
    # Persistence is delegated unchanged to the shared writer helper.
    write_internal(object, path, overwrite)
  }
)
0170
0171 #' Logistic Regression Model
0172 #'
#' Fits a logistic regression model against a SparkDataFrame. It supports "binomial": Binary
0174 #' logistic regression with pivoting; "multinomial": Multinomial logistic (softmax) regression
0175 #' without pivoting, similar to glmnet. Users can print, make predictions on the produced model
0176 #' and save the model to the input path.
0177 #'
0178 #' @param data SparkDataFrame for training.
0179 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
0180 #' operators are supported, including '~', '.', ':', '+', and '-'.
0181 #' @param regParam the regularization parameter.
0182 #' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2
0183 #' penalty. For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0,
0184 #' the penalty is a combination of L1 and L2. Default is 0.0 which is an
0185 #' L2 penalty.
0186 #' @param maxIter maximum iteration number.
0187 #' @param tol convergence tolerance of iterations.
0188 #' @param family the name of family which is a description of the label distribution to be used
0189 #' in the model.
0190 #' Supported options:
0191 #' \itemize{
0192 #' \item{"auto": Automatically select the family based on the number of classes:
0193 #' If number of classes == 1 || number of classes == 2, set to "binomial".
0194 #' Else, set to "multinomial".}
0195 #' \item{"binomial": Binary logistic regression with pivoting.}
0196 #' \item{"multinomial": Multinomial logistic (softmax) regression without
0197 #' pivoting.}
0198 #' }
0199 #' @param standardization whether to standardize the training features before fitting the model.
0200 #' The coefficients of models will be always returned on the original scale,
0201 #' so it will be transparent for users. Note that with/without
0202 #' standardization, the models should be always converged to the same
0203 #' solution when no regularization is applied. Default is TRUE, same as
0204 #' glmnet.
0205 #' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of
0206 #' class label 1 is > threshold, then predict 1, else 0. A high threshold
0207 #' encourages the model to predict 0 more often; a low threshold encourages the
0208 #' model to predict 1 more often. Note: Setting this with threshold p is
0209 #' equivalent to setting thresholds c(1-p, p). In multiclass (or binary)
0210 #' classification to adjust the probability of predicting each class. Array must
0211 #' have length equal to the number of classes, with values > 0, excepting that
0212 #' at most one value may be 0. The class with largest value p/t is predicted,
0213 #' where p is the original probability of that class and t is the class's
0214 #' threshold.
0215 #' @param weightCol The weight column name.
0216 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the
0217 #' dimensions of features or the number of partitions are large, this param
0218 #' could be adjusted to a larger size. This is an expert parameter. Default
0219 #' value should be good for most cases.
0220 #' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound
0221 #' constrained optimization.
0222 #' The bound matrix must be compatible with the shape (1, number
0223 #' of features) for binomial regression, or (number of classes,
0224 #' number of features) for multinomial regression.
0225 #' It is a R matrix.
0226 #' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound
0227 #' constrained optimization.
0228 #' The bound matrix must be compatible with the shape (1, number
0229 #' of features) for binomial regression, or (number of classes,
0230 #' number of features) for multinomial regression.
0231 #' It is a R matrix.
0232 #' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained
0233 #' optimization.
0234 #' The bounds vector size must be equal to 1 for binomial regression,
0235 #' or the number
0236 #' of classes for multinomial regression.
0237 #' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained
0238 #' optimization.
0239 #' The bound vector size must be equal to 1 for binomial regression,
0240 #' or the number of classes for multinomial regression.
0241 #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
0242 #' label column of string type.
0243 #' Supported options: "skip" (filter out rows with invalid data),
0244 #' "error" (throw an error), "keep" (put invalid data in
0245 #' a special additional bucket, at index numLabels). Default
0246 #' is "error".
0247 #' @param ... additional arguments passed to the method.
0248 #' @return \code{spark.logit} returns a fitted logistic regression model.
0249 #' @rdname spark.logit
0250 #' @aliases spark.logit,SparkDataFrame,formula-method
0251 #' @name spark.logit
0252 #' @examples
0253 #' \dontrun{
0254 #' sparkR.session()
0255 #' # binary logistic regression
0256 #' t <- as.data.frame(Titanic)
0257 #' training <- createDataFrame(t)
0258 #' model <- spark.logit(training, Survived ~ ., regParam = 0.5)
0259 #' summary <- summary(model)
0260 #'
0261 #' # fitted values on training data
0262 #' fitted <- predict(model, training)
0263 #'
0264 #' # save fitted model to input path
0265 #' path <- "path/to/model"
0266 #' write.ml(model, path)
0267 #'
0268 #' # can also read back the saved model and predict
#' # Note that summary does not work on loaded model
0270 #' savedModel <- read.ml(path)
0271 #' summary(savedModel)
0272 #'
0273 #' # binary logistic regression against two classes with
0274 #' # upperBoundsOnCoefficients and upperBoundsOnIntercepts
0275 #' ubc <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
0276 #' model <- spark.logit(training, Species ~ .,
0277 #' upperBoundsOnCoefficients = ubc,
0278 #' upperBoundsOnIntercepts = 1.0)
0279 #'
0280 #' # multinomial logistic regression
0281 #' model <- spark.logit(training, Class ~ ., regParam = 0.5)
0282 #' summary <- summary(model)
0283 #'
0284 #' # multinomial logistic regression with
0285 #' # lowerBoundsOnCoefficients and lowerBoundsOnIntercepts
0286 #' lbc <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
0287 #' lbi <- as.array(c(0.0, 0.0))
0288 #' model <- spark.logit(training, Species ~ ., family = "multinomial",
0289 #' lowerBoundsOnCoefficients = lbc,
0290 #' lowerBoundsOnIntercepts = lbi)
0291 #' }
0292 #' @note spark.logit since 2.1.0
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                   tol = 1E-6, family = "auto", standardization = TRUE,
                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                   lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL,
                   handleInvalid = c("error", "keep", "skip")) {
            # Normalises the R-side arguments, calls the JVM-side
            # LogisticRegressionWrapper "fit" method and wraps the returned reference
            # in a LogisticRegressionModel.

            # deparse() may split a long formula over several strings; join them.
            formula <- paste(deparse(formula), collapse = "")

            # Shape of the coefficient bound matrices. Both stay 0 when no coefficient
            # bounds are supplied.
            row <- 0
            col <- 0

            # An empty weight column name is treated the same as "no weight column".
            if (!is.null(weightCol) && weightCol == "") {
              weightCol <- NULL
            } else if (!is.null(weightCol)) {
              weightCol <- as.character(weightCol)
            }

            if (!is.null(lowerBoundsOnIntercepts)) {
              lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
            }

            if (!is.null(upperBoundsOnIntercepts)) {
              upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
            }

            if (!is.null(lowerBoundsOnCoefficients)) {
              # Use is.matrix() instead of comparing class(): since R 4.0.0 a matrix
              # has class c("matrix", "array"), so `class(x) != "matrix"` is a
              # length-2 condition, which `if` rejects on current R versions.
              if (!is.matrix(lowerBoundsOnCoefficients)) {
                stop("lowerBoundsOnCoefficients must be a matrix.")
              }
              row <- nrow(lowerBoundsOnCoefficients)
              col <- ncol(lowerBoundsOnCoefficients)
              # Sent as a flat array (column-major, as produced by as.vector) together
              # with the row/col counts recorded above.
              lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
            }

            if (!is.null(upperBoundsOnCoefficients)) {
              if (!is.matrix(upperBoundsOnCoefficients)) {
                stop("upperBoundsOnCoefficients must be a matrix.")
              }

              # When both bounds are given they must have the same shape, because a
              # single row/col pair is passed for both.
              if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
                || col != ncol(upperBoundsOnCoefficients))) {
                stop("dimension of upperBoundsOnCoefficients ",
                     "is not the same as lowerBoundsOnCoefficients")
              }

              if (is.null(lowerBoundsOnCoefficients)) {
                row <- nrow(upperBoundsOnCoefficients)
                col <- ncol(upperBoundsOnCoefficients)
              }

              upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
            }

            # Resolve the choice vector default to a single validated option.
            handleInvalid <- match.arg(handleInvalid)

            jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                data@sdf, formula, as.numeric(regParam),
                                as.numeric(elasticNetParam), as.integer(maxIter),
                                as.numeric(tol), as.character(family),
                                as.logical(standardization), as.array(thresholds),
                                weightCol, as.integer(aggregationDepth),
                                as.integer(row), as.integer(col),
                                lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts,
                                handleInvalid)
            new("LogisticRegressionModel", jobj = jobj)
          })
0360
0361 # Get the summary of an LogisticRegressionModel
0362
#' @param object a LogisticRegressionModel fitted by \code{spark.logit}.
0364 #' @return \code{summary} returns summary information of the fitted model, which is a list.
0365 #' The list includes \code{coefficients} (coefficients matrix of the fitted model).
0366 #' @rdname spark.logit
0367 #' @aliases summary,LogisticRegressionModel-method
0368 #' @note summary(LogisticRegressionModel) since 2.1.0
setMethod(
  "summary",
  signature(object = "LogisticRegressionModel"),
  function(object) {
    model <- object@jobj
    featureNames <- callJMethod(model, "rFeatures")
    classLabels <- callJMethod(model, "labels")
    rawCoefficients <- callJMethod(model, "rCoefficients")

    # Coefficients arrive flat; the column count is recovered from the number
    # of features (one row per feature).
    nCol <- length(rawCoefficients) / length(featureNames)
    estimates <- matrix(unlist(rawCoefficients), ncol = nCol)

    # A single column means a binomial model with pivoting; several columns
    # mean a multinomial model without pivoting, one column per label.
    colnames(estimates) <- if (nCol == 1) c("Estimate") else unlist(classLabels)
    rownames(estimates) <- unlist(featureNames)

    list(coefficients = estimates)
  }
)
0388
0389 # Predicted values based on an LogisticRegressionModel model
0390
0391 #' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns the predicted values based on a LogisticRegressionModel.
0393 #' @rdname spark.logit
0394 #' @aliases predict,LogisticRegressionModel,SparkDataFrame-method
0395 #' @note predict(LogisticRegressionModel) since 2.1.0
setMethod(
  f = "predict",
  signature = signature(object = "LogisticRegressionModel"),
  definition = function(object, newData) {
    # Scoring is delegated unchanged to the shared prediction helper.
    predict_internal(object, newData)
  }
)
0400
0401 # Save fitted LogisticRegressionModel to the input path
0402
0403 #' @param path The directory where the model is saved.
0404 #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE
0405 #' which means throw exception if the output path exists.
0406 #'
0407 #' @rdname spark.logit
0408 #' @aliases write.ml,LogisticRegressionModel,character-method
#' @note write.ml(LogisticRegressionModel, character) since 2.1.0
setMethod(
  f = "write.ml",
  signature = signature(object = "LogisticRegressionModel", path = "character"),
  definition = function(object, path, overwrite = FALSE) {
    # Persistence is delegated unchanged to the shared writer helper.
    write_internal(object, path, overwrite)
  }
)
0414
0415 #' Multilayer Perceptron Classification Model
0416 #'
0417 #' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame.
0418 #' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
0419 #' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
0420 #' Only categorical data is supported.
0421 #' For more details, see
0422 #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{
0423 #' Multilayer Perceptron}
0424 #'
0425 #' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
0426 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
0427 #' operators are supported, including '~', '.', ':', '+', and '-'.
0428 #' @param blockSize blockSize parameter.
0429 #' @param layers integer vector containing the number of nodes for each layer.
0430 #' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs".
0431 #' @param maxIter maximum iteration number.
0432 #' @param tol convergence tolerance of iterations.
0433 #' @param stepSize stepSize parameter.
0434 #' @param seed seed parameter for weights initialization.
0435 #' @param initialWeights initialWeights parameter for weights initialization, it should be a
0436 #' numeric vector.
0437 #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
0438 #' label column of string type.
0439 #' Supported options: "skip" (filter out rows with invalid data),
0440 #' "error" (throw an error), "keep" (put invalid data in
0441 #' a special additional bucket, at index numLabels). Default
0442 #' is "error".
0443 #' @param ... additional arguments passed to the method.
0444 #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
0445 #' @rdname spark.mlp
0446 #' @aliases spark.mlp,SparkDataFrame,formula-method
0447 #' @name spark.mlp
0448 #' @seealso \link{read.ml}
0449 #' @examples
0450 #' \dontrun{
0451 #' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
0452 #'
0453 #' # fit a Multilayer Perceptron Classification Model
0454 #' model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3), solver = "l-bfgs",
0455 #' maxIter = 100, tol = 0.5, stepSize = 1, seed = 1,
0456 #' initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
0457 #'
0458 #' # get the summary of the model
0459 #' summary(model)
0460 #'
0461 #' # make predictions
0462 #' predictions <- predict(model, df)
0463 #'
0464 #' # save and load the model
0465 #' path <- "path/to/model"
0466 #' write.ml(model, path)
0467 #' savedModel <- read.ml(path)
0468 #' summary(savedModel)
0469 #' }
0470 #' @note spark.mlp since 2.1.0
setMethod(
  "spark.mlp",
  signature(data = "SparkDataFrame", formula = "formula"),
  function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
           tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL,
           handleInvalid = c("error", "keep", "skip")) {
    # deparse() may split a long formula over several strings; join them.
    formulaString <- paste(deparse(formula), collapse = "")

    # layers must describe at least two layers once NA entries are dropped.
    layersError <- "layers must be a integer vector with length > 1."
    if (is.null(layers)) {
      stop(layersError)
    }
    layers <- as.integer(na.omit(layers))
    if (length(layers) <= 1) {
      stop(layersError)
    }

    # The seed is truncated to an integer and forwarded as a string.
    if (!is.null(seed)) {
      seed <- as.character(as.integer(seed))
    }

    # Initial weights are sent as a numeric array with NA entries removed.
    if (!is.null(initialWeights)) {
      initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
    }

    # Resolve the choice vector default to a single validated option.
    handleInvalid <- match.arg(handleInvalid)

    jobj <- callJStatic(
      "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper", "fit",
      data@sdf,
      formulaString,
      as.integer(blockSize),
      as.array(layers),
      as.character(solver),
      as.integer(maxIter),
      as.numeric(tol),
      as.numeric(stepSize),
      seed,
      initialWeights,
      handleInvalid
    )
    new("MultilayerPerceptronClassificationModel", jobj = jobj)
  }
)
0496
0497 # Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp}
0498
0499 #' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp}
0500 #' @return \code{summary} returns summary information of the fitted model, which is a list.
0501 #' The list includes \code{numOfInputs} (number of inputs), \code{numOfOutputs}
0502 #' (number of outputs), \code{layers} (array of layer sizes including input
0503 #' and output layers), and \code{weights} (the weights of layers).
#' For \code{weights}, it is a numeric vector whose length equals the number of
#' connection weights implied by the architecture (e.g., 112 for an 8-10-2 network).
0506 #' @rdname spark.mlp
0507 #' @aliases summary,MultilayerPerceptronClassificationModel-method
0508 #' @note summary(MultilayerPerceptronClassificationModel) since 2.1.0
setMethod(
  "summary",
  signature(object = "MultilayerPerceptronClassificationModel"),
  function(object) {
    model <- object@jobj
    layerSizes <- unlist(callJMethod(model, "layers"))

    # The first and last layer sizes are reported as the input and output counts.
    list(
      numOfInputs = head(layerSizes, n = 1),
      numOfOutputs = tail(layerSizes, n = 1),
      layers = layerSizes,
      weights = callJMethod(model, "weights")
    )
  }
)
0519
0520 # Makes predictions from a model produced by spark.mlp().
0521
0522 #' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
#' "prediction".
0525 #' @rdname spark.mlp
0526 #' @aliases predict,MultilayerPerceptronClassificationModel-method
0527 #' @note predict(MultilayerPerceptronClassificationModel) since 2.1.0
setMethod(
  f = "predict",
  signature = signature(object = "MultilayerPerceptronClassificationModel"),
  definition = function(object, newData) {
    # Scoring is delegated unchanged to the shared prediction helper.
    predict_internal(object, newData)
  }
)
0532
0533 # Saves the Multilayer Perceptron Classification Model to the input path.
0534
0535 #' @param path the directory where the model is saved.
0536 #' @param overwrite overwrites or not if the output path already exists. Default is FALSE
0537 #' which means throw exception if the output path exists.
0538 #'
0539 #' @rdname spark.mlp
0540 #' @aliases write.ml,MultilayerPerceptronClassificationModel,character-method
0541 #' @seealso \link{write.ml}
0542 #' @note write.ml(MultilayerPerceptronClassificationModel, character) since 2.1.0
setMethod(
  f = "write.ml",
  signature = signature(
    object = "MultilayerPerceptronClassificationModel",
    path = "character"
  ),
  definition = function(object, path, overwrite = FALSE) {
    # Persistence is delegated unchanged to the shared writer helper.
    write_internal(object, path, overwrite)
  }
)
0548
0549 #' Naive Bayes Models
0550 #'
0551 #' \code{spark.naiveBayes} fits a Bernoulli naive Bayes model against a SparkDataFrame.
0552 #' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
0553 #' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
0554 #' Only categorical data is supported.
0555 #'
0556 #' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
0557 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
0558 #' operators are supported, including '~', '.', ':', '+', and '-'.
0559 #' @param smoothing smoothing parameter.
0560 #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
0561 #' label column of string type.
0562 #' Supported options: "skip" (filter out rows with invalid data),
0563 #' "error" (throw an error), "keep" (put invalid data in
0564 #' a special additional bucket, at index numLabels). Default
0565 #' is "error".
0566 #' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}.
0567 #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model.
0568 #' @rdname spark.naiveBayes
0569 #' @aliases spark.naiveBayes,SparkDataFrame,formula-method
0570 #' @name spark.naiveBayes
0571 #' @seealso e1071: \url{https://cran.r-project.org/package=e1071}
0572 #' @examples
0573 #' \dontrun{
0574 #' data <- as.data.frame(UCBAdmissions)
0575 #' df <- createDataFrame(data)
0576 #'
0577 #' # fit a Bernoulli naive Bayes model
0578 #' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0)
0579 #'
0580 #' # get the summary of the model
0581 #' summary(model)
0582 #'
0583 #' # make predictions
0584 #' predictions <- predict(model, df)
0585 #'
0586 #' # save and load the model
0587 #' path <- "path/to/model"
0588 #' write.ml(model, path)
0589 #' savedModel <- read.ml(path)
0590 #' summary(savedModel)
0591 #' }
0592 #' @note spark.naiveBayes since 2.0.0
setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, smoothing = 1.0,
                   handleInvalid = c("error", "keep", "skip")) {
            # Calls the JVM-side NaiveBayesWrapper "fit" method and wraps the
            # returned reference in a NaiveBayesModel.

            # deparse() may split a long formula over several strings; join them.
            formula <- paste(deparse(formula), collapse = "")

            # Resolve the choice vector default to a single validated option.
            handleInvalid <- match.arg(handleInvalid)

            # smoothing is coerced with as.numeric(), like the numeric parameters of
            # the other fit wrappers in this file, so an integer value such as 1L is
            # sent as a double.
            # NOTE(review): presumably the Scala wrapper declares smoothing as
            # Double -- confirm.
            # This wrapper is called with the formula before the data frame.
            jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
                                formula, data@sdf, as.numeric(smoothing), handleInvalid)
            new("NaiveBayesModel", jobj = jobj)
          })
0602
0603 # Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes}
0604
0605 #' @param object a naive Bayes model fitted by \code{spark.naiveBayes}.
0606 #' @return \code{summary} returns summary information of the fitted model, which is a list.
0607 #' The list includes \code{apriori} (the label distribution) and
0608 #' \code{tables} (conditional probabilities given the target label).
0609 #' @rdname spark.naiveBayes
0610 #' @note summary(NaiveBayesModel) since 2.0.0
setMethod(
  "summary",
  signature(object = "NaiveBayesModel"),
  function(object) {
    model <- object@jobj
    featureNames <- callJMethod(model, "features")
    classLabels <- callJMethod(model, "labels")

    # Label distribution as a single-row matrix with one column per label.
    prior <- t(as.matrix(unlist(callJMethod(model, "apriori"))))
    colnames(prior) <- unlist(classLabels)

    # Conditional probabilities: one row per label, one column per feature.
    conditional <- matrix(callJMethod(model, "tables"), nrow = length(classLabels))
    rownames(conditional) <- unlist(classLabels)
    colnames(conditional) <- unlist(featureNames)

    list(apriori = prior, tables = conditional)
  }
)
0625
0626 # Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(),
0627 # similarly to R package e1071's predict.
0628
0629 #' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
#' "prediction".
0632 #' @rdname spark.naiveBayes
0633 #' @note predict(NaiveBayesModel) since 2.0.0
setMethod(
  f = "predict",
  signature = signature(object = "NaiveBayesModel"),
  definition = function(object, newData) {
    # Scoring is delegated unchanged to the shared prediction helper.
    predict_internal(object, newData)
  }
)
0638
0639 # Saves the Bernoulli naive Bayes model to the input path.
0640
0641 #' @param path the directory where the model is saved.
0642 #' @param overwrite overwrites or not if the output path already exists. Default is FALSE
0643 #' which means throw exception if the output path exists.
0644 #'
0645 #' @rdname spark.naiveBayes
0646 #' @seealso \link{write.ml}
0647 #' @note write.ml(NaiveBayesModel, character) since 2.0.0
setMethod(
  f = "write.ml",
  signature = signature(object = "NaiveBayesModel", path = "character"),
  definition = function(object, path, overwrite = FALSE) {
    # Persistence is delegated unchanged to the shared writer helper.
    write_internal(object, path, overwrite)
  }
)