pkg/R/mllib_classification.R

0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements.  See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License.  You may obtain a copy of the License at
0008 #
0009 #    http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
# mllib_classification.R: Provides methods for MLlib classification algorithms
#                         (except for tree-based algorithms) integration
0020
#' S4 class that represents a LinearSVCModel
#'
#' @param jobj a Java object reference to the backing Scala LinearSVCModel
#' @note LinearSVCModel since 2.2.0
setClass("LinearSVCModel", slots = c(jobj = "jobj"))
0026
#' S4 class that represents a LogisticRegressionModel
#'
#' @param jobj a Java object reference to the backing Scala LogisticRegressionModel
#' @note LogisticRegressionModel since 2.1.0
setClass("LogisticRegressionModel", slots = c(jobj = "jobj"))
0032
#' S4 class that represents a MultilayerPerceptronClassificationModel
#'
#' @param jobj a Java object reference to the backing Scala MultilayerPerceptronClassifierWrapper
#' @note MultilayerPerceptronClassificationModel since 2.1.0
setClass("MultilayerPerceptronClassificationModel", slots = c(jobj = "jobj"))
0038
#' S4 class that represents a NaiveBayesModel
#'
#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
#' @note NaiveBayesModel since 2.0.0
setClass("NaiveBayesModel", slots = c(jobj = "jobj"))
0044
0045 #' Linear SVM Model
0046 #'
0047 #' Fits a linear SVM model against a SparkDataFrame, similar to svm in e1071 package.
0048 #' Currently only supports binary classification model with linear kernel.
0049 #' Users can print, make predictions on the produced model and save the model to the input path.
0050 #'
0051 #' @param data SparkDataFrame for training.
0052 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
0053 #'                operators are supported, including '~', '.', ':', '+', '-', '*', and '^'.
0054 #' @param regParam The regularization parameter. Only supports L2 regularization currently.
0055 #' @param maxIter Maximum iteration number.
0056 #' @param tol Convergence tolerance of iterations.
0057 #' @param standardization Whether to standardize the training features before fitting the model.
0058 #'                        The coefficients of models will be always returned on the original scale,
0059 #'                        so it will be transparent for users. Note that with/without
0060 #'                        standardization, the models should be always converged to the same
0061 #'                        solution when no regularization is applied.
0062 #' @param threshold The threshold in binary classification applied to the linear model prediction.
0063 #'                  This threshold can be any real number, where Inf will make all predictions 0.0
0064 #'                  and -Inf will make all predictions 1.0.
0065 #' @param weightCol The weight column name.
0066 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the
0067 #'                         dimensions of features or the number of partitions are large, this param
0068 #'                         could be adjusted to a larger size.
0069 #'                         This is an expert parameter. Default value should be good for most cases.
0070 #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
0071 #'                      label column of string type.
0072 #'                      Supported options: "skip" (filter out rows with invalid data),
0073 #'                                         "error" (throw an error), "keep" (put invalid data in
0074 #'                                         a special additional bucket, at index numLabels). Default
0075 #'                                         is "error".
0076 #' @param ... additional arguments passed to the method.
0077 #' @return \code{spark.svmLinear} returns a fitted linear SVM model.
0078 #' @rdname spark.svmLinear
0079 #' @aliases spark.svmLinear,SparkDataFrame,formula-method
0080 #' @name spark.svmLinear
0081 #' @examples
0082 #' \dontrun{
0083 #' sparkR.session()
0084 #' t <- as.data.frame(Titanic)
0085 #' training <- createDataFrame(t)
0086 #' model <- spark.svmLinear(training, Survived ~ ., regParam = 0.5)
0087 #' summary <- summary(model)
0088 #'
0089 #' # fitted values on training data
0090 #' fitted <- predict(model, training)
0091 #'
0092 #' # save fitted model to input path
0093 #' path <- "path/to/model"
0094 #' write.ml(model, path)
0095 #'
0096 #' # can also read back the saved model and predict
#' # Note that summary does not work on loaded model
0098 #' savedModel <- read.ml(path)
0099 #' summary(savedModel)
0100 #' }
0101 #' @note spark.svmLinear since 2.2.0
setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, regParam = 0.0, maxIter = 100, tol = 1E-6, standardization = TRUE,
                   threshold = 0.0, weightCol = NULL, aggregationDepth = 2,
                   handleInvalid = c("error", "keep", "skip")) {
            # The formula is handed to the JVM as one string, so join the deparsed pieces.
            formulaString <- paste(deparse(formula), collapse = "")

            # An empty weight column name means "no weight column"; anything else is
            # passed on as a character value.
            hasWeightCol <- !is.null(weightCol)
            if (hasWeightCol && weightCol == "") {
              weightCol <- NULL
            } else if (hasWeightCol) {
              weightCol <- as.character(weightCol)
            }

            # Resolve the choice vector default to a single validated option.
            handleInvalid <- match.arg(handleInvalid)

            # Fit on the JVM side and wrap the returned reference in the S4 model class.
            new("LinearSVCModel",
                jobj = callJStatic("org.apache.spark.ml.r.LinearSVCWrapper", "fit",
                                   data@sdf, formulaString, as.numeric(regParam),
                                   as.integer(maxIter), as.numeric(tol),
                                   as.logical(standardization), as.numeric(threshold),
                                   weightCol, as.integer(aggregationDepth), handleInvalid))
          })
0122
0123 #  Predicted values based on a LinearSVCModel model
0124
0125 #' @param newData a SparkDataFrame for testing.
0126 #' @return \code{predict} returns the predicted values based on a LinearSVCModel.
0127 #' @rdname spark.svmLinear
0128 #' @aliases predict,LinearSVCModel,SparkDataFrame-method
0129 #' @note predict(LinearSVCModel) since 2.2.0
setMethod("predict",
          signature(object = "LinearSVCModel"),
          function(object, newData) {
            # Delegates to the same prediction helper the other models in this file use.
            predict_internal(object, newData)
          })
0134
0135 #  Get the summary of a LinearSVCModel
0136
0137 #' @param object a LinearSVCModel fitted by \code{spark.svmLinear}.
0138 #' @return \code{summary} returns summary information of the fitted model, which is a list.
0139 #'         The list includes \code{coefficients} (coefficients of the fitted model),
0140 #'         \code{numClasses} (number of classes), \code{numFeatures} (number of features).
0141 #' @rdname spark.svmLinear
0142 #' @aliases summary,LinearSVCModel-method
0143 #' @note summary(LinearSVCModel) since 2.2.0
setMethod("summary", signature(object = "LinearSVCModel"),
          function(object) {
            model <- object@jobj
            featureNames <- unlist(callJMethod(model, "rFeatures"))

            # Single-column matrix of estimates, one row per feature name.
            estimates <- as.matrix(unlist(callJMethod(model, "rCoefficients")))
            colnames(estimates) <- "Estimate"
            rownames(estimates) <- featureNames

            list(coefficients = estimates,
                 numClasses = callJMethod(model, "numClasses"),
                 numFeatures = callJMethod(model, "numFeatures"))
          })
0156
0157 #  Save fitted LinearSVCModel to the input path
0158
0159 #' @param path The directory where the model is saved.
0160 #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE
0161 #'                  which means throw exception if the output path exists.
0162 #'
0163 #' @rdname spark.svmLinear
0164 #' @aliases write.ml,LinearSVCModel,character-method
#' @note write.ml(LinearSVCModel, character) since 2.2.0
setMethod("write.ml",
          signature(object = "LinearSVCModel", path = "character"),
          function(object, path, overwrite = FALSE) {
            # Delegates to the shared writer helper; `overwrite` is forwarded unchanged.
            write_internal(object, path, overwrite)
          })
0170
0171 #' Logistic Regression Model
0172 #'
#' Fits a logistic regression model against a SparkDataFrame. It supports "binomial": Binary
0174 #' logistic regression with pivoting; "multinomial": Multinomial logistic (softmax) regression
0175 #' without pivoting, similar to glmnet. Users can print, make predictions on the produced model
0176 #' and save the model to the input path.
0177 #'
0178 #' @param data SparkDataFrame for training.
0179 #' @param formula A symbolic description of the model to be fitted. Currently only a few formula
0180 #'                operators are supported, including '~', '.', ':', '+', and '-'.
0181 #' @param regParam the regularization parameter.
0182 #' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2
0183 #'                        penalty. For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0,
0184 #'                        the penalty is a combination of L1 and L2. Default is 0.0 which is an
0185 #'                        L2 penalty.
0186 #' @param maxIter maximum iteration number.
0187 #' @param tol convergence tolerance of iterations.
0188 #' @param family the name of family which is a description of the label distribution to be used
0189 #'               in the model.
0190 #'               Supported options:
0191 #'                 \itemize{
0192 #'                   \item{"auto": Automatically select the family based on the number of classes:
0193 #'                           If number of classes == 1 || number of classes == 2, set to "binomial".
0194 #'                           Else, set to "multinomial".}
0195 #'                   \item{"binomial": Binary logistic regression with pivoting.}
0196 #'                   \item{"multinomial": Multinomial logistic (softmax) regression without
0197 #'                           pivoting.}
0198 #'                 }
0199 #' @param standardization whether to standardize the training features before fitting the model.
0200 #'                        The coefficients of models will be always returned on the original scale,
0201 #'                        so it will be transparent for users. Note that with/without
0202 #'                        standardization, the models should be always converged to the same
0203 #'                        solution when no regularization is applied. Default is TRUE, same as
0204 #'                        glmnet.
0205 #' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of
0206 #'                   class label 1 is > threshold, then predict 1, else 0. A high threshold
0207 #'                   encourages the model to predict 0 more often; a low threshold encourages the
0208 #'                   model to predict 1 more often. Note: Setting this with threshold p is
0209 #'                   equivalent to setting thresholds c(1-p, p). In multiclass (or binary)
0210 #'                   classification to adjust the probability of predicting each class. Array must
0211 #'                   have length equal to the number of classes, with values > 0, excepting that
0212 #'                   at most one value may be 0. The class with largest value p/t is predicted,
0213 #'                   where p is the original probability of that class and t is the class's
0214 #'                   threshold.
0215 #' @param weightCol The weight column name.
0216 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the
0217 #'                         dimensions of features or the number of partitions are large, this param
0218 #'                         could be adjusted to a larger size. This is an expert parameter. Default
0219 #'                         value should be good for most cases.
0220 #' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound
0221 #'                                  constrained optimization.
0222 #'                                  The bound matrix must be compatible with the shape (1, number
0223 #'                                  of features) for binomial regression, or (number of classes,
0224 #'                                  number of features) for multinomial regression.
0225 #'                                  It is a R matrix.
0226 #' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound
0227 #'                                  constrained optimization.
0228 #'                                  The bound matrix must be compatible with the shape (1, number
0229 #'                                  of features) for binomial regression, or (number of classes,
0230 #'                                  number of features) for multinomial regression.
0231 #'                                  It is a R matrix.
0232 #' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained
0233 #'                                optimization.
0234 #'                                The bounds vector size must be equal to 1 for binomial regression,
0235 #'                                or the number
0236 #'                                of classes for multinomial regression.
0237 #' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained
0238 #'                                optimization.
0239 #'                                The bound vector size must be equal to 1 for binomial regression,
0240 #'                                or the number of classes for multinomial regression.
0241 #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
0242 #'                      label column of string type.
0243 #'                      Supported options: "skip" (filter out rows with invalid data),
0244 #'                                         "error" (throw an error), "keep" (put invalid data in
0245 #'                                         a special additional bucket, at index numLabels). Default
0246 #'                                         is "error".
0247 #' @param ... additional arguments passed to the method.
0248 #' @return \code{spark.logit} returns a fitted logistic regression model.
0249 #' @rdname spark.logit
0250 #' @aliases spark.logit,SparkDataFrame,formula-method
0251 #' @name spark.logit
0252 #' @examples
0253 #' \dontrun{
0254 #' sparkR.session()
0255 #' # binary logistic regression
0256 #' t <- as.data.frame(Titanic)
0257 #' training <- createDataFrame(t)
0258 #' model <- spark.logit(training, Survived ~ ., regParam = 0.5)
0259 #' summary <- summary(model)
0260 #'
0261 #' # fitted values on training data
0262 #' fitted <- predict(model, training)
0263 #'
0264 #' # save fitted model to input path
0265 #' path <- "path/to/model"
0266 #' write.ml(model, path)
0267 #'
0268 #' # can also read back the saved model and predict
#' # Note that summary does not work on loaded model
0270 #' savedModel <- read.ml(path)
0271 #' summary(savedModel)
0272 #'
0273 #' # binary logistic regression against two classes with
0274 #' # upperBoundsOnCoefficients and upperBoundsOnIntercepts
0275 #' ubc <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
0276 #' model <- spark.logit(training, Species ~ .,
0277 #'                       upperBoundsOnCoefficients = ubc,
0278 #'                       upperBoundsOnIntercepts = 1.0)
0279 #'
0280 #' # multinomial logistic regression
0281 #' model <- spark.logit(training, Class ~ ., regParam = 0.5)
0282 #' summary <- summary(model)
0283 #'
0284 #' # multinomial logistic regression with
0285 #' # lowerBoundsOnCoefficients and lowerBoundsOnIntercepts
0286 #' lbc <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
0287 #' lbi <- as.array(c(0.0, 0.0))
0288 #' model <- spark.logit(training, Species ~ ., family = "multinomial",
0289 #'                      lowerBoundsOnCoefficients = lbc,
0290 #'                      lowerBoundsOnIntercepts = lbi)
0291 #' }
0292 #' @note spark.logit since 2.1.0
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                   tol = 1E-6, family = "auto", standardization = TRUE,
                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                   lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL,
                   handleInvalid = c("error", "keep", "skip")) {
            # The formula is handed to the JVM as one string, so join the deparsed pieces.
            formula <- paste(deparse(formula), collapse = "")
            # Shape of the coefficient-bound matrices; both stay 0 when no bounds are given.
            row <- 0
            col <- 0

            # An empty weight column name means "no weight column".
            if (!is.null(weightCol) && weightCol == "") {
              weightCol <- NULL
            } else if (!is.null(weightCol)) {
              weightCol <- as.character(weightCol)
            }

            if (!is.null(lowerBoundsOnIntercepts)) {
              lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
            }

            if (!is.null(upperBoundsOnIntercepts)) {
              upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
            }

            if (!is.null(lowerBoundsOnCoefficients)) {
              # Use is.matrix() instead of comparing class(): since R 4.0.0 a matrix has
              # class c("matrix", "array"), so `class(x) != "matrix"` is a length-2
              # condition and breaks the `if`.
              if (!is.matrix(lowerBoundsOnCoefficients)) {
                stop("lowerBoundsOnCoefficients must be a matrix.")
              }
              row <- nrow(lowerBoundsOnCoefficients)
              col <- ncol(lowerBoundsOnCoefficients)
              # Flatten the matrix (column-major); its shape travels separately as row/col.
              lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
            }

            if (!is.null(upperBoundsOnCoefficients)) {
              if (!is.matrix(upperBoundsOnCoefficients)) {
                stop("upperBoundsOnCoefficients must be a matrix.")
              }

              if (is.null(lowerBoundsOnCoefficients)) {
                # Only upper bounds were supplied: they define the shape.
                row <- nrow(upperBoundsOnCoefficients)
                col <- ncol(upperBoundsOnCoefficients)
              } else if (row != nrow(upperBoundsOnCoefficients) ||
                           col != ncol(upperBoundsOnCoefficients)) {
                # Both bounds supplied: they must share the shape recorded above.
                stop("dimension of upperBoundsOnCoefficients ",
                     "is not the same as lowerBoundsOnCoefficients")
              }

              upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
            }

            # Resolve the choice vector default to a single validated option.
            handleInvalid <- match.arg(handleInvalid)

            jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                data@sdf, formula, as.numeric(regParam),
                                as.numeric(elasticNetParam), as.integer(maxIter),
                                as.numeric(tol), as.character(family),
                                as.logical(standardization), as.array(thresholds),
                                weightCol, as.integer(aggregationDepth),
                                as.integer(row), as.integer(col),
                                lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts,
                                handleInvalid)
            new("LogisticRegressionModel", jobj = jobj)
          })
0360
#  Get the summary of a LogisticRegressionModel
0362
#' @param object a LogisticRegressionModel fitted by \code{spark.logit}.
0364 #' @return \code{summary} returns summary information of the fitted model, which is a list.
0365 #'         The list includes \code{coefficients} (coefficients matrix of the fitted model).
0366 #' @rdname spark.logit
0367 #' @aliases summary,LogisticRegressionModel-method
0368 #' @note summary(LogisticRegressionModel) since 2.1.0
setMethod("summary", signature(object = "LogisticRegressionModel"),
          function(object) {
            model <- object@jobj
            featureNames <- callJMethod(model, "rFeatures")
            classLabels <- callJMethod(model, "labels")
            rawCoefficients <- callJMethod(model, "rCoefficients")

            # One row per feature; the column count follows from the flat coefficient length.
            numColumns <- length(rawCoefficients) / length(featureNames)
            coefMatrix <- matrix(unlist(rawCoefficients), ncol = numColumns)

            # A single column is a binomial model with pivoting and is labelled "Estimate";
            # several columns are a multinomial model without pivoting, one column per label.
            colnames(coefMatrix) <- if (numColumns == 1) "Estimate" else unlist(classLabels)
            rownames(coefMatrix) <- unlist(featureNames)

            list(coefficients = coefMatrix)
          })
0388
#  Predicted values based on a LogisticRegressionModel model
0390
0391 #' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns the predicted values based on a LogisticRegressionModel.
0393 #' @rdname spark.logit
0394 #' @aliases predict,LogisticRegressionModel,SparkDataFrame-method
0395 #' @note predict(LogisticRegressionModel) since 2.1.0
setMethod("predict",
          signature(object = "LogisticRegressionModel"),
          function(object, newData) {
            # Delegates to the same prediction helper the other models in this file use.
            predict_internal(object, newData)
          })
0400
0401 #  Save fitted LogisticRegressionModel to the input path
0402
0403 #' @param path The directory where the model is saved.
0404 #' @param overwrite Overwrites or not if the output path already exists. Default is FALSE
0405 #'                  which means throw exception if the output path exists.
0406 #'
0407 #' @rdname spark.logit
0408 #' @aliases write.ml,LogisticRegressionModel,character-method
#' @note write.ml(LogisticRegressionModel, character) since 2.1.0
setMethod("write.ml",
          signature(object = "LogisticRegressionModel", path = "character"),
          function(object, path, overwrite = FALSE) {
            # Delegates to the shared writer helper; `overwrite` is forwarded unchanged.
            write_internal(object, path, overwrite)
          })
0414
0415 #' Multilayer Perceptron Classification Model
0416 #'
0417 #' \code{spark.mlp} fits a multi-layer perceptron neural network model against a SparkDataFrame.
0418 #' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
0419 #' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
0420 #' Only categorical data is supported.
0421 #' For more details, see
0422 #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html}{
0423 #'   Multilayer Perceptron}
0424 #'
0425 #' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
0426 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
0427 #'                operators are supported, including '~', '.', ':', '+', and '-'.
0428 #' @param blockSize blockSize parameter.
0429 #' @param layers integer vector containing the number of nodes for each layer.
0430 #' @param solver solver parameter, supported options: "gd" (minibatch gradient descent) or "l-bfgs".
0431 #' @param maxIter maximum iteration number.
0432 #' @param tol convergence tolerance of iterations.
0433 #' @param stepSize stepSize parameter.
0434 #' @param seed seed parameter for weights initialization.
0435 #' @param initialWeights initialWeights parameter for weights initialization, it should be a
0436 #'        numeric vector.
0437 #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
0438 #'                      label column of string type.
0439 #'                      Supported options: "skip" (filter out rows with invalid data),
0440 #'                                         "error" (throw an error), "keep" (put invalid data in
0441 #'                                         a special additional bucket, at index numLabels). Default
0442 #'                                         is "error".
0443 #' @param ... additional arguments passed to the method.
0444 #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
0445 #' @rdname spark.mlp
0446 #' @aliases spark.mlp,SparkDataFrame,formula-method
0447 #' @name spark.mlp
0448 #' @seealso \link{read.ml}
0449 #' @examples
0450 #' \dontrun{
0451 #' df <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
0452 #'
0453 #' # fit a Multilayer Perceptron Classification Model
0454 #' model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 3), solver = "l-bfgs",
0455 #'                    maxIter = 100, tol = 0.5, stepSize = 1, seed = 1,
0456 #'                    initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
0457 #'
0458 #' # get the summary of the model
0459 #' summary(model)
0460 #'
0461 #' # make predictions
0462 #' predictions <- predict(model, df)
0463 #'
0464 #' # save and load the model
0465 #' path <- "path/to/model"
0466 #' write.ml(model, path)
0467 #' savedModel <- read.ml(path)
0468 #' summary(savedModel)
0469 #' }
0470 #' @note spark.mlp since 2.1.0
setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
          function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL,
                   handleInvalid = c("error", "keep", "skip")) {
            # The formula is handed to the JVM as one string, so join the deparsed pieces.
            formulaString <- paste(deparse(formula), collapse = "")

            # Both layer checks below fail with this same message.
            layersError <- "layers must be a integer vector with length > 1."
            if (is.null(layers)) {
              stop(layersError)
            }
            # NA entries are dropped before the remaining layer sizes are counted.
            layerSizes <- as.integer(na.omit(layers))
            if (length(layerSizes) <= 1) {
              stop(layersError)
            }

            # The seed is sent as the string form of its integer value; NULL is passed as-is.
            if (!is.null(seed)) {
              seed <- as.character(as.integer(seed))
            }
            # Initial weights are sent as a numeric array with NA entries removed.
            if (!is.null(initialWeights)) {
              initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
            }

            # Resolve the choice vector default to a single validated option.
            handleInvalid <- match.arg(handleInvalid)

            new("MultilayerPerceptronClassificationModel",
                jobj = callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper",
                                   "fit", data@sdf, formulaString, as.integer(blockSize),
                                   as.array(layerSizes), as.character(solver),
                                   as.integer(maxIter), as.numeric(tol),
                                   as.numeric(stepSize), seed, initialWeights, handleInvalid))
          })
0496
0497 #  Returns the summary of a Multilayer Perceptron Classification Model produced by \code{spark.mlp}
0498
0499 #' @param object a Multilayer Perceptron Classification Model fitted by \code{spark.mlp}
0500 #' @return \code{summary} returns summary information of the fitted model, which is a list.
0501 #'         The list includes \code{numOfInputs} (number of inputs), \code{numOfOutputs}
0502 #'         (number of outputs), \code{layers} (array of layer sizes including input
0503 #'         and output layers), and \code{weights} (the weights of layers).
0504 #'         For \code{weights}, it is a numeric vector with length equal to the expected
0505 #'         given the architecture (i.e., for 8-10-2 network, 112 connection weights).
0506 #' @rdname spark.mlp
0507 #' @aliases summary,MultilayerPerceptronClassificationModel-method
0508 #' @note summary(MultilayerPerceptronClassificationModel) since 2.1.0
setMethod("summary", signature(object = "MultilayerPerceptronClassificationModel"),
          function(object) {
            model <- object@jobj
            layerSizes <- unlist(callJMethod(model, "layers"))

            # The first and last layer sizes are reported as input and output counts.
            list(numOfInputs = head(layerSizes, n = 1),
                 numOfOutputs = tail(layerSizes, n = 1),
                 layers = layerSizes,
                 weights = callJMethod(model, "weights"))
          })
0519
0520 #  Makes predictions from a model produced by spark.mlp().
0521
0522 #' @param newData a SparkDataFrame for testing.
#' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
0524 #' "prediction".
0525 #' @rdname spark.mlp
0526 #' @aliases predict,MultilayerPerceptronClassificationModel-method
0527 #' @note predict(MultilayerPerceptronClassificationModel) since 2.1.0
setMethod("predict",
          signature(object = "MultilayerPerceptronClassificationModel"),
          function(object, newData) {
            # Delegates to the same prediction helper the other models in this file use.
            predict_internal(object, newData)
          })
0532
0533 #  Saves the Multilayer Perceptron Classification Model to the input path.
0534
0535 #' @param path the directory where the model is saved.
0536 #' @param overwrite overwrites or not if the output path already exists. Default is FALSE
0537 #'                  which means throw exception if the output path exists.
0538 #'
0539 #' @rdname spark.mlp
0540 #' @aliases write.ml,MultilayerPerceptronClassificationModel,character-method
0541 #' @seealso \link{write.ml}
0542 #' @note write.ml(MultilayerPerceptronClassificationModel, character) since 2.1.0
setMethod("write.ml",
          signature(object = "MultilayerPerceptronClassificationModel", path = "character"),
          function(object, path, overwrite = FALSE) {
            # Delegates to the shared writer helper; `overwrite` is forwarded unchanged.
            write_internal(object, path, overwrite)
          })
0548
0549 #' Naive Bayes Models
0550 #'
0551 #' \code{spark.naiveBayes} fits a Bernoulli naive Bayes model against a SparkDataFrame.
0552 #' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
0553 #' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
0554 #' Only categorical data is supported.
0555 #'
0556 #' @param data a \code{SparkDataFrame} of observations and labels for model fitting.
0557 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
0558 #'               operators are supported, including '~', '.', ':', '+', and '-'.
0559 #' @param smoothing smoothing parameter.
0560 #' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and
0561 #'                      label column of string type.
0562 #'                      Supported options: "skip" (filter out rows with invalid data),
0563 #'                                         "error" (throw an error), "keep" (put invalid data in
0564 #'                                         a special additional bucket, at index numLabels). Default
0565 #'                                         is "error".
0566 #' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}.
0567 #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model.
0568 #' @rdname spark.naiveBayes
0569 #' @aliases spark.naiveBayes,SparkDataFrame,formula-method
0570 #' @name spark.naiveBayes
0571 #' @seealso e1071: \url{https://cran.r-project.org/package=e1071}
0572 #' @examples
0573 #' \dontrun{
0574 #' data <- as.data.frame(UCBAdmissions)
0575 #' df <- createDataFrame(data)
0576 #'
0577 #' # fit a Bernoulli naive Bayes model
0578 #' model <- spark.naiveBayes(df, Admit ~ Gender + Dept, smoothing = 0)
0579 #'
0580 #' # get the summary of the model
0581 #' summary(model)
0582 #'
0583 #' # make predictions
0584 #' predictions <- predict(model, df)
0585 #'
0586 #' # save and load the model
0587 #' path <- "path/to/model"
0588 #' write.ml(model, path)
0589 #' savedModel <- read.ml(path)
0590 #' summary(savedModel)
0591 #' }
0592 #' @note spark.naiveBayes since 2.0.0
0593 setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
0594           function(data, formula, smoothing = 1.0, handleInvalid = c("error", "keep", "skip")) {
0595             # Collapse the deparsed formula into one string for the JVM-side wrapper.
0596             formulaStr <- paste(deparse(formula), collapse = "")
0597             invalidMode <- match.arg(handleInvalid)
0598             wrapper <- "org.apache.spark.ml.r.NaiveBayesWrapper"
0599             fitted <- callJStatic(wrapper, "fit", formulaStr, data@sdf, smoothing, invalidMode)
0600             new("NaiveBayesModel", jobj = fitted)
0601           })
0602
0603 #  Returns the summary of a naive Bayes model produced by \code{spark.naiveBayes}
0604
0605 #' @param object a naive Bayes model fitted by \code{spark.naiveBayes}.
0606 #' @return \code{summary} returns summary information of the fitted model, which is a list.
0607 #'         The list includes \code{apriori} (the label distribution) and
0608 #'         \code{tables} (conditional probabilities given the target label).
0609 #' @rdname spark.naiveBayes
0610 #' @note summary(NaiveBayesModel) since 2.0.0
0611 setMethod("summary", signature(object = "NaiveBayesModel"),
0612           function(object) {
0613             wrapper <- object@jobj
0614             featureNames <- callJMethod(wrapper, "features")
0615             labelNames <- callJMethod(wrapper, "labels")
0616             # Label distribution as a single-row matrix whose columns are named by label.
0617             priors <- t(as.matrix(unlist(callJMethod(wrapper, "apriori"))))
0618             colnames(priors) <- unlist(labelNames)
0619             # Conditional-probability table: rows are named by label, columns by feature.
0620             condProbs <- matrix(callJMethod(wrapper, "tables"), nrow = length(labelNames))
0621             rownames(condProbs) <- unlist(labelNames)
0622             colnames(condProbs) <- unlist(featureNames)
0623             list(apriori = priors, tables = condProbs)
0624           })
0625
0626 #  Makes predictions from a naive Bayes model or a model produced by spark.naiveBayes(),
0627 #  similarly to R package e1071's predict.
0628
0629 #' @param newData a SparkDataFrame for testing.
0630 #' @return \code{predict} returns a SparkDataFrame containing predicted labels in a column named
0631 #' "prediction".
0632 #' @rdname spark.naiveBayes
0633 #' @note predict(NaiveBayesModel) since 2.0.0
0634 setMethod(f = "predict",
0635           signature = signature(object = "NaiveBayesModel"),
0636           # Scoring is handed off to the predict_internal helper used across model classes.
0637           definition = function(object, newData) predict_internal(object, newData))
0638
0639 #  Saves the Bernoulli naive Bayes model to the input path.
0640
0641 #' @param path the directory where the model is saved.
0642 #' @param overwrite whether to overwrite the output path if it already exists. Default is FALSE,
0643 #'                  which means an exception is thrown if the output path exists.
0644 #'
0645 #' @rdname spark.naiveBayes
0646 #' @seealso \link{write.ml}
0647 #' @note write.ml(NaiveBayesModel, character) since 2.0.0
0648 setMethod(f = "write.ml", signature = signature(object = "NaiveBayesModel", path = "character"),
0649           definition = function(object, path, overwrite = FALSE) {
0650             write_internal(object, path, overwrite)  # shared persistence helper does the work
0651           })