#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# A set of S3 classes and methods that support the SparkSQL `StructType` and `StructField`
# data types. These are used to create and interact with SparkDataFrame schemas.

#' structType
#'
#' Create a structType object that contains the metadata for a SparkDataFrame. Intended for
#' use with createDataFrame and toDF.
#'
#' @param x a structField object (created with the \code{structField} method). Since Spark 2.3,
#'          this can be a DDL-formatted string, which is a comma-separated list of field
#'          definitions, e.g., "a INT, b STRING".
#' @param ... additional structField objects
#' @return a structType object
#' @rdname structType
#' @examples
#' \dontrun{
#' schema <- structType(structField("a", "integer"), structField("c", "string"),
#'                      structField("avg", "double"))
#' df1 <- gapply(df, list("a", "c"),
#'               function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#'               schema)
#' schema <- structType("a INT, c STRING, avg DOUBLE")
#' df1 <- gapply(df, list("a", "c"),
#'               function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#'               schema)
#' }
#' @note structType since 1.4.0
structType <- function(x, ...) {
  UseMethod("structType", x)
}

#' @rdname structType
#' @method structType jobj
structType.jobj <- function(x, ...) {
  obj <- structure(list(), class = "structType")
  obj$jobj <- x
  obj$fields <- function() { lapply(callJMethod(obj$jobj, "fields"), structField) }
  obj
}
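
# Usage sketch (illustrative only, not part of the package source): a structType wrapping a
# JVM object is what comes back when reading the schema of an existing SparkDataFrame, e.g.
# via SparkR::schema(); the session setup shown here is assumed, not defined in this file.
#   sparkR.session()
#   df <- createDataFrame(mtcars)
#   st <- schema(df)          # structType built around the backing JVM StructType
#   st$fields()               # list of structField objects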

#' @rdname structType
#' @method structType structField
structType.structField <- function(x, ...) {
  fields <- list(x, ...)
  if (!all(sapply(fields, inherits, "structField"))) {
    stop("All arguments must be structField objects.")
  }
  sfObjList <- lapply(fields, function(field) {
    field$jobj
  })
  stObj <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                       "createStructType",
                       sfObjList)
  structType(stObj)
}
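
# Usage sketch (illustrative only; assumes an active SparkR session started with
# sparkR.session(), which is not shown in this file):
#   schema <- structType(structField("name", "string"),
#                        structField("age", "integer"))
#   df <- createDataFrame(data.frame(name = c("a", "b"), age = c(1L, 2L)), schema)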

#' @rdname structType
#' @method structType character
structType.character <- function(x, ...) {
  if (!is.character(x)) {
    stop("schema must be a DDL-formatted string.")
  }
  if (length(list(...)) > 0) {
    stop("multiple DDL-formatted strings are not supported")
  }

  stObj <- handledCallJStatic("org.apache.spark.sql.types.StructType",
                              "fromDDL",
                              x)
  structType(stObj)
}
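
# Usage sketch (illustrative only): a DDL string is parsed on the JVM side by
# StructType.fromDDL, so it uses Spark SQL type names rather than R type names.
#   schema <- structType("name STRING, age INT")
#   print(schema)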

#' Print a Spark StructType.
#'
#' This function prints the contents of a StructType returned from the
#' SparkR JVM backend.
#'
#' @param x A StructType object
#' @param ... further arguments passed to or from other methods
#' @note print.structType since 1.4.0
print.structType <- function(x, ...) {
  cat("StructType\n",
      sapply(x$fields(),
             function(field) {
               paste0("|-", "name = \"", field$name(),
                      "\", type = \"", field$dataType.toString(),
                      "\", nullable = ", field$nullable(), "\n")
             }),
      sep = "")
}
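
# Example of the printed form (sketched by hand from the code above, not captured from a
# live session):
#   > print(structType(structField("a", "integer"), structField("c", "string")))
#   StructType
#   |-name = "a", type = "IntegerType", nullable = TRUE
#   |-name = "c", type = "StringType", nullable = TRUE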

#' structField
#'
#' Create a structField object that contains the metadata for a single field in a schema.
#'
#' @param x the name of the field.
#' @param ... additional argument(s) passed to the method.
#' @return A structField object.
#' @rdname structField
#' @examples
#' \dontrun{
#' field1 <- structField("a", "integer")
#' field2 <- structField("c", "string")
#' field3 <- structField("avg", "double")
#' schema <- structType(field1, field2, field3)
#' df1 <- gapply(df, list("a", "c"),
#'               function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#'               schema)
#' }
#' @note structField since 1.4.0
structField <- function(x, ...) {
  UseMethod("structField", x)
}

#' @rdname structField
#' @method structField jobj
structField.jobj <- function(x, ...) {
  obj <- structure(list(), class = "structField")
  obj$jobj <- x
  obj$name <- function() { callJMethod(x, "name") }
  obj$dataType <- function() { callJMethod(x, "dataType") }
  obj$dataType.toString <- function() { callJMethod(obj$dataType(), "toString") }
  obj$dataType.simpleString <- function() { callJMethod(obj$dataType(), "simpleString") }
  obj$nullable <- function() { callJMethod(x, "nullable") }
  obj
}
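
# Accessor sketch (illustrative only; assumes an active SparkR session, since each accessor
# is a closure that calls back into the JVM object rather than caching values locally):
#   f <- structField("age", "integer", nullable = FALSE)
#   f$name()                   # "age"
#   f$dataType.simpleString()  # "int"
#   f$nullable()               # FALSE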

checkType <- function(type) {
  if (!is.null(PRIMITIVE_TYPES[[type]])) {
    return()
  } else {
    # Check complex types
    firstChar <- substr(type, 1, 1)
    switch(firstChar,
            a = {
              # Array type
              m <- regexec("^array<(.+)>$", type)
              matchedStrings <- regmatches(type, m)
              if (length(matchedStrings[[1]]) >= 2) {
                elemType <- matchedStrings[[1]][2]
                checkType(elemType)
                return()
              }
            },
            m = {
              # Map type
              m <- regexec("^map<(.+),(.+)>$", type)
              matchedStrings <- regmatches(type, m)
              if (length(matchedStrings[[1]]) >= 3) {
                keyType <- matchedStrings[[1]][2]
                if (keyType != "string" && keyType != "character") {
                  stop("Key type in a map must be string or character")
                }
                valueType <- matchedStrings[[1]][3]
                checkType(valueType)
                return()
              }
            },
            s = {
              # Struct type
              m <- regexec("^struct<(.+)>$", type)
              matchedStrings <- regmatches(type, m)
              if (length(matchedStrings[[1]]) >= 2) {
                fieldsString <- matchedStrings[[1]][2]
                # strsplit does not return the final empty string, so check if
                # the final char is ","
                if (substr(fieldsString, nchar(fieldsString), nchar(fieldsString)) != ",") {
                  fields <- strsplit(fieldsString, ",", fixed = TRUE)[[1]]
                  for (field in fields) {
                    m <- regexec("^(.+):(.+)$", field)
                    matchedStrings <- regmatches(field, m)
                    if (length(matchedStrings[[1]]) >= 3) {
                      fieldType <- matchedStrings[[1]][3]
                      checkType(fieldType)
                    } else {
                      break
                    }
                  }
                  return()
                }
              }
            })
  }

  stop("Unsupported type for SparkDataFrame: ", type)
}
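
# Type-string sketch (illustrative only, derived from the code above): checkType accepts the
# primitive names registered in PRIMITIVE_TYPES plus nested array/map/struct strings.
#   checkType("integer")                       # primitive, passes
#   checkType("array<string>")                 # passes
#   checkType("map<string,double>")            # passes; map keys must be string/character
#   checkType("struct<a:integer,b:string>")    # passes
#   checkType("foo")                           # error: Unsupported type for SparkDataFrame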

#' @param type The data type of the field
#' @param nullable A logical value indicating whether or not the field is nullable
#' @rdname structField
structField.character <- function(x, type, nullable = TRUE, ...) {
  if (class(x) != "character") {
    stop("Field name must be a string.")
  }
  if (class(type) != "character") {
    stop("Field type must be a string.")
  }
  if (class(nullable) != "logical") {
    stop("nullable must be either TRUE or FALSE")
  }

  checkType(type)

  sfObj <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                       "createStructField",
                       x,
                       type,
                       nullable)
  structField(sfObj)
}
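
# Usage sketch (illustrative only; assumes an active SparkR session so the JVM call in
# createStructField can succeed):
#   id <- structField("id", "integer", nullable = FALSE)
#   tags <- structField("tags", "array<string>")
#   schema <- structType(id, tags)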

#' Print a Spark StructField.
#'
#' This function prints the contents of a StructField returned from the
#' SparkR JVM backend.
#'
#' @param x A StructField object
#' @param ... further arguments passed to or from other methods
#' @note print.structField since 1.4.0
print.structField <- function(x, ...) {
  cat("StructField(name = \"", x$name(),
      "\", type = \"", x$dataType.toString(),
      "\", nullable = ", x$nullable(),
      ")",
      sep = "")
}
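
# Example of the printed form (sketched by hand from the code above, not captured from a
# live session):
#   > print(structField("avg", "double", nullable = FALSE))
#   StructField(name = "avg", type = "DoubleType", nullable = FALSE)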