0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements. See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License. You may obtain a copy of the License at
0008 #
0009 # http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
# A set of S3 classes and methods that support the SparkSQL `StructType` and `StructField`
# datatypes. These are used to create and interact with SparkDataFrame schemas.
0020
#' structType
#'
#' Build a structType object, i.e. the schema metadata describing a SparkDataFrame. The result
#' is meant to be handed to createDataFrame and toDF.
#'
#' @param x a structField object (created with the \code{structField} method). Since Spark 2.3,
#'          this can be a DDL-formatted string, which is a comma separated list of field
#'          definitions, e.g., "a INT, b STRING".
#' @param ... additional structField objects
#' @return a structType object
#' @rdname structType
#' @examples
#'\dontrun{
#' schema <- structType(structField("a", "integer"), structField("c", "string"),
#'                      structField("avg", "double"))
#' df1 <- gapply(df, list("a", "c"),
#'               function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#'               schema)
#' schema <- structType("a INT, c STRING, avg DOUBLE")
#' df1 <- gapply(df, list("a", "c"),
#'               function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#'               schema)
#' }
#' @note structType since 1.4.0
structType <- function(x, ...) {
  # S3 generic: the method is chosen from the class of `x`.
  UseMethod(generic = "structType", object = x)
}
0048
#' @rdname structType
#' @method structType jobj
structType.jobj <- function(x, ...) {
  # Wrap a JVM object reference. `fields` is a function, so the field list is
  # only requested from the JVM when it is called; each entry goes through
  # structField().
  wrapper <- list()
  wrapper[["jobj"]] <- x
  wrapper[["fields"]] <- function() {
    lapply(callJMethod(x, "fields"), structField)
  }
  class(wrapper) <- "structType"
  wrapper
}
0057
#' @rdname structType
#' @method structType structField
structType.structField <- function(x, ...) {
  # Every argument, including those passed through `...`, must be a structField.
  allFields <- list(x, ...)
  isField <- vapply(allFields, inherits, logical(1), what = "structField")
  if (any(!isField)) {
    stop("All arguments must be structField objects.")
  }
  # Pass the underlying JVM references to the backend and wrap what comes back.
  jvmFields <- lapply(allFields, function(f) f$jobj)
  structType(callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                         "createStructType",
                         jvmFields))
}
0073
#' @rdname structType
#' @method structType character
structType.character <- function(x, ...) {
  # The method can also be called directly, so the type is re-checked here.
  # Zero-length and NA input can never be a DDL string and is rejected too.
  if (!is.character(x) || length(x) == 0 || anyNA(x)) {
    stop("schema must be a DDL-formatted string.")
  }
  # Exactly one DDL string is accepted, whether the extras arrive through `...`
  # or as further elements of `x`.
  if (length(list(...)) > 0 || length(x) > 1) {
    stop("multiple DDL-formatted strings are not supported")
  }

  stObj <- handledCallJStatic("org.apache.spark.sql.types.StructType",
                              "fromDDL",
                              x)
  structType(stObj)
}
0089
#' Print a Spark StructType.
#'
#' This function prints the contents of a StructType returned from the
#' SparkR JVM backend: a "StructType" header followed by one line per field
#' giving its name, type and nullability.
#'
#' @param x A StructType object
#' @param ... further arguments passed to or from other methods
#' @return \code{x}, invisibly.
#' @note print.structType since 1.4.0
print.structType <- function(x, ...) {
  # vapply keeps the result a character vector even when there are no fields.
  fieldLines <- vapply(x$fields(),
                       function(field) {
                         paste0("|-", "name = \"", field$name(),
                                "\", type = \"", field$dataType.toString(),
                                "\", nullable = ", field$nullable(), "\n")
                       },
                       character(1))
  cat("StructType\n", fieldLines, sep = "")
  # Print methods hand their argument back invisibly.
  invisible(x)
}
0108
#' structField
#'
#' Build a structField object, i.e. the metadata describing one field of a schema.
#'
#' @param x the name of the field.
#' @param ... additional argument(s) passed to the method.
#' @return A structField object.
#' @rdname structField
#' @examples
#'\dontrun{
#' field1 <- structField("a", "integer")
#' field2 <- structField("c", "string")
#' field3 <- structField("avg", "double")
#' schema <- structType(field1, field2, field3)
#' df1 <- gapply(df, list("a", "c"),
#'               function(key, x) { y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE) },
#'               schema)
#' }
#' @note structField since 1.4.0
structField <- function(x, ...) {
  # S3 generic: the method is chosen from the class of `x`.
  UseMethod(generic = "structField", object = x)
}
0131
#' @rdname structField
#' @method structField jobj
structField.jobj <- function(x, ...) {
  # Wrap a JVM object reference. Every accessor is a function that calls into
  # the JVM when invoked; nothing is fetched at construction time.
  fetchDataType <- function() {
    callJMethod(x, "dataType")
  }

  field <- list()
  field[["jobj"]] <- x
  field[["name"]] <- function() {
    callJMethod(x, "name")
  }
  field[["dataType"]] <- fetchDataType
  # The two string forms are read off the data type object itself.
  field[["dataType.toString"]] <- function() {
    callJMethod(fetchDataType(), "toString")
  }
  field[["dataType.simpleString"]] <- function() {
    callJMethod(fetchDataType(), "simpleString")
  }
  field[["nullable"]] <- function() {
    callJMethod(x, "nullable")
  }
  class(field) <- "structField"
  field
}
0144
# Validate a field type string.
#
# A type is accepted when it has a non-NULL entry in PRIMITIVE_TYPES, or when it
# is one of the complex forms array<elem>, map<key,value> (key must be "string"
# or "character") or struct<name:type,...>. Element, value and field types are
# validated recursively.
#
# Returns NULL when the type is accepted and raises an error otherwise.
checkType <- function(type) {
  if (!is.null(PRIMITIVE_TYPES[[type]])) {
    return()
  }

  # Full match plus capture groups of `pattern` in `text`; character(0) when
  # there is no match.
  captures <- function(pattern, text) {
    regmatches(text, regexec(pattern, text))[[1]]
  }

  # Complex types are told apart by their first letter. A branch that does not
  # return falls through to the error at the bottom.
  switch(substr(type, 1, 1),
         a = {
           # Array type
           parts <- captures("^array<(.+)>$", type)
           if (length(parts) >= 2) {
             checkType(parts[2])
             return()
           }
         },
         m = {
           # Map type
           parts <- captures("^map<(.+),(.+)>$", type)
           if (length(parts) >= 3) {
             keyType <- parts[2]
             if (keyType != "string" && keyType != "character") {
               stop("Key type in a map must be string or character")
             }
             checkType(parts[3])
             return()
           }
         },
         s = {
           # Struct type
           parts <- captures("^struct<(.+)>$", type)
           if (length(parts) >= 2) {
             fieldsString <- parts[2]
             lastChar <- substr(fieldsString, nchar(fieldsString), nchar(fieldsString))
             # strsplit does not return the final empty string, so a trailing ","
             # has to be detected separately.
             if (lastChar != ",") {
               for (field in strsplit(fieldsString, ",", fixed = TRUE)[[1]]) {
                 fieldParts <- captures("^(.+):(.+)$", field)
                 if (length(fieldParts) < 3) {
                   # A field that is not "name:type" invalidates the whole
                   # struct; it must not be silently accepted.
                   stop("Unsupported type for SparkDataframe: ", type)
                 }
                 checkType(fieldParts[3])
               }
               return()
             }
           }
         })

  stop("Unsupported type for SparkDataframe: ", type)
}
0204
#' @param type The data type of the field
#' @param nullable A single logical value (TRUE or FALSE) indicating whether or not the field
#'                 is nullable
#' @rdname structField
structField.character <- function(x, type, nullable = TRUE, ...) {
  # is.character()/is.logical() replace comparisons against class(): class()
  # can return several values, which would make the `if` condition non-scalar.
  # Each argument must also be a single value.
  if (!is.character(x) || length(x) != 1) {
    stop("Field name must be a string.")
  }
  if (!is.character(type) || length(type) != 1) {
    stop("Field type must be a string.")
  }
  if (!is.logical(nullable) || length(nullable) != 1 || is.na(nullable)) {
    stop("nullable must be either TRUE or FALSE")
  }

  # Reject unsupported type strings before calling into the JVM.
  checkType(type)

  sfObj <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                       "createStructField",
                       x,
                       type,
                       nullable)
  structField(sfObj)
}
0228
#' Print a Spark StructField.
#'
#' This function prints the contents of a StructField returned from the
#' SparkR JVM backend: its name, type and nullability on a single line.
#'
#' @param x A StructField object
#' @param ... further arguments passed to or from other methods
#' @return \code{x}, invisibly.
#' @note print.structField since 1.4.0
print.structField <- function(x, ...) {
  cat("StructField(name = \"", x$name(),
      "\", type = \"", x$dataType.toString(),
      "\", nullable = ", x$nullable(),
      ")",
      sep = "")
  # Print methods hand their argument back invisibly.
  invisible(x)
}