0001 #
0002 # Licensed to the Apache Software Foundation (ASF) under one or more
0003 # contributor license agreements. See the NOTICE file distributed with
0004 # this work for additional information regarding copyright ownership.
0005 # The ASF licenses this file to You under the Apache License, Version 2.0
0006 # (the "License"); you may not use this file except in compliance with
0007 # the License. You may obtain a copy of the License at
0008 #
0009 # http://www.apache.org/licenses/LICENSE-2.0
0010 #
0011 # Unless required by applicable law or agreed to in writing, software
0012 # distributed under the License is distributed on an "AS IS" BASIS,
0013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 # See the License for the specific language governing permissions and
0015 # limitations under the License.
0016 #
0017
0018 #' @include generics.R column.R
0019 NULL
0020
0021 #' Aggregate functions for Column operations
0022 #'
0023 #' Aggregate functions defined for \code{Column}.
0024 #'
0025 #' @param x Column to compute on.
0026 #' @param y,na.rm,use currently not used.
0027 #' @param ... additional argument(s). For example, it could be used to pass additional Columns.
0028 #' @name column_aggregate_functions
0029 #' @rdname column_aggregate_functions
0030 #' @family aggregate functions
0031 #' @examples
0032 #' \dontrun{
0033 #' # Dataframe used throughout this doc
0034 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))}
0035 NULL
0036
0037 #' Date time functions for Column operations
0038 #'
0039 #' Date time functions defined for \code{Column}.
0040 #'
0041 #' @param x Column to compute on. In \code{window}, it must be a time Column of
0042 #' \code{TimestampType}. This is not used with \code{current_date} and
0043 #' \code{current_timestamp}
0044 #' @param format The format for the given dates or timestamps in Column \code{x}. See the
0045 #' format used in the following methods:
0046 #' \itemize{
0047 #' \item \code{to_date} and \code{to_timestamp}: it is the string to use to parse
0048 #' Column \code{x} to DateType or TimestampType.
0049 #' \item \code{trunc}: it is the string to use to specify the truncation method.
0050 #' For example, "year", "yyyy", "yy" for truncate by year, or "month", "mon",
0051 #' "mm" for truncate by month.
0052 #' \item \code{date_trunc}: it is similar with \code{trunc}'s but additionally
0053 #' supports "day", "dd", "second", "minute", "hour", "week" and "quarter".
0054 #' }
0055 #' @param ... additional argument(s).
0056 #' @name column_datetime_functions
0057 #' @rdname column_datetime_functions
0058 #' @family date time functions
0059 #' @examples
0060 #' \dontrun{
0061 #' dts <- c("2005-01-02 18:47:22",
0062 #' "2005-12-24 16:30:58",
0063 #' "2005-10-28 07:30:05",
0064 #' "2005-12-28 07:01:05",
0065 #' "2006-01-24 00:01:10")
0066 #' y <- c(2.0, 2.2, 3.4, 2.5, 1.8)
0067 #' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))}
0068 NULL
0069
0070 #' Date time arithmetic functions for Column operations
0071 #'
0072 #' Date time arithmetic functions defined for \code{Column}.
0073 #'
0074 #' @param y Column to compute on.
0075 #' @param x For class \code{Column}, it is the column used to perform arithmetic operations
0076 #' with column \code{y}. For class \code{numeric}, it is the number of months or
0077 #' days to be added to or subtracted from \code{y}. For class \code{character}, it is
0078 #' \itemize{
0079 #' \item \code{date_format}: date format specification.
0080 #' \item \code{from_utc_timestamp}, \code{to_utc_timestamp}: A string detailing
0081 #' the time zone ID that the input should be adjusted to. It should be in the format
0082 #' of either region-based zone IDs or zone offsets. Region IDs must have the form
0083 #' 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in the format
0084 #' (+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are supported
0085 #' as aliases of '+00:00'. Other short names are not recommended to use
0086 #' because they can be ambiguous.
0087 #' \item \code{next_day}: day of the week string.
0088 #' }
0089 #' @param ... additional argument(s).
0090 #' \itemize{
0091 #' \item \code{months_between}, this contains an optional parameter to specify whether
0092 #' the result is rounded off to 8 digits.
0093 #' }
0094 #'
0095 #' @name column_datetime_diff_functions
0096 #' @rdname column_datetime_diff_functions
0097 #' @family date time functions
0098 #' @examples
0099 #' \dontrun{
0100 #' dts <- c("2005-01-02 18:47:22",
0101 #' "2005-12-24 16:30:58",
0102 #' "2005-10-28 07:30:05",
0103 #' "2005-12-28 07:01:05",
0104 #' "2006-01-24 00:01:10")
0105 #' y <- c(2.0, 2.2, 3.4, 2.5, 1.8)
0106 #' df <- createDataFrame(data.frame(time = as.POSIXct(dts), y = y))}
0107 NULL
0108
0109 #' Math functions for Column operations
0110 #'
0111 #' Math functions defined for \code{Column}.
0112 #'
0113 #' @param x Column to compute on. In \code{shiftLeft}, \code{shiftRight} and
0114 #' \code{shiftRightUnsigned}, this is the number of bits to shift.
0115 #' @param y Column to compute on.
0116 #' @param ... additional argument(s).
0117 #' @name column_math_functions
0118 #' @rdname column_math_functions
0119 #' @family math functions
0120 #' @examples
0121 #' \dontrun{
0122 #' # Dataframe used throughout this doc
0123 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
0124 #' tmp <- mutate(df, v1 = log(df$mpg), v2 = cbrt(df$disp),
0125 #' v3 = bround(df$wt, 1), v4 = bin(df$cyl),
0126 #' v5 = hex(df$wt), v6 = degrees(df$gear),
0127 #' v7 = atan2(df$cyl, df$am), v8 = hypot(df$cyl, df$am),
0128 #' v9 = pmod(df$hp, df$cyl), v10 = shiftLeft(df$disp, 1),
0129 #' v11 = conv(df$hp, 10, 16), v12 = sign(df$vs - 0.5),
0130 #' v13 = sqrt(df$disp), v14 = ceil(df$wt))
0131 #' head(tmp)}
0132 NULL
0133
0134 #' String functions for Column operations
0135 #'
0136 #' String functions defined for \code{Column}.
0137 #'
0138 #' @param x Column to compute on except in the following methods:
0139 #' \itemize{
0140 #' \item \code{instr}: \code{character}, the substring to check. See 'Details'.
0141 #' \item \code{format_number}: \code{numeric}, the number of decimal place to
0142 #' format to. See 'Details'.
0143 #' }
0144 #' @param y Column to compute on.
0145 #' @param pos In \itemize{
0146 #' \item \code{locate}: a start position of search.
0147 #' \item \code{overlay}: a start position for replacement.
0148 #' }
0149 #' @param len In \itemize{
0150 #' \item \code{lpad} the maximum length of each output result.
0151 #' \item \code{overlay} a number of bytes to replace.
0152 #' }
0153 #' @param ... additional Columns.
0154 #' @name column_string_functions
0155 #' @rdname column_string_functions
0156 #' @family string functions
0157 #' @examples
0158 #' \dontrun{
0159 #' # Dataframe used throughout this doc
0160 #' df <- createDataFrame(as.data.frame(Titanic, stringsAsFactors = FALSE))}
0161 NULL
0162
0163 #' Non-aggregate functions for Column operations
0164 #'
0165 #' Non-aggregate functions defined for \code{Column}.
0166 #'
0167 #' @param x Column to compute on. In \code{lit}, it is a literal value or a Column.
0168 #' In \code{expr}, it contains an expression character object to be parsed.
0169 #' @param y Column to compute on.
0170 #' @param ... additional Columns.
0171 #' @name column_nonaggregate_functions
0172 #' @rdname column_nonaggregate_functions
0173 #' @seealso coalesce,SparkDataFrame-method
0174 #' @family non-aggregate functions
0175 #' @examples
0176 #' \dontrun{
0177 #' # Dataframe used throughout this doc
0178 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))}
0179 NULL
0180
0181 #' Miscellaneous functions for Column operations
0182 #'
0183 #' Miscellaneous functions defined for \code{Column}.
0184 #'
0185 #' @param x Column to compute on. In \code{sha2}, it is one of 224, 256, 384, or 512.
0186 #' @param y Column to compute on.
0187 #' @param ... additional Columns.
0188 #' @name column_misc_functions
0189 #' @rdname column_misc_functions
0190 #' @family misc functions
0191 #' @examples
0192 #' \dontrun{
0193 #' # Dataframe used throughout this doc
0194 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars)[, 1:2])
0195 #' tmp <- mutate(df, v1 = crc32(df$model), v2 = hash(df$model),
0196 #' v3 = hash(df$model, df$mpg), v4 = md5(df$model),
0197 #' v5 = sha1(df$model), v6 = sha2(df$model, 256))
0198 #' head(tmp)}
0199 NULL
0200
0201 #' Collection functions for Column operations
0202 #'
0203 #' Collection functions defined for \code{Column}.
0204 #'
0205 #' @param x Column to compute on. Note the difference in the following methods:
0206 #' \itemize{
0207 #' \item \code{to_json}: it is the column containing the struct, array of the structs,
0208 #' the map or array of maps.
0209 #' \item \code{to_csv}: it is the column containing the struct.
0210 #' \item \code{from_json}: it is the column containing the JSON string.
0211 #' \item \code{from_csv}: it is the column containing the CSV string.
0212 #' }
0213 #' @param y Column to compute on.
0214 #' @param value A value to compute on.
0215 #' \itemize{
0216 #' \item \code{array_contains}: a value to be checked if contained in the column.
0217 #' \item \code{array_position}: a value to locate in the given array.
0218 #' \item \code{array_remove}: a value to remove in the given array.
0219 #' }
0220 #' @param schema
0221 #' \itemize{
0222 #' \item \code{from_json}: a structType object to use as the schema to use
0223 #' when parsing the JSON string. Since Spark 2.3, the DDL-formatted string is
0224 #' also supported for the schema. Since Spark 3.0, \code{schema_of_json} or
0225 #' the DDL-formatted string literal can also be accepted.
0226 #' \item \code{from_csv}: a structType object, DDL-formatted string or \code{schema_of_csv}
0227 #' }
0228 #' @param ... additional argument(s).
0229 #' \itemize{
0230 #' \item \code{to_json}, \code{from_json} and \code{schema_of_json}: this contains
0231 #' additional named properties to control how it is converted and accepts the
0232 #' same options as the JSON data source.
0233 #' \item \code{to_json}: it supports the "pretty" option which enables pretty
0234 #' JSON generation.
0235 #' \item \code{to_csv}, \code{from_csv} and \code{schema_of_csv}: this contains
0236 #' additional named properties to control how it is converted and accepts the
0237 #' same options as the CSV data source.
0238 #' \item \code{arrays_zip}, this contains additional Columns of arrays to be merged.
0239 #' \item \code{map_concat}, this contains additional Columns of maps to be unioned.
0240 #' }
0241 #' @name column_collection_functions
0242 #' @rdname column_collection_functions
0243 #' @family collection functions
0244 #' @examples
0245 #' \dontrun{
0246 #' # Dataframe used throughout this doc
0247 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
0248 #' tmp <- mutate(df, v1 = create_array(df$mpg, df$cyl, df$hp))
0249 #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1), shuffle(tmp$v1)))
0250 #' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1)))
0251 #' head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1)))
0252 #' head(select(tmp, reverse(tmp$v1), array_remove(tmp$v1, 21)))
0253 #' tmp2 <- mutate(tmp, v2 = explode(tmp$v1))
0254 #' head(tmp2)
0255 #' head(select(tmp, posexplode(tmp$v1)))
0256 #' head(select(tmp, slice(tmp$v1, 2L, 2L)))
0257 #' head(select(tmp, sort_array(tmp$v1)))
0258 #' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))
0259 #' tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl))
0260 #' head(select(tmp3, map_entries(tmp3$v3), map_keys(tmp3$v3), map_values(tmp3$v3)))
0261 #' head(select(tmp3, element_at(tmp3$v3, "Valiant"), map_concat(tmp3$v3, tmp3$v3)))
0262 #' tmp4 <- mutate(df, v4 = create_array(df$mpg, df$cyl), v5 = create_array(df$cyl, df$hp))
0263 #' head(select(tmp4, concat(tmp4$v4, tmp4$v5), arrays_overlap(tmp4$v4, tmp4$v5)))
0264 #' head(select(tmp4, array_except(tmp4$v4, tmp4$v5), array_intersect(tmp4$v4, tmp4$v5)))
0265 #' head(select(tmp4, array_union(tmp4$v4, tmp4$v5)))
0266 #' head(select(tmp4, arrays_zip(tmp4$v4, tmp4$v5)))
0267 #' head(select(tmp, concat(df$mpg, df$cyl, df$hp)))
0268 #' tmp5 <- mutate(df, v6 = create_array(df$model, df$model))
0269 #' head(select(tmp5, array_join(tmp5$v6, "#"), array_join(tmp5$v6, "#", "NULL")))
0270 #' tmp6 <- mutate(df, v7 = create_array(create_array(df$model, df$model)))
0271 #' head(select(tmp6, flatten(tmp6$v7)))
0272 #' tmp7 <- mutate(df, v8 = create_array(df$model, df$cyl), v9 = create_array(df$model, df$hp))
0273 #' head(select(tmp7, map_from_arrays(tmp7$v8, tmp7$v9)))
0274 #' tmp8 <- mutate(df, v10 = create_array(struct(df$model, df$cyl)))
0275 #' head(select(tmp8, map_from_entries(tmp8$v10)))}
0276 NULL
0277
0278 #' Window functions for Column operations
0279 #'
0280 #' Window functions defined for \code{Column}.
0281 #'
0282 #' @param x In \code{lag} and \code{lead}, it is the column as a character string or a Column
0283 #' to compute on. In \code{ntile}, it is the number of ntile groups.
0284 #' @param offset In \code{lag}, the number of rows back from the current row from which to obtain
0285 #' a value. In \code{lead}, the number of rows after the current row from which to
0286 #' obtain a value. If not specified, the default is 1.
0287 #' @param defaultValue (optional) default to use when the offset row does not exist.
0288 #' @param ... additional argument(s).
0289 #' @name column_window_functions
0290 #' @rdname column_window_functions
0291 #' @family window functions
0292 #' @examples
0293 #' \dontrun{
0294 #' # Dataframe used throughout this doc
0295 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
0296 #' ws <- orderBy(windowPartitionBy("am"), "hp")
0297 #' tmp <- mutate(df, dist = over(cume_dist(), ws), dense_rank = over(dense_rank(), ws),
0298 #' lag = over(lag(df$mpg), ws), lead = over(lead(df$mpg, 1), ws),
0299 #' percent_rank = over(percent_rank(), ws),
0300 #' rank = over(rank(), ws), row_number = over(row_number(), ws))
0301 #' # Get ntile group id (1-4) for hp
0302 #' tmp <- mutate(tmp, ntile = over(ntile(4), ws))
0303 #' head(tmp)}
0304 NULL
0305
0306 #' @details
0307 #' \code{lit}: A new Column is created to represent the literal value.
0308 #' If the parameter is a Column, it is returned unchanged.
0309 #'
0310 #' @rdname column_nonaggregate_functions
0311 #' @aliases lit lit,ANY-method
0312 #' @examples
0313 #'
0314 #' \dontrun{
0315 #' tmp <- mutate(df, v1 = lit(df$mpg), v2 = lit("x"), v3 = lit("2015-01-01"),
0316 #' v4 = negate(df$mpg), v5 = expr('length(model)'),
0317 #' v6 = greatest(df$vs, df$am), v7 = least(df$vs, df$am),
0318 #' v8 = column("mpg"))
0319 #' head(tmp)}
0320 #' @note lit since 1.5.0
setMethod("lit", signature("ANY"),
          function(x) {
            # If x is already a Column, forward its underlying Java reference;
            # otherwise pass the literal value itself to the JVM.
            # Use inherits() instead of comparing class() to a string, which is
            # fragile for S4 objects with multiple classes.
            value <- if (inherits(x, "Column")) x@jc else x
            jc <- callJStatic("org.apache.spark.sql.functions", "lit", value)
            column(jc)
          })
0328
0329 #' @details
0330 #' \code{abs}: Computes the absolute value.
0331 #'
0332 #' @rdname column_math_functions
0333 #' @aliases abs abs,Column-method
0334 #' @note abs since 1.5.0
setMethod("abs",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "abs" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "abs", x@jc))
          })
0341
0342 #' @details
0343 #' \code{acos}: Returns the inverse cosine of the given value,
0344 #' as if computed by \code{java.lang.Math.acos()}
0345 #'
0346 #' @rdname column_math_functions
0347 #' @aliases acos acos,Column-method
0348 #' @note acos since 1.5.0
setMethod("acos",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "acos" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "acos", x@jc))
          })
0355
0356 #' @details
0357 #' \code{approx_count_distinct}: Returns the approximate number of distinct items in a group.
0358 #'
0359 #' @rdname column_aggregate_functions
0360 #' @aliases approx_count_distinct approx_count_distinct,Column-method
0361 #' @examples
0362 #'
0363 #' \dontrun{
0364 #' head(select(df, approx_count_distinct(df$gear)))
0365 #' head(select(df, approx_count_distinct(df$gear, 0.02)))
0366 #' head(select(df, countDistinct(df$gear, df$cyl)))
0367 #' head(select(df, n_distinct(df$gear)))
0368 #' head(distinct(select(df, "gear")))}
0369 #' @note approx_count_distinct(Column) since 3.0.0
setMethod("approx_count_distinct",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "approx_count_distinct" aggregate and
            # wrap the resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions",
                               "approx_count_distinct", x@jc))
          })
0376
0377 #' @details
0378 #' \code{approxCountDistinct}: Returns the approximate number of distinct items in a group.
0379 #'
0380 #' @rdname column_aggregate_functions
0381 #' @aliases approxCountDistinct approxCountDistinct,Column-method
0382 #' @note approxCountDistinct(Column) since 1.4.0
setMethod("approxCountDistinct",
          signature(x = "Column"),
          function(x) {
            # Deprecated spelling; warn, then call the same JVM-side
            # "approx_count_distinct" aggregate as the replacement.
            .Deprecated("approx_count_distinct")
            column(callJStatic("org.apache.spark.sql.functions",
                               "approx_count_distinct", x@jc))
          })
0390
0391 #' @details
0392 #' \code{ascii}: Computes the numeric value of the first character of the string column,
0393 #' and returns the result as an int column.
0394 #'
0395 #' @rdname column_string_functions
0396 #' @aliases ascii ascii,Column-method
0397 #' @examples
0398 #'
0399 #' \dontrun{
0400 #' head(select(df, ascii(df$Class), ascii(df$Sex)))}
0401 #' @note ascii since 1.5.0
setMethod("ascii",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "ascii" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "ascii", x@jc))
          })
0408
0409 #' @details
0410 #' \code{asin}: Returns the inverse sine of the given value,
0411 #' as if computed by \code{java.lang.Math.asin()}
0412 #'
0413 #' @rdname column_math_functions
0414 #' @aliases asin asin,Column-method
0415 #' @note asin since 1.5.0
setMethod("asin",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "asin" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "asin", x@jc))
          })
0422
0423 #' @details
0424 #' \code{atan}: Returns the inverse tangent of the given value,
0425 #' as if computed by \code{java.lang.Math.atan()}
0426 #'
0427 #' @rdname column_math_functions
0428 #' @aliases atan atan,Column-method
0429 #' @note atan since 1.5.0
setMethod("atan",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "atan" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "atan", x@jc))
          })
0436
0437 #' avg
0438 #'
0439 #' Aggregate function: returns the average of the values in a group.
0440 #'
0441 #' @rdname avg
0442 #' @name avg
0443 #' @family aggregate functions
0444 #' @aliases avg,Column-method
0445 #' @examples \dontrun{avg(df$c)}
0446 #' @note avg since 1.4.0
setMethod("avg",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "avg" aggregate function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "avg", x@jc))
          })
0453
0454 #' @details
0455 #' \code{base64}: Computes the BASE64 encoding of a binary column and returns it as
0456 #' a string column. This is the reverse of unbase64.
0457 #'
0458 #' @rdname column_string_functions
0459 #' @aliases base64 base64,Column-method
0460 #' @examples
0461 #'
0462 #' \dontrun{
0463 #' tmp <- mutate(df, s1 = encode(df$Class, "UTF-8"))
0464 #' str(tmp)
0465 #' tmp2 <- mutate(tmp, s2 = base64(tmp$s1), s3 = decode(tmp$s1, "UTF-8"),
0466 #' s4 = soundex(tmp$Sex))
0467 #' head(tmp2)
0468 #' head(select(tmp2, unbase64(tmp2$s2)))}
0469 #' @note base64 since 1.5.0
setMethod("base64",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "base64" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "base64", x@jc))
          })
0476
0477 #' @details
0478 #' \code{bin}: Returns the string representation of the binary value
0479 #' of the given long column. For example, bin("12") returns "1100".
0480 #'
0481 #' @rdname column_math_functions
0482 #' @aliases bin bin,Column-method
0483 #' @note bin since 1.5.0
setMethod("bin",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "bin" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "bin", x@jc))
          })
0490
0491 #' @details
0492 #' \code{bitwiseNOT}: Computes bitwise NOT.
0493 #'
0494 #' @rdname column_nonaggregate_functions
0495 #' @aliases bitwiseNOT bitwiseNOT,Column-method
0496 #' @examples
0497 #'
0498 #' \dontrun{
0499 #' head(select(df, bitwiseNOT(cast(df$vs, "int"))))}
0500 #' @note bitwiseNOT since 1.5.0
setMethod("bitwiseNOT",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "bitwiseNOT" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "bitwiseNOT", x@jc))
          })
0507
0508 #' @details
0509 #' \code{cbrt}: Computes the cube-root of the given value.
0510 #'
0511 #' @rdname column_math_functions
0512 #' @aliases cbrt cbrt,Column-method
0513 #' @note cbrt since 1.4.0
setMethod("cbrt",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "cbrt" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "cbrt", x@jc))
          })
0520
0521 #' @details
0522 #' \code{ceil}: Computes the ceiling of the given value.
0523 #'
0524 #' @rdname column_math_functions
0525 #' @aliases ceil ceil,Column-method
0526 #' @note ceil since 1.5.0
setMethod("ceil",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "ceil" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "ceil", x@jc))
          })
0533
0534 #' @details
0535 #' \code{ceiling}: Alias for \code{ceil}.
0536 #'
0537 #' @rdname column_math_functions
0538 #' @aliases ceiling ceiling,Column-method
0539 #' @note ceiling since 1.5.0
setMethod("ceiling",
          signature(x = "Column"),
          function(x) {
            # Base-R spelling: simply forward to the SparkR ceil() method.
            ceil(x)
          })
0545
0546 #' @details
0547 #' \code{coalesce}: Returns the first column that is not NA, or NA if all inputs are.
0548 #'
0549 #' @rdname column_nonaggregate_functions
0550 #' @aliases coalesce,Column-method
0551 #' @note coalesce(Column) since 2.1.1
setMethod("coalesce",
          signature(x = "Column"),
          function(x, ...) {
            # Validate every argument with inherits() (robust for S4 objects
            # with multiple classes) rather than comparing class() to a string,
            # and avoid shadowing the outer `x` inside the lambda.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "coalesce", jcols)
            column(jc)
          })
0562
0563 #' Though scala functions has "col" function, we don't expose it in SparkR
0564 #' because we don't want to conflict with the "col" function in the R base
0565 #' package and we also have "column" function exported which is an alias of "col".
0566 #' @noRd
col <- function(x) {
  # Resolve a column by name on the JVM side and wrap it as an R Column.
  jc <- callJStatic("org.apache.spark.sql.functions", "col", x)
  column(jc)
}
0570
0571 #' Returns a Column based on the given column name
0572 #'
0573 #' Returns a Column based on the given column name.
0574 #'
0575 #' @param x Character column name.
0576 #'
0577 #' @rdname column
0578 #' @name column
0579 #' @family non-aggregate functions
0580 #' @aliases column,character-method
0581 #' @examples \dontrun{column("name")}
0582 #' @note column since 1.6.0
setMethod("column",
          signature(x = "character"),
          function(x) {
            # Exported alias for the internal col() helper.
            col(x)
          })
0588
0589 #' corr
0590 #'
0591 #' Computes the Pearson Correlation Coefficient for two Columns.
0592 #'
0593 #' @param col2 a (second) Column.
0594 #'
0595 #' @rdname corr
0596 #' @name corr
0597 #' @family aggregate functions
0598 #' @aliases corr,Column-method
0599 #' @examples
0600 #' \dontrun{
0601 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
0602 #' head(select(df, corr(df$mpg, df$hp)))}
0603 #' @note corr since 1.6.0
setMethod("corr", signature(x = "Column"),
          function(x, col2) {
            # Use inherits() rather than comparing class() to a string; the
            # latter is fragile when an object carries more than one class.
            stopifnot(inherits(col2, "Column"))
            jc <- callJStatic("org.apache.spark.sql.functions", "corr", x@jc, col2@jc)
            column(jc)
          })
0610
0611 #' cov
0612 #'
0613 #' Compute the covariance between two expressions.
0614 #'
0615 #' @details
0616 #' \code{cov}: Compute the sample covariance between two expressions.
0617 #'
0618 #' @rdname cov
0619 #' @name cov
0620 #' @family aggregate functions
0621 #' @aliases cov,characterOrColumn-method
0622 #' @examples
0623 #' \dontrun{
0624 #' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
0625 #' head(select(df, cov(df$mpg, df$hp), cov("mpg", "hp"),
0626 #' covar_samp(df$mpg, df$hp), covar_samp("mpg", "hp"),
0627 #' covar_pop(df$mpg, df$hp), covar_pop("mpg", "hp")))}
0628 #' @note cov since 1.6.0
setMethod("cov", signature(x = "characterOrColumn"),
          function(x, col2) {
            # Bug fix: the original called is(class(col2), "characterOrColumn"),
            # which tests the class-name *string* (always a character, hence
            # always TRUE for the character-or-Column union) instead of the
            # object itself, making the check vacuous. Test the object.
            stopifnot(is(col2, "characterOrColumn"))
            covar_samp(x, col2)
          })
0634
0635 #' @details
0636 #' \code{covar_samp}: Alias for \code{cov}.
0637 #'
0638 #' @rdname cov
0639 #'
0640 #' @param col1 the first Column.
0641 #' @param col2 the second Column.
0642 #' @name covar_samp
0643 #' @aliases covar_samp,characterOrColumn,characterOrColumn-method
0644 #' @note covar_samp since 2.0.0
setMethod("covar_samp", signature(col1 = "characterOrColumn", col2 = "characterOrColumn"),
          function(col1, col2) {
            # Both arguments must be of the same kind (two names or two Columns).
            stopifnot(class(col1) == class(col2))
            # Use inherits() to detect Columns; unwrap to Java references so the
            # JVM call receives either two strings or two Java Columns.
            if (inherits(col1, "Column")) {
              col1 <- col1@jc
              col2 <- col2@jc
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "covar_samp", col1, col2)
            column(jc)
          })
0655
0656 #' @details
0657 #' \code{covar_pop}: Computes the population covariance between two expressions.
0658 #'
0659 #' @rdname cov
0660 #' @name covar_pop
0661 #' @aliases covar_pop,characterOrColumn,characterOrColumn-method
0662 #' @note covar_pop since 2.0.0
setMethod("covar_pop", signature(col1 = "characterOrColumn", col2 = "characterOrColumn"),
          function(col1, col2) {
            # Both arguments must be of the same kind (two names or two Columns).
            stopifnot(class(col1) == class(col2))
            # Use inherits() to detect Columns; unwrap to Java references so the
            # JVM call receives either two strings or two Java Columns.
            if (inherits(col1, "Column")) {
              col1 <- col1@jc
              col2 <- col2@jc
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "covar_pop", col1, col2)
            column(jc)
          })
0673
0674 #' @details
0675 #' \code{cos}: Returns the cosine of the given value,
0676 #' as if computed by \code{java.lang.Math.cos()}. Units in radians.
0677 #'
0678 #' @rdname column_math_functions
0679 #' @aliases cos cos,Column-method
0680 #' @note cos since 1.5.0
setMethod("cos",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "cos" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "cos", x@jc))
          })
0687
0688 #' @details
0689 #' \code{cosh}: Returns the hyperbolic cosine of the given value,
0690 #' as if computed by \code{java.lang.Math.cosh()}.
0691 #'
0692 #' @rdname column_math_functions
0693 #' @aliases cosh cosh,Column-method
0694 #' @note cosh since 1.5.0
setMethod("cosh",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "cosh" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "cosh", x@jc))
          })
0701
0702 #' Returns the number of items in a group
0703 #'
0704 #' This can be used as a column aggregate function with \code{Column} as input,
0705 #' and returns the number of items in a group.
0706 #'
0707 #' @rdname count
0708 #' @name count
0709 #' @family aggregate functions
0710 #' @aliases count,Column-method
0711 #' @examples \dontrun{count(df$c)}
0712 #' @note count since 1.4.0
setMethod("count",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "count" aggregate function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "count", x@jc))
          })
0719
0720 #' @details
0721 #' \code{crc32}: Calculates the cyclic redundancy check value (CRC32) of a binary column
0722 #' and returns the value as a bigint.
0723 #'
0724 #' @rdname column_misc_functions
0725 #' @aliases crc32 crc32,Column-method
0726 #' @note crc32 since 1.5.0
setMethod("crc32",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "crc32" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "crc32", x@jc))
          })
0733
0734 #' @details
0735 #' \code{hash}: Calculates the hash code of given columns, and returns the result
0736 #' as an int column.
0737 #'
0738 #' @rdname column_misc_functions
0739 #' @aliases hash hash,Column-method
0740 #' @note hash since 2.0.0
setMethod("hash",
          signature(x = "Column"),
          function(x, ...) {
            # Validate every argument with inherits() (robust for S4 objects
            # with multiple classes) rather than comparing class() to a string,
            # and avoid shadowing the outer `x` inside the lambda.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "hash", jcols)
            column(jc)
          })
0751
0752 #' @details
0753 #' \code{xxhash64}: Calculates the hash code of given columns using the 64-bit
0754 #' variant of the xxHash algorithm, and returns the result as a long
0755 #' column.
0756 #'
0757 #' @rdname column_misc_functions
0758 #' @aliases xxhash64 xxhash64,Column-method
0759 #' @note xxhash64 since 3.0.0
setMethod("xxhash64",
          signature(x = "Column"),
          function(x, ...) {
            # Validate every argument with inherits() (robust for S4 objects
            # with multiple classes) rather than comparing class() to a string,
            # and avoid shadowing the outer `x` inside the lambda.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "xxhash64", jcols)
            column(jc)
          })
0770
0771 #' @details
0772 #' \code{dayofmonth}: Extracts the day of the month as an integer from a
0773 #' given date/timestamp/string.
0774 #'
0775 #' @rdname column_datetime_functions
0776 #' @aliases dayofmonth dayofmonth,Column-method
0777 #' @examples
0778 #'
0779 #' \dontrun{
0780 #' head(select(df, df$time, year(df$time), quarter(df$time), month(df$time),
0781 #' dayofmonth(df$time), dayofweek(df$time), dayofyear(df$time), weekofyear(df$time)))
0782 #' head(agg(groupBy(df, year(df$time)), count(df$y), avg(df$y)))
0783 #' head(agg(groupBy(df, month(df$time)), avg(df$y)))}
0784 #' @note dayofmonth since 1.5.0
setMethod("dayofmonth",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "dayofmonth" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "dayofmonth", x@jc))
          })
0791
0792 #' @details
0793 #' \code{dayofweek}: Extracts the day of the week as an integer from a
0794 #' given date/timestamp/string.
0795 #'
0796 #' @rdname column_datetime_functions
0797 #' @aliases dayofweek dayofweek,Column-method
0798 #' @note dayofweek since 2.3.0
setMethod("dayofweek",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "dayofweek" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "dayofweek", x@jc))
          })
0805
0806 #' @details
0807 #' \code{dayofyear}: Extracts the day of the year as an integer from a
0808 #' given date/timestamp/string.
0809 #'
0810 #' @rdname column_datetime_functions
0811 #' @aliases dayofyear dayofyear,Column-method
0812 #' @note dayofyear since 1.5.0
setMethod("dayofyear",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "dayofyear" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "dayofyear", x@jc))
          })
0819
0820 #' @details
0821 #' \code{decode}: Computes the first argument into a string from a binary using the provided
0822 #' character set.
0823 #'
0824 #' @param charset character set to use (one of "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE",
0825 #' "UTF-16LE", "UTF-16").
0826 #'
0827 #' @rdname column_string_functions
0828 #' @aliases decode decode,Column,character-method
0829 #' @note decode since 1.6.0
setMethod("decode",
          signature(x = "Column", charset = "character"),
          function(x, charset) {
            # Decode the binary column into strings on the JVM side using the
            # supplied character set.
            column(callJStatic("org.apache.spark.sql.functions", "decode", x@jc, charset))
          })
0836
0837 #' @details
0838 #' \code{encode}: Computes the first argument into a binary from a string using the provided
0839 #' character set.
0840 #'
0841 #' @rdname column_string_functions
0842 #' @aliases encode encode,Column,character-method
0843 #' @note encode since 1.6.0
setMethod("encode",
          signature(x = "Column", charset = "character"),
          function(x, charset) {
            # Encode the string column into binary on the JVM side using the
            # supplied character set.
            column(callJStatic("org.apache.spark.sql.functions", "encode", x@jc, charset))
          })
0850
0851 #' @details
0852 #' \code{exp}: Computes the exponential of the given value.
0853 #'
0854 #' @rdname column_math_functions
0855 #' @aliases exp exp,Column-method
0856 #' @note exp since 1.5.0
setMethod("exp",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side "exp" SQL function and wrap the
            # resulting Java Column as an R Column.
            column(callJStatic("org.apache.spark.sql.functions", "exp", x@jc))
          })
0863
0864 #' @details
0865 #' \code{expm1}: Computes the exponential of the given value minus one.
0866 #'
0867 #' @rdname column_math_functions
0868 #' @aliases expm1 expm1,Column-method
0869 #' @note expm1 since 1.5.0
setMethod("expm1",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "expm1", x@jc))
          })
0876
0877 #' @details
0878 #' \code{factorial}: Computes the factorial of the given value.
0879 #'
0880 #' @rdname column_math_functions
0881 #' @aliases factorial factorial,Column-method
0882 #' @note factorial since 1.5.0
setMethod("factorial",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "factorial", x@jc))
          })
0889
0890 #' first
0891 #'
0892 #' Aggregate function: returns the first value in a group.
0893 #'
0894 #' The function by default returns the first values it sees. It will return the first non-missing
0895 #' value it sees when na.rm is set to true. If all values are missing, then NA is returned.
0896 #' Note: the function is non-deterministic because its results depends on the order of the rows
0897 #' which may be non-deterministic after a shuffle.
0898 #'
0899 #' @param na.rm a logical value indicating whether NA values should be stripped
0900 #' before the computation proceeds.
0901 #'
0902 #' @rdname first
0903 #' @name first
0904 #' @aliases first,characterOrColumn-method
0905 #' @family aggregate functions
0906 #' @examples
0907 #' \dontrun{
0908 #' first(df$c)
0909 #' first(df$c, TRUE)
0910 #' }
0911 #' @note first(characterOrColumn) since 1.4.0
setMethod("first",
          signature(x = "characterOrColumn"),
          function(x, na.rm = FALSE) {
            # Accept either a column name (character) or a Column object.
            # inherits() is the robust class test for S4/S3 objects; the
            # previous class(x) == "Column" comparison is fragile when an
            # object carries multiple classes.
            col <- if (inherits(x, "Column")) {
              x@jc
            } else {
              x
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "first", col, na.rm)
            column(jc)
          })
0923
0924 #' @details
0925 #' \code{floor}: Computes the floor of the given value.
0926 #'
0927 #' @rdname column_math_functions
0928 #' @aliases floor floor,Column-method
0929 #' @note floor since 1.5.0
setMethod("floor",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "floor", x@jc))
          })
0936
0937 #' @details
0938 #' \code{hex}: Computes hex value of the given column.
0939 #'
0940 #' @rdname column_math_functions
0941 #' @aliases hex hex,Column-method
0942 #' @note hex since 1.5.0
setMethod("hex",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "hex", x@jc))
          })
0949
0950 #' @details
0951 #' \code{hour}: Extracts the hour as an integer from a given date/timestamp/string.
0952 #'
0953 #' @rdname column_datetime_functions
0954 #' @aliases hour hour,Column-method
0955 #' @examples
0956 #'
0957 #' \dontrun{
0958 #' head(select(df, hour(df$time), minute(df$time), second(df$time)))
0959 #' head(agg(groupBy(df, dayofmonth(df$time)), avg(df$y)))
0960 #' head(agg(groupBy(df, hour(df$time)), avg(df$y)))
0961 #' head(agg(groupBy(df, minute(df$time)), avg(df$y)))}
0962 #' @note hour since 1.5.0
setMethod("hour",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "hour", x@jc))
          })
0969
0970 #' @details
0971 #' \code{initcap}: Returns a new string column by converting the first letter of
0972 #' each word to uppercase. Words are delimited by whitespace. For example, "hello world"
0973 #' will become "Hello World".
0974 #'
0975 #' @rdname column_string_functions
0976 #' @aliases initcap initcap,Column-method
0977 #' @examples
0978 #'
0979 #' \dontrun{
0980 #' tmp <- mutate(df, sex_lower = lower(df$Sex), age_upper = upper(df$age),
0981 #' sex_age = concat_ws(" ", lower(df$sex), lower(df$age)))
0982 #' head(tmp)
0983 #' tmp2 <- mutate(tmp, s1 = initcap(tmp$sex_lower), s2 = initcap(tmp$sex_age),
0984 #' s3 = reverse(df$Sex))
0985 #' head(tmp2)}
0986 #' @note initcap since 1.5.0
setMethod("initcap",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "initcap", x@jc))
          })
0993
0994 #' @details
0995 #' \code{isnan}: Returns true if the column is NaN.
0996 #' @rdname column_nonaggregate_functions
0997 #' @aliases isnan isnan,Column-method
0998 #' @note isnan since 2.0.0
setMethod("isnan",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "isnan", x@jc))
          })
1005
1006 #' @details
1007 #' \code{is.nan}: Alias for \link{isnan}.
1008 #'
1009 #' @rdname column_nonaggregate_functions
1010 #' @aliases is.nan is.nan,Column-method
1011 #' @note is.nan since 2.0.0
setMethod("is.nan",
          signature(x = "Column"),
          # Pure alias: forwards directly to the isnan() method defined above.
          function(x) isnan(x))
1017
1018 #' @details
1019 #' \code{kurtosis}: Returns the kurtosis of the values in a group.
1020 #'
1021 #' @rdname column_aggregate_functions
1022 #' @aliases kurtosis kurtosis,Column-method
1023 #' @examples
1024 #'
1025 #' \dontrun{
1026 #' head(select(df, mean(df$mpg), sd(df$mpg), skewness(df$mpg), kurtosis(df$mpg)))}
1027 #' @note kurtosis since 1.6.0
setMethod("kurtosis",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "kurtosis", x@jc))
          })
1034
1035 #' last
1036 #'
1037 #' Aggregate function: returns the last value in a group.
1038 #'
1039 #' The function by default returns the last values it sees. It will return the last non-missing
1040 #' value it sees when na.rm is set to true. If all values are missing, then NA is returned.
1041 #' Note: the function is non-deterministic because its results depends on the order of the rows
1042 #' which may be non-deterministic after a shuffle.
1043 #'
1044 #' @param x column to compute on.
1045 #' @param na.rm a logical value indicating whether NA values should be stripped
1046 #' before the computation proceeds.
1047 #' @param ... further arguments to be passed to or from other methods.
1048 #'
1049 #' @rdname last
1050 #' @name last
1051 #' @aliases last,characterOrColumn-method
1052 #' @family aggregate functions
1053 #' @examples
1054 #' \dontrun{
1055 #' last(df$c)
1056 #' last(df$c, TRUE)
1057 #' }
1058 #' @note last since 1.4.0
setMethod("last",
          signature(x = "characterOrColumn"),
          function(x, na.rm = FALSE) {
            # Accept either a column name (character) or a Column object.
            # inherits() is the robust class test for S4/S3 objects; the
            # previous class(x) == "Column" comparison is fragile when an
            # object carries multiple classes.
            col <- if (inherits(x, "Column")) {
              x@jc
            } else {
              x
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "last", col, na.rm)
            column(jc)
          })
1070
1071 #' @details
1072 #' \code{last_day}: Given a date column, returns the last day of the month which the
1073 #' given date belongs to. For example, input "2015-07-27" returns "2015-07-31" since
1074 #' July 31 is the last day of the month in July 2015.
1075 #'
1076 #' @rdname column_datetime_functions
1077 #' @aliases last_day last_day,Column-method
1078 #' @examples
1079 #'
1080 #' \dontrun{
1081 #' head(select(df, df$time, last_day(df$time), month(df$time)))}
1082 #' @note last_day since 1.5.0
setMethod("last_day",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "last_day", x@jc))
          })
1089
1090 #' @details
1091 #' \code{length}: Computes the character length of a string data or number of bytes
1092 #' of a binary data. The length of string data includes the trailing spaces.
1093 #' The length of binary data includes binary zeros.
1094 #'
1095 #' @rdname column_string_functions
1096 #' @aliases length length,Column-method
1097 #' @note length since 1.5.0
setMethod("length",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "length", x@jc))
          })
1104
1105 #' @details
1106 #' \code{log}: Computes the natural logarithm of the given value.
1107 #'
1108 #' @rdname column_math_functions
1109 #' @aliases log log,Column-method
1110 #' @note log since 1.5.0
setMethod("log",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "log", x@jc))
          })
1117
1118 #' @details
1119 #' \code{log10}: Computes the logarithm of the given value in base 10.
1120 #'
1121 #' @rdname column_math_functions
1122 #' @aliases log10 log10,Column-method
1123 #' @note log10 since 1.5.0
setMethod("log10",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "log10", x@jc))
          })
1130
1131 #' @details
1132 #' \code{log1p}: Computes the natural logarithm of the given value plus one.
1133 #'
1134 #' @rdname column_math_functions
1135 #' @aliases log1p log1p,Column-method
1136 #' @note log1p since 1.5.0
setMethod("log1p",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "log1p", x@jc))
          })
1143
1144 #' @details
1145 #' \code{log2}: Computes the logarithm of the given column in base 2.
1146 #'
1147 #' @rdname column_math_functions
1148 #' @aliases log2 log2,Column-method
1149 #' @note log2 since 1.5.0
setMethod("log2",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "log2", x@jc))
          })
1156
1157 #' @details
1158 #' \code{lower}: Converts a string column to lower case.
1159 #'
1160 #' @rdname column_string_functions
1161 #' @aliases lower lower,Column-method
1162 #' @note lower since 1.4.0
setMethod("lower",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "lower", x@jc))
          })
1169
1170 #' @details
1171 #' \code{ltrim}: Trims the spaces from left end for the specified string value. Optionally a
1172 #' \code{trimString} can be specified.
1173 #'
1174 #' @rdname column_string_functions
1175 #' @aliases ltrim ltrim,Column,missing-method
1176 #' @examples
1177 #'
1178 #' \dontrun{
1179 #' tmp <- mutate(df, SexLpad = lpad(df$Sex, 6, " "), SexRpad = rpad(df$Sex, 7, " "))
1180 #' head(select(tmp, length(tmp$Sex), length(tmp$SexLpad), length(tmp$SexRpad)))
1181 #' tmp2 <- mutate(tmp, SexLtrim = ltrim(tmp$SexLpad), SexRtrim = rtrim(tmp$SexRpad),
1182 #' SexTrim = trim(tmp$SexLpad))
1183 #' head(select(tmp2, length(tmp2$Sex), length(tmp2$SexLtrim),
1184 #' length(tmp2$SexRtrim), length(tmp2$SexTrim)))
1185 #'
1186 #' tmp <- mutate(df, SexLpad = lpad(df$Sex, 6, "xx"), SexRpad = rpad(df$Sex, 7, "xx"))
1187 #' head(tmp)}
1188 #' @note ltrim since 1.5.0
setMethod("ltrim",
          signature(x = "Column", trimString = "missing"),
          function(x, trimString) {
            # No trim string supplied: trims whitespace from the left end.
            column(callJStatic("org.apache.spark.sql.functions", "ltrim", x@jc))
          })
1195
1196 #' @param trimString a character string to trim with
1197 #' @rdname column_string_functions
1198 #' @aliases ltrim,Column,character-method
1199 #' @note ltrim(Column, character) since 2.3.0
setMethod("ltrim",
          signature(x = "Column", trimString = "character"),
          function(x, trimString) {
            # Trim the supplied character string from the left end.
            column(callJStatic("org.apache.spark.sql.functions", "ltrim", x@jc, trimString))
          })
1206
1207 #' @details
1208 #' \code{max}: Returns the maximum value of the expression in a group.
1209 #'
1210 #' @rdname column_aggregate_functions
1211 #' @aliases max max,Column-method
1212 #' @note max since 1.5.0
setMethod("max",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "max", x@jc))
          })
1219
1220 #' @details
1221 #' \code{md5}: Calculates the MD5 digest of a binary column and returns the value
1222 #' as a 32 character hex string.
1223 #'
1224 #' @rdname column_misc_functions
1225 #' @aliases md5 md5,Column-method
1226 #' @note md5 since 1.5.0
setMethod("md5",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "md5", x@jc))
          })
1233
1234 #' @details
1235 #' \code{mean}: Returns the average of the values in a group. Alias for \code{avg}.
1236 #'
1237 #' @rdname column_aggregate_functions
1238 #' @aliases mean mean,Column-method
1239 #' @examples
1240 #'
1241 #' \dontrun{
1242 #' head(select(df, avg(df$mpg), mean(df$mpg), sum(df$mpg), min(df$wt), max(df$qsec)))
1243 #'
1244 #' # metrics by num of cylinders
1245 #' tmp <- agg(groupBy(df, "cyl"), avg(df$mpg), avg(df$hp), avg(df$wt), avg(df$qsec))
1246 #' head(orderBy(tmp, "cyl"))
1247 #'
1248 #' # car with the max mpg
1249 #' mpg_max <- as.numeric(collect(agg(df, max(df$mpg))))
1250 #' head(where(df, df$mpg == mpg_max))}
1251 #' @note mean since 1.5.0
setMethod("mean",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "mean", x@jc))
          })
1258
1259 #' @details
1260 #' \code{min}: Returns the minimum value of the expression in a group.
1261 #'
1262 #' @rdname column_aggregate_functions
1263 #' @aliases min min,Column-method
1264 #' @note min since 1.5.0
setMethod("min",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "min", x@jc))
          })
1271
1272 #' @details
1273 #' \code{minute}: Extracts the minute as an integer from a given date/timestamp/string.
1274 #'
1275 #' @rdname column_datetime_functions
1276 #' @aliases minute minute,Column-method
1277 #' @note minute since 1.5.0
setMethod("minute",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "minute", x@jc))
          })
1284
1285 #' @details
1286 #' \code{monotonically_increasing_id}: Returns a column that generates monotonically increasing
1287 #' 64-bit integers. The generated ID is guaranteed to be monotonically increasing and unique,
1288 #' but not consecutive. The current implementation puts the partition ID in the upper 31 bits,
1289 #' and the record number within each partition in the lower 33 bits. The assumption is that the
1290 #' SparkDataFrame has less than 1 billion partitions, and each partition has less than 8 billion
1291 #' records. As an example, consider a SparkDataFrame with two partitions, each with 3 records.
1292 #' This expression would return the following IDs:
1293 #' 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.
1294 #' This is equivalent to the MONOTONICALLY_INCREASING_ID function in SQL.
1295 #' The method should be used with no argument.
1296 #' Note: the function is non-deterministic because its result depends on partition IDs.
1297 #'
1298 #' @rdname column_nonaggregate_functions
1299 #' @aliases monotonically_increasing_id monotonically_increasing_id,missing-method
1300 #' @examples
1301 #'
#' \dontrun{head(select(df, monotonically_increasing_id()))}
#' @note monotonically_increasing_id since 2.0.0
setMethod("monotonically_increasing_id",
          signature("missing"),
          function() {
            # Zero-argument JVM function; wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "monotonically_increasing_id"))
          })
1309
1310 #' @details
1311 #' \code{month}: Extracts the month as an integer from a given date/timestamp/string.
1312 #'
1313 #' @rdname column_datetime_functions
1314 #' @aliases month month,Column-method
1315 #' @note month since 1.5.0
setMethod("month",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "month", x@jc))
          })
1322
1323 #' @details
1324 #' \code{negate}: Unary minus, i.e. negate the expression.
1325 #'
1326 #' @rdname column_nonaggregate_functions
1327 #' @aliases negate negate,Column-method
1328 #' @note negate since 1.5.0
setMethod("negate",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "negate", x@jc))
          })
1335
1336 #' @details
1337 #' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace},
1338 #' starting from byte position \code{pos} of \code{src} and proceeding for
1339 #' \code{len} bytes.
1340 #'
1341 #' @param replace a Column with replacement.
1342 #'
1343 #' @rdname column_string_functions
1344 #' @aliases overlay overlay,Column-method,numericOrColumn-method
1345 #' @note overlay since 3.0.0
setMethod("overlay",
          signature(x = "Column", replace = "Column", pos = "numericOrColumn"),
          function(x, replace, pos, len = -1) {
            # Plain numbers are promoted to literal integer Columns so that
            # the JVM call always receives four Java column references.
            if (is.numeric(pos)) pos <- lit(as.integer(pos))
            if (is.numeric(len)) len <- lit(as.integer(len))
            # len = -1 (the default) keeps Spark's default overlay length.
            jc <- callJStatic(
              "org.apache.spark.sql.functions", "overlay",
              x@jc, replace@jc, pos@jc, len@jc
            )
            column(jc)
          })
1364
1365 #' @details
1366 #' \code{quarter}: Extracts the quarter as an integer from a given date/timestamp/string.
1367 #'
1368 #' @rdname column_datetime_functions
1369 #' @aliases quarter quarter,Column-method
1370 #' @note quarter since 1.5.0
setMethod("quarter",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "quarter", x@jc))
          })
1377
1378 #' @details
1379 #' \code{reverse}: Returns a reversed string or an array with reverse order of elements.
1380 #'
1381 #' @rdname column_collection_functions
1382 #' @aliases reverse reverse,Column-method
1383 #' @note reverse since 1.5.0
setMethod("reverse",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "reverse", x@jc))
          })
1390
1391 #' @details
1392 #' \code{rint}: Returns the double value that is closest in value to the argument and
1393 #' is equal to a mathematical integer.
1394 #'
1395 #' @rdname column_math_functions
1396 #' @aliases rint rint,Column-method
1397 #' @note rint since 1.5.0
setMethod("rint",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "rint", x@jc))
          })
1404
1405 #' @details
1406 #' \code{round}: Returns the value of the column rounded to 0 decimal places
1407 #' using HALF_UP rounding mode.
1408 #'
1409 #' @rdname column_math_functions
1410 #' @aliases round round,Column-method
1411 #' @note round since 1.5.0
setMethod("round",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "round", x@jc))
          })
1418
1419 #' @details
1420 #' \code{bround}: Returns the value of the column \code{e} rounded to \code{scale} decimal places
1421 #' using HALF_EVEN rounding mode if \code{scale} >= 0 or at integer part when \code{scale} < 0.
1422 #' Also known as Gaussian rounding or bankers' rounding that rounds to the nearest even number.
1423 #' bround(2.5, 0) = 2, bround(3.5, 0) = 4.
1424 #'
1425 #' @param scale round to \code{scale} digits to the right of the decimal point when
1426 #' \code{scale} > 0, the nearest even number when \code{scale} = 0, and \code{scale} digits
1427 #' to the left of the decimal point when \code{scale} < 0.
1428 #' @rdname column_math_functions
1429 #' @aliases bround bround,Column-method
1430 #' @note bround since 2.0.0
setMethod("bround",
          signature(x = "Column"),
          function(x, scale = 0) {
            # scale is coerced to a JVM integer before the call.
            column(callJStatic("org.apache.spark.sql.functions", "bround",
                               x@jc, as.integer(scale)))
          })
1437
1438 #' @details
1439 #' \code{rtrim}: Trims the spaces from right end for the specified string value. Optionally a
1440 #' \code{trimString} can be specified.
1441 #'
1442 #' @rdname column_string_functions
1443 #' @aliases rtrim rtrim,Column,missing-method
1444 #' @note rtrim since 1.5.0
setMethod("rtrim",
          signature(x = "Column", trimString = "missing"),
          function(x, trimString) {
            # No trim string supplied: trims whitespace from the right end.
            column(callJStatic("org.apache.spark.sql.functions", "rtrim", x@jc))
          })
1451
1452 #' @rdname column_string_functions
1453 #' @aliases rtrim,Column,character-method
1454 #' @note rtrim(Column, character) since 2.3.0
setMethod("rtrim",
          signature(x = "Column", trimString = "character"),
          function(x, trimString) {
            # Trim the supplied character string from the right end.
            column(callJStatic("org.apache.spark.sql.functions", "rtrim", x@jc, trimString))
          })
1461
1462 #' @details
1463 #' \code{sd}: Alias for \code{stddev_samp}.
1464 #'
1465 #' @rdname column_aggregate_functions
1466 #' @aliases sd sd,Column-method
1467 #' @examples
1468 #'
1469 #' \dontrun{
1470 #' head(select(df, sd(df$mpg), stddev(df$mpg), stddev_pop(df$wt), stddev_samp(df$qsec)))}
1471 #' @note sd since 1.6.0
setMethod("sd",
          signature(x = "Column"),
          # R's sd() is the sample standard deviation, so this is a pure
          # alias for the stddev_samp() method defined elsewhere in this file.
          function(x) stddev_samp(x))
1478
1479 #' @details
1480 #' \code{second}: Extracts the second as an integer from a given date/timestamp/string.
1481 #'
1482 #' @rdname column_datetime_functions
1483 #' @aliases second second,Column-method
1484 #' @note second since 1.5.0
setMethod("second",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "second", x@jc))
          })
1491
1492 #' @details
1493 #' \code{sha1}: Calculates the SHA-1 digest of a binary column and returns the value
1494 #' as a 40 character hex string.
1495 #'
1496 #' @rdname column_misc_functions
1497 #' @aliases sha1 sha1,Column-method
1498 #' @note sha1 since 1.5.0
setMethod("sha1",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "sha1", x@jc))
          })
1505
1506 #' @details
1507 #' \code{signum}: Computes the signum of the given value.
1508 #'
1509 #' @rdname column_math_functions
1510 #' @aliases signum signum,Column-method
1511 #' @note signum since 1.5.0
setMethod("signum",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "signum", x@jc))
          })
1518
1519 #' @details
1520 #' \code{sign}: Alias for \code{signum}.
1521 #'
1522 #' @rdname column_math_functions
1523 #' @aliases sign sign,Column-method
1524 #' @note sign since 1.5.0
setMethod("sign", signature(x = "Column"),
          # Pure alias: forwards directly to the signum() method defined above.
          function(x) signum(x))
1529
1530 #' @details
1531 #' \code{sin}: Returns the sine of the given value,
1532 #' as if computed by \code{java.lang.Math.sin()}. Units in radians.
1533 #'
1534 #' @rdname column_math_functions
1535 #' @aliases sin sin,Column-method
1536 #' @note sin since 1.5.0
setMethod("sin",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "sin", x@jc))
          })
1543
1544 #' @details
1545 #' \code{sinh}: Returns the hyperbolic sine of the given value,
1546 #' as if computed by \code{java.lang.Math.sinh()}.
1547 #'
1548 #' @rdname column_math_functions
1549 #' @aliases sinh sinh,Column-method
1550 #' @note sinh since 1.5.0
setMethod("sinh",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "sinh", x@jc))
          })
1557
1558 #' @details
1559 #' \code{skewness}: Returns the skewness of the values in a group.
1560 #'
1561 #' @rdname column_aggregate_functions
1562 #' @aliases skewness skewness,Column-method
1563 #' @note skewness since 1.6.0
setMethod("skewness",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "skewness", x@jc))
          })
1570
1571 #' @details
1572 #' \code{soundex}: Returns the soundex code for the specified expression.
1573 #'
1574 #' @rdname column_string_functions
1575 #' @aliases soundex soundex,Column-method
1576 #' @note soundex since 1.5.0
setMethod("soundex",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "soundex", x@jc))
          })
1583
1584 #' @details
1585 #' \code{spark_partition_id}: Returns the partition ID as a SparkDataFrame column.
1586 #' Note that this is nondeterministic because it depends on data partitioning and
1587 #' task scheduling.
1588 #' This is equivalent to the \code{SPARK_PARTITION_ID} function in SQL.
1589 #'
1590 #' @rdname column_nonaggregate_functions
1591 #' @aliases spark_partition_id spark_partition_id,missing-method
1592 #' @examples
1593 #'
1594 #' \dontrun{head(select(df, spark_partition_id()))}
1595 #' @note spark_partition_id since 2.0.0
setMethod("spark_partition_id",
          signature("missing"),
          function() {
            # Zero-argument JVM function; wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "spark_partition_id"))
          })
1602
1603 #' @details
#' \code{stddev}: Alias for \code{stddev_samp}.
1605 #'
1606 #' @rdname column_aggregate_functions
1607 #' @aliases stddev stddev,Column-method
1608 #' @note stddev since 1.6.0
setMethod("stddev",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "stddev", x@jc))
          })
1615
1616 #' @details
1617 #' \code{stddev_pop}: Returns the population standard deviation of the expression in a group.
1618 #'
1619 #' @rdname column_aggregate_functions
1620 #' @aliases stddev_pop stddev_pop,Column-method
1621 #' @note stddev_pop since 1.6.0
setMethod("stddev_pop",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "stddev_pop", x@jc))
          })
1628
1629 #' @details
1630 #' \code{stddev_samp}: Returns the unbiased sample standard deviation of the expression in a group.
1631 #'
1632 #' @rdname column_aggregate_functions
1633 #' @aliases stddev_samp stddev_samp,Column-method
1634 #' @note stddev_samp since 1.6.0
setMethod("stddev_samp",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "stddev_samp", x@jc))
          })
1641
1642 #' @details
1643 #' \code{struct}: Creates a new struct column that composes multiple input columns.
1644 #'
1645 #' @rdname column_nonaggregate_functions
1646 #' @aliases struct struct,characterOrColumn-method
1647 #' @examples
1648 #'
1649 #' \dontrun{
1650 #' tmp <- mutate(df, v1 = struct(df$mpg, df$cyl), v2 = struct("hp", "wt", "vs"),
1651 #' v3 = create_array(df$mpg, df$cyl, df$hp),
1652 #' v4 = create_map(lit("x"), lit(1.0), lit("y"), lit(-1.0)))
1653 #' head(tmp)}
1654 #' @note struct since 1.6.0
setMethod("struct",
          signature(x = "characterOrColumn"),
          function(x, ...) {
            # Two calling conventions: all Column objects, or all column names.
            # inherits() is the robust class test for S4/S3 objects; the
            # previous class(x) == "Column" comparison is fragile when an
            # object carries multiple classes.
            if (inherits(x, "Column")) {
              # Unwrap every Column into its Java reference before the call.
              jcols <- lapply(list(x, ...), function(col) { col@jc })
              jc <- callJStatic("org.apache.spark.sql.functions", "struct", jcols)
            } else {
              # First name passed separately; remaining names as a list.
              jc <- callJStatic("org.apache.spark.sql.functions", "struct", x, list(...))
            }
            column(jc)
          })
1666
1667 #' @details
1668 #' \code{sqrt}: Computes the square root of the specified float value.
1669 #'
1670 #' @rdname column_math_functions
1671 #' @aliases sqrt sqrt,Column-method
1672 #' @note sqrt since 1.5.0
setMethod("sqrt",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "sqrt", x@jc))
          })
1679
1680 #' @details
1681 #' \code{sum}: Returns the sum of all values in the expression.
1682 #'
1683 #' @rdname column_aggregate_functions
1684 #' @aliases sum sum,Column-method
1685 #' @note sum since 1.5.0
setMethod("sum",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "sum", x@jc))
          })
1692
1693 #' @details
1694 #' \code{sumDistinct}: Returns the sum of distinct values in the expression.
1695 #'
1696 #' @rdname column_aggregate_functions
1697 #' @aliases sumDistinct sumDistinct,Column-method
1698 #' @examples
1699 #'
1700 #' \dontrun{
1701 #' head(select(df, sumDistinct(df$gear)))
1702 #' head(distinct(select(df, "gear")))}
1703 #' @note sumDistinct since 1.4.0
setMethod("sumDistinct",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side aggregate and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "sumDistinct", x@jc))
          })
1710
1711 #' @details
1712 #' \code{tan}: Returns the tangent of the given value,
1713 #' as if computed by \code{java.lang.Math.tan()}.
1714 #' Units in radians.
1715 #'
1716 #' @rdname column_math_functions
1717 #' @aliases tan tan,Column-method
1718 #' @note tan since 1.5.0
setMethod("tan",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "tan", x@jc))
          })
1725
1726 #' @details
1727 #' \code{tanh}: Returns the hyperbolic tangent of the given value,
1728 #' as if computed by \code{java.lang.Math.tanh()}.
1729 #'
1730 #' @rdname column_math_functions
1731 #' @aliases tanh tanh,Column-method
1732 #' @note tanh since 1.5.0
setMethod("tanh",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "tanh", x@jc))
          })
1739
1740 #' @details
1741 #' \code{toDegrees}: Converts an angle measured in radians to an approximately equivalent angle
1742 #' measured in degrees.
1743 #'
1744 #' @rdname column_math_functions
1745 #' @aliases toDegrees toDegrees,Column-method
1746 #' @note toDegrees since 1.4.0
setMethod("toDegrees",
          signature(x = "Column"),
          function(x) {
            # Deprecated entry point: warn, then behave exactly like degrees().
            .Deprecated("degrees")
            column(callJStatic("org.apache.spark.sql.functions", "degrees", x@jc))
          })
1754
1755 #' @details
1756 #' \code{degrees}: Converts an angle measured in radians to an approximately equivalent angle
1757 #' measured in degrees.
1758 #'
1759 #' @rdname column_math_functions
1760 #' @aliases degrees degrees,Column-method
1761 #' @note degrees since 3.0.0
setMethod("degrees",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "degrees", x@jc))
          })
1768
1769 #' @details
1770 #' \code{toRadians}: Converts an angle measured in degrees to an approximately equivalent angle
1771 #' measured in radians.
1772 #'
1773 #' @rdname column_math_functions
1774 #' @aliases toRadians toRadians,Column-method
1775 #' @note toRadians since 1.4.0
setMethod("toRadians",
          signature(x = "Column"),
          function(x) {
            # Deprecated entry point: warn, then behave exactly like radians().
            .Deprecated("radians")
            column(callJStatic("org.apache.spark.sql.functions", "radians", x@jc))
          })
1783
1784 #' @details
1785 #' \code{radians}: Converts an angle measured in degrees to an approximately equivalent angle
1786 #' measured in radians.
1787 #'
1788 #' @rdname column_math_functions
1789 #' @aliases radians radians,Column-method
1790 #' @note radians since 3.0.0
setMethod("radians",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side function and wrap the returned Java column.
            column(callJStatic("org.apache.spark.sql.functions", "radians", x@jc))
          })
1797
1798 #' @details
1799 #' \code{to_date}: Converts the column into a DateType. You may optionally specify
1800 #' a format according to the rules in:
1801 #' \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{Datetime Pattern}
1802 #' If the string cannot be parsed according to the specified format (or default),
1803 #' the value of the column will be null.
1804 #' By default, it follows casting rules to a DateType if the format is omitted
1805 #' (equivalent to \code{cast(df$x, "date")}).
1806 #'
1807 #' @rdname column_datetime_functions
1808 #' @aliases to_date to_date,Column,missing-method
1809 #' @examples
1810 #'
1811 #' \dontrun{
1812 #' tmp <- createDataFrame(data.frame(time_string = dts))
1813 #' tmp2 <- mutate(tmp, date1 = to_date(tmp$time_string),
1814 #' date2 = to_date(tmp$time_string, "yyyy-MM-dd"),
1815 #' date3 = date_format(tmp$time_string, "MM/dd/yyy"),
1816 #' time1 = to_timestamp(tmp$time_string),
1817 #' time2 = to_timestamp(tmp$time_string, "yyyy-MM-dd"))
1818 #' head(tmp2)}
1819 #' @note to_date(Column) since 1.5.0
setMethod("to_date",
          signature(x = "Column", format = "missing"),
          function(x, format) {
            # No format supplied: Spark applies its default cast-to-date rules.
            column(callJStatic("org.apache.spark.sql.functions", "to_date", x@jc))
          })
1826
1827 #' @rdname column_datetime_functions
1828 #' @aliases to_date,Column,character-method
1829 #' @note to_date(Column, character) since 2.2.0
setMethod("to_date",
          signature(x = "Column", format = "character"),
          function(x, format) {
            # Parse the column using the caller-supplied datetime pattern string.
            column(callJStatic("org.apache.spark.sql.functions", "to_date", x@jc, format))
          })
1836
1837 #' @details
1838 #' \code{to_json}: Converts a column containing a \code{structType}, a \code{mapType}
1839 #' or an \code{arrayType} into a Column of JSON string.
1840 #' Resolving the Column can fail if an unsupported type is encountered.
1841 #'
1842 #' @rdname column_collection_functions
1843 #' @aliases to_json to_json,Column-method
1844 #' @examples
1845 #'
1846 #' \dontrun{
1847 #' # Converts a struct into a JSON object
1848 #' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
1849 #' select(df2, to_json(df2$d, dateFormat = 'dd/MM/yyyy'))
1850 #'
1851 #' # Converts an array of structs into a JSON array
1852 #' df2 <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
1853 #' df2 <- mutate(df2, people_json = to_json(df2$people))
1854 #'
1855 #' # Converts a map into a JSON object
1856 #' df2 <- sql("SELECT map('name', 'Bob') as people")
1857 #' df2 <- mutate(df2, people_json = to_json(df2$people))
1858 #'
1859 #' # Converts an array of maps into a JSON array
1860 #' df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
1861 #' df2 <- mutate(df2, people_json = to_json(df2$people))
1862 #'
1863 #' # Converts a map into a pretty JSON object
1864 #' df2 <- sql("SELECT map('name', 'Bob') as people")
1865 #' df2 <- mutate(df2, people_json = to_json(df2$people, pretty = TRUE))}
1866 #' @note to_json since 2.2.0
setMethod("to_json", signature(x = "Column"),
          function(x, ...) {
            # Named args in ... become JSON writer options (e.g. dateFormat, pretty).
            opts <- varargsToStrEnv(...)
            column(callJStatic("org.apache.spark.sql.functions", "to_json", x@jc, opts))
          })
1873
1874 #' @details
1875 #' \code{to_csv}: Converts a column containing a \code{structType} into a Column of CSV string.
1876 #' Resolving the Column can fail if an unsupported type is encountered.
1877 #'
1878 #' @rdname column_collection_functions
1879 #' @aliases to_csv to_csv,Column-method
1880 #' @examples
1881 #'
1882 #' \dontrun{
1883 #' # Converts a struct into a CSV string
1884 #' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
1885 #' select(df2, to_csv(df2$d, dateFormat = 'dd/MM/yyyy'))}
1886 #' @note to_csv since 3.0.0
setMethod("to_csv", signature(x = "Column"),
          function(x, ...) {
            # Named args in ... become CSV writer options (e.g. dateFormat).
            opts <- varargsToStrEnv(...)
            column(callJStatic("org.apache.spark.sql.functions", "to_csv", x@jc, opts))
          })
1893
1894 #' @details
1895 #' \code{to_timestamp}: Converts the column into a TimestampType. You may optionally specify
1896 #' a format according to the rules in:
1897 #' \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{Datetime Pattern}
1898 #' If the string cannot be parsed according to the specified format (or default),
1899 #' the value of the column will be null.
1900 #' By default, it follows casting rules to a TimestampType if the format is omitted
1901 #' (equivalent to \code{cast(df$x, "timestamp")}).
1902 #'
1903 #' @rdname column_datetime_functions
1904 #' @aliases to_timestamp to_timestamp,Column,missing-method
1905 #' @note to_timestamp(Column) since 2.2.0
setMethod("to_timestamp",
          signature(x = "Column", format = "missing"),
          function(x, format) {
            # No format supplied: Spark applies its default cast-to-timestamp rules.
            column(callJStatic("org.apache.spark.sql.functions", "to_timestamp", x@jc))
          })
1912
1913 #' @rdname column_datetime_functions
1914 #' @aliases to_timestamp,Column,character-method
1915 #' @note to_timestamp(Column, character) since 2.2.0
setMethod("to_timestamp",
          signature(x = "Column", format = "character"),
          function(x, format) {
            # Parse the column using the caller-supplied datetime pattern string.
            column(callJStatic("org.apache.spark.sql.functions", "to_timestamp", x@jc, format))
          })
1922
1923 #' @details
1924 #' \code{trim}: Trims the spaces from both ends for the specified string column. Optionally a
1925 #' \code{trimString} can be specified.
1926 #'
1927 #' @rdname column_string_functions
1928 #' @aliases trim trim,Column,missing-method
1929 #' @note trim since 1.5.0
setMethod("trim",
          signature(x = "Column", trimString = "missing"),
          function(x, trimString) {
            # Default behavior: strip spaces from both ends of the string column.
            column(callJStatic("org.apache.spark.sql.functions", "trim", x@jc))
          })
1936
1937 #' @rdname column_string_functions
1938 #' @aliases trim,Column,character-method
1939 #' @note trim(Column, character) since 2.3.0
setMethod("trim",
          signature(x = "Column", trimString = "character"),
          function(x, trimString) {
            # Strip the supplied trim characters from both ends of the column.
            column(callJStatic("org.apache.spark.sql.functions", "trim", x@jc, trimString))
          })
1946
1947 #' @details
1948 #' \code{unbase64}: Decodes a BASE64 encoded string column and returns it as a binary column.
1949 #' This is the reverse of base64.
1950 #'
1951 #' @rdname column_string_functions
1952 #' @aliases unbase64 unbase64,Column-method
1953 #' @note unbase64 since 1.5.0
setMethod("unbase64",
          signature(x = "Column"),
          function(x) {
            # Forward to JVM-side unbase64() and wrap the result as a Column.
            column(callJStatic("org.apache.spark.sql.functions", "unbase64", x@jc))
          })
1960
1961 #' @details
1962 #' \code{unhex}: Inverse of hex. Interprets each pair of characters as a hexadecimal number
1963 #' and converts to the byte representation of number.
1964 #'
1965 #' @rdname column_math_functions
1966 #' @aliases unhex unhex,Column-method
1967 #' @note unhex since 1.5.0
setMethod("unhex",
          signature(x = "Column"),
          function(x) {
            # Forward to JVM-side unhex() and wrap the result as a Column.
            column(callJStatic("org.apache.spark.sql.functions", "unhex", x@jc))
          })
1974
1975 #' @details
1976 #' \code{upper}: Converts a string column to upper case.
1977 #'
1978 #' @rdname column_string_functions
1979 #' @aliases upper upper,Column-method
1980 #' @note upper since 1.4.0
setMethod("upper",
          signature(x = "Column"),
          function(x) {
            # Forward to JVM-side upper() and wrap the result as a Column.
            column(callJStatic("org.apache.spark.sql.functions", "upper", x@jc))
          })
1987
1988 #' @details
1989 #' \code{var}: Alias for \code{var_samp}.
1990 #'
1991 #' @rdname column_aggregate_functions
1992 #' @aliases var var,Column-method
1993 #' @examples
1994 #'
1995 #'\dontrun{
1996 #'head(agg(df, var(df$mpg), variance(df$mpg), var_pop(df$mpg), var_samp(df$mpg)))}
1997 #' @note var since 1.6.0
setMethod("var",
          signature(x = "Column"),
          function(x) {
            # Mirror base R semantics: var() is the sample variance, so alias var_samp.
            var_samp(x)
          })
2004
2005 #' @rdname column_aggregate_functions
2006 #' @aliases variance variance,Column-method
2007 #' @note variance since 1.6.0
setMethod("variance",
          signature(x = "Column"),
          function(x) {
            # Forward to JVM-side variance() and wrap the result as a Column.
            column(callJStatic("org.apache.spark.sql.functions", "variance", x@jc))
          })
2014
2015 #' @details
2016 #' \code{var_pop}: Returns the population variance of the values in a group.
2017 #'
2018 #' @rdname column_aggregate_functions
2019 #' @aliases var_pop var_pop,Column-method
2020 #' @note var_pop since 1.5.0
setMethod("var_pop",
          signature(x = "Column"),
          function(x) {
            # Forward to JVM-side var_pop() and wrap the result as a Column.
            column(callJStatic("org.apache.spark.sql.functions", "var_pop", x@jc))
          })
2027
2028 #' @details
2029 #' \code{var_samp}: Returns the unbiased variance of the values in a group.
2030 #'
2031 #' @rdname column_aggregate_functions
2032 #' @aliases var_samp var_samp,Column-method
2033 #' @note var_samp since 1.6.0
setMethod("var_samp",
          signature(x = "Column"),
          function(x) {
            # Forward to JVM-side var_samp() and wrap the result as a Column.
            column(callJStatic("org.apache.spark.sql.functions", "var_samp", x@jc))
          })
2040
2041 #' @details
2042 #' \code{weekofyear}: Extracts the week number as an integer from a given date/timestamp/string.
2043 #'
2044 #' @rdname column_datetime_functions
2045 #' @aliases weekofyear weekofyear,Column-method
2046 #' @note weekofyear since 1.5.0
setMethod("weekofyear",
          signature(x = "Column"),
          function(x) {
            # Forward to JVM-side weekofyear() and wrap the result as a Column.
            column(callJStatic("org.apache.spark.sql.functions", "weekofyear", x@jc))
          })
2053
2054 #' @details
2055 #' \code{year}: Extracts the year as an integer from a given date/timestamp/string.
2056 #'
2057 #' @rdname column_datetime_functions
2058 #' @aliases year year,Column-method
2059 #' @note year since 1.5.0
setMethod("year",
          signature(x = "Column"),
          function(x) {
            # Forward to JVM-side year() and wrap the result as a Column.
            column(callJStatic("org.apache.spark.sql.functions", "year", x@jc))
          })
2066
2067 #' @details
2068 #' \code{atan2}: Returns the angle theta from the conversion of rectangular coordinates
2069 #' (x, y) to polar coordinates (r, theta),
2070 #' as if computed by \code{java.lang.Math.atan2()}. Units in radians.
2071 #'
2072 #' @rdname column_math_functions
2073 #' @aliases atan2 atan2,Column-method
2074 #' @note atan2 since 1.5.0
setMethod("atan2", signature(y = "Column"),
          function(y, x) {
            # x may be a Column or a plain literal; unwrap the Java reference
            # when a Column is supplied. inherits() is used instead of
            # class(x) == "Column" because class() may return a vector of
            # length > 1, which makes the equality comparison fragile.
            if (inherits(x, "Column")) {
              x <- x@jc
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "atan2", y@jc, x)
            column(jc)
          })
2083
2084 #' @details
2085 #' \code{datediff}: Returns the number of days from \code{y} to \code{x}.
2086 #' If \code{y} is later than \code{x} then the result is positive.
2087 #'
2088 #' @rdname column_datetime_diff_functions
2089 #' @aliases datediff datediff,Column-method
2090 #' @examples
2091 #'
2092 #' \dontrun{
2093 #' tmp <- createDataFrame(data.frame(time_string1 = as.POSIXct(dts),
2094 #' time_string2 = as.POSIXct(dts[order(runif(length(dts)))])))
2095 #' tmp2 <- mutate(tmp, datediff = datediff(tmp$time_string1, tmp$time_string2),
2096 #' monthdiff = months_between(tmp$time_string1, tmp$time_string2))
2097 #' head(tmp2)}
2098 #' @note datediff since 1.5.0
setMethod("datediff", signature(y = "Column"),
          function(y, x) {
            # x may be a Column or a plain literal; unwrap the Java reference
            # when a Column is supplied. inherits() is safer than
            # class(x) == "Column" (class() can return more than one element).
            if (inherits(x, "Column")) {
              x <- x@jc
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "datediff", y@jc, x)
            column(jc)
          })
2107
2108 #' @details
2109 #' \code{hypot}: Computes "sqrt(a^2 + b^2)" without intermediate overflow or underflow.
2110 #'
2111 #' @rdname column_math_functions
2112 #' @aliases hypot hypot,Column-method
2113 #' @note hypot since 1.4.0
setMethod("hypot", signature(y = "Column"),
          function(y, x) {
            # x may be a Column or a plain literal; unwrap the Java reference
            # when a Column is supplied. inherits() is safer than
            # class(x) == "Column" (class() can return more than one element).
            if (inherits(x, "Column")) {
              x <- x@jc
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "hypot", y@jc, x)
            column(jc)
          })
2122
2123 #' @details
2124 #' \code{levenshtein}: Computes the Levenshtein distance of the two given string columns.
2125 #'
2126 #' @rdname column_string_functions
2127 #' @aliases levenshtein levenshtein,Column-method
2128 #' @examples
2129 #'
2130 #' \dontrun{
2131 #' tmp <- mutate(df, d1 = levenshtein(df$Class, df$Sex),
2132 #' d2 = levenshtein(df$Age, df$Sex),
2133 #' d3 = levenshtein(df$Age, df$Age))
2134 #' head(tmp)}
2135 #' @note levenshtein since 1.5.0
setMethod("levenshtein", signature(y = "Column"),
          function(y, x) {
            # x may be a Column or a plain literal; unwrap the Java reference
            # when a Column is supplied. inherits() is safer than
            # class(x) == "Column" (class() can return more than one element).
            if (inherits(x, "Column")) {
              x <- x@jc
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "levenshtein", y@jc, x)
            column(jc)
          })
2144
2145 #' @details
2146 #' \code{months_between}: Returns number of months between dates \code{y} and \code{x}.
2147 #' If \code{y} is later than \code{x}, then the result is positive. If \code{y} and \code{x}
2148 #' are on the same day of month, or both are the last day of month, time of day will be ignored.
2149 #' Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits.
2150 #'
2151 #' @param roundOff an optional parameter to specify if the result is rounded off to 8 digits
2152 #' @rdname column_datetime_diff_functions
2153 #' @aliases months_between months_between,Column-method
2154 #' @note months_between since 1.5.0
setMethod("months_between", signature(y = "Column"),
          function(y, x, roundOff = NULL) {
            # x may be a Column or a plain literal; unwrap the Java reference
            # when a Column is supplied. inherits() is safer than
            # class(x) == "Column" (class() can return more than one element).
            if (inherits(x, "Column")) {
              x <- x@jc
            }
            jc <- if (is.null(roundOff)) {
              callJStatic("org.apache.spark.sql.functions", "months_between", y@jc, x)
            } else {
              # The JVM side expects a logical; coerce so truthy inputs work.
              callJStatic("org.apache.spark.sql.functions", "months_between", y@jc, x,
                          as.logical(roundOff))
            }
            column(jc)
          })
2168
2169 #' @details
2170 #' \code{nanvl}: Returns the first column (\code{y}) if it is not NaN, or the second column
2171 #' (\code{x}) if the first column is NaN. Both inputs should be floating point columns
2172 #' (DoubleType or FloatType).
2173 #'
2174 #' @rdname column_nonaggregate_functions
2175 #' @aliases nanvl nanvl,Column-method
2176 #' @note nanvl since 1.5.0
setMethod("nanvl", signature(y = "Column"),
          function(y, x) {
            # x may be a Column or a plain literal; unwrap the Java reference
            # when a Column is supplied. inherits() is safer than
            # class(x) == "Column" (class() can return more than one element).
            if (inherits(x, "Column")) {
              x <- x@jc
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "nanvl", y@jc, x)
            column(jc)
          })
2185
2186 #' @details
2187 #' \code{pmod}: Returns the positive value of dividend mod divisor.
2188 #' Column \code{x} is divisor column, and column \code{y} is the dividend column.
2189 #'
2190 #' @rdname column_math_functions
2191 #' @aliases pmod pmod,Column-method
2192 #' @note pmod since 1.5.0
setMethod("pmod", signature(y = "Column"),
          function(y, x) {
            # x (the divisor) may be a Column or a plain literal; unwrap the
            # Java reference when a Column is supplied. inherits() is safer
            # than class(x) == "Column" (class() can return > 1 element).
            if (inherits(x, "Column")) {
              x <- x@jc
            }
            jc <- callJStatic("org.apache.spark.sql.functions", "pmod", y@jc, x)
            column(jc)
          })
2201
2202 #' @param rsd maximum estimation error allowed (default = 0.05).
2203 #'
2204 #' @rdname column_aggregate_functions
2205 #' @aliases approx_count_distinct,Column-method
2206 #' @note approx_count_distinct(Column, numeric) since 3.0.0
setMethod("approx_count_distinct",
          signature(x = "Column"),
          function(x, rsd = 0.05) {
            # rsd: maximum estimation error allowed (default 0.05).
            column(callJStatic("org.apache.spark.sql.functions",
                               "approx_count_distinct", x@jc, rsd))
          })
2213
2214 #' @rdname column_aggregate_functions
2215 #' @aliases approxCountDistinct,Column-method
2216 #' @note approxCountDistinct(Column, numeric) since 1.4.0
setMethod("approxCountDistinct",
          signature(x = "Column"),
          function(x, rsd = 0.05) {
            # Deprecated alias: warn, then call the same JVM function as
            # approx_count_distinct().
            .Deprecated("approx_count_distinct")
            column(callJStatic("org.apache.spark.sql.functions",
                               "approx_count_distinct", x@jc, rsd))
          })
2224
2225 #' @details
2226 #' \code{countDistinct}: Returns the number of distinct items in a group.
2227 #'
2228 #' @rdname column_aggregate_functions
2229 #' @aliases countDistinct countDistinct,Column-method
2230 #' @note countDistinct since 1.4.0
setMethod("countDistinct",
          signature(x = "Column"),
          function(x, ...) {
            # Validate each extra argument is a Column and collect its Java ref.
            # inherits() replaces the fragile class(x) == "Column" comparison
            # (class() can return a vector of length > 1). The lambda parameter
            # is renamed to avoid shadowing the outer x.
            jcols <- lapply(list(...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "countDistinct", x@jc,
                              jcols)
            column(jc)
          })
2242
2243 #' @details
2244 #' \code{concat}: Concatenates multiple input columns together into a single column.
2245 #' The function works with strings, binary and compatible array columns.
2246 #'
2247 #' @rdname column_collection_functions
2248 #' @aliases concat concat,Column-method
2249 #' @note concat since 1.5.0
setMethod("concat",
          signature(x = "Column"),
          function(x, ...) {
            # Validate every input is a Column and collect the Java refs.
            # inherits() replaces the fragile class(x) == "Column" comparison
            # (class() can return a vector of length > 1). The lambda parameter
            # is renamed to avoid shadowing the outer x.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "concat", jcols)
            column(jc)
          })
2260
2261 #' @details
2262 #' \code{greatest}: Returns the greatest value of the list of column names, skipping null values.
2263 #' This function takes at least 2 parameters. It will return null if all parameters are null.
2264 #'
2265 #' @rdname column_nonaggregate_functions
2266 #' @aliases greatest greatest,Column-method
2267 #' @note greatest since 1.5.0
setMethod("greatest",
          signature(x = "Column"),
          function(x, ...) {
            # greatest() requires at least two columns.
            stopifnot(length(list(...)) > 0)
            # inherits() replaces the fragile class(x) == "Column" comparison
            # (class() can return a vector of length > 1). The lambda parameter
            # is renamed to avoid shadowing the outer x.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "greatest", jcols)
            column(jc)
          })
2279
2280 #' @details
2281 #' \code{least}: Returns the least value of the list of column names, skipping null values.
2282 #' This function takes at least 2 parameters. It will return null if all parameters are null.
2283 #'
2284 #' @rdname column_nonaggregate_functions
2285 #' @aliases least least,Column-method
2286 #' @note least since 1.5.0
setMethod("least",
          signature(x = "Column"),
          function(x, ...) {
            # least() requires at least two columns.
            stopifnot(length(list(...)) > 0)
            # inherits() replaces the fragile class(x) == "Column" comparison
            # (class() can return a vector of length > 1). The lambda parameter
            # is renamed to avoid shadowing the outer x.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "least", jcols)
            column(jc)
          })
2298
2299 #' @details
2300 #' \code{n_distinct}: Returns the number of distinct items in a group.
2301 #'
2302 #' @rdname column_aggregate_functions
2303 #' @aliases n_distinct n_distinct,Column-method
2304 #' @note n_distinct since 1.4.0
setMethod("n_distinct", signature(x = "Column"),
          function(x, ...) {
            # Alias: delegate directly to countDistinct().
            countDistinct(x, ...)
          })
2309
2310 #' @rdname count
2311 #' @name n
2312 #' @aliases n,Column-method
2313 #' @examples \dontrun{n(df$c)}
2314 #' @note n since 1.4.0
setMethod("n", signature(x = "Column"),
          function(x) {
            # Alias: delegate directly to count().
            count(x)
          })
2319
2320 #' @details
2321 #' \code{date_format}: Converts a date/timestamp/string to a value of string in the format
2322 #' specified by the date format given by the second argument. A pattern could be for instance
2323 #' \code{dd.MM.yyyy} and could return a string like '18.03.1993'. All
2324 #' pattern letters of \code{java.time.format.DateTimeFormatter} can be used.
#' Note: Whenever possible, use specialized functions like \code{year}. These benefit from a
#' specialized implementation.
2327 #'
2328 #' @rdname column_datetime_diff_functions
2329 #'
2330 #' @aliases date_format date_format,Column,character-method
2331 #' @note date_format since 1.5.0
setMethod("date_format", signature(y = "Column", x = "character"),
          function(y, x) {
            # x is the datetime pattern string passed through to the JVM side.
            column(callJStatic("org.apache.spark.sql.functions", "date_format", y@jc, x))
          })
2337
# Virtual class union so one method signature can accept a schema given as a
# DDL string (character), a structType object, or a Column (e.g. produced by
# schema_of_json()/schema_of_csv()). Used by from_json() and from_csv().
setClassUnion("characterOrstructTypeOrColumn", c("character", "structType", "Column"))
2339
2340 #' @details
2341 #' \code{from_json}: Parses a column containing a JSON string into a Column of \code{structType}
2342 #' with the specified \code{schema} or array of \code{structType} if \code{as.json.array} is set
2343 #' to \code{TRUE}. If the string is unparseable, the Column will contain the value NA.
2344 #'
2345 #' @rdname column_collection_functions
2346 #' @param as.json.array indicating if input string is JSON array of objects or a single object.
2347 #' @aliases from_json from_json,Column,characterOrstructTypeOrColumn-method
2348 #' @examples
2349 #'
2350 #' \dontrun{
2351 #' df2 <- sql("SELECT named_struct('date', cast('2000-01-01' as date)) as d")
2352 #' df2 <- mutate(df2, d2 = to_json(df2$d, dateFormat = 'dd/MM/yyyy'))
2353 #' schema <- structType(structField("date", "string"))
2354 #' head(select(df2, from_json(df2$d2, schema, dateFormat = 'dd/MM/yyyy')))
2355 #' df2 <- sql("SELECT named_struct('name', 'Bob') as people")
2356 #' df2 <- mutate(df2, people_json = to_json(df2$people))
2357 #' schema <- structType(structField("name", "string"))
2358 #' head(select(df2, from_json(df2$people_json, schema)))
2359 #' head(select(df2, from_json(df2$people_json, "name STRING")))
2360 #' head(select(df2, from_json(df2$people_json, schema_of_json(head(df2)$people_json))))}
2361 #' @note from_json since 2.2.0
setMethod("from_json", signature(x = "Column", schema = "characterOrstructTypeOrColumn"),
          function(x, schema, as.json.array = FALSE, ...) {
            # Normalize the schema argument to a JVM-side object:
            # DDL string -> structType -> jobj; structType -> jobj; Column -> jc.
            # inherits() replaces class(schema) == "..." because class() may
            # return a vector of length > 1.
            if (is.character(schema)) {
              jschema <- structType(schema)$jobj
            } else if (inherits(schema, "structType")) {
              jschema <- schema$jobj
            } else {
              jschema <- schema@jc
            }

            if (as.json.array) {
              # This case is R-specifically different. Unlike Scala and Python side,
              # R side has 'as.json.array' option to indicate if the schema should be
              # treated as struct or element type of array in order to make it more
              # R-friendly.
              if (inherits(schema, "Column")) {
                jschema <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
                                       "createArrayType",
                                       jschema)
              } else {
                jschema <- callJStatic("org.apache.spark.sql.types.DataTypes",
                                       "createArrayType",
                                       jschema)
              }
            }
            # Remaining named args in ... become string parser options.
            options <- varargsToStrEnv(...)
            jc <- callJStatic("org.apache.spark.sql.functions",
                              "from_json",
                              x@jc, jschema, options)
            column(jc)
          })
2393
2394 #' @details
2395 #' \code{schema_of_json}: Parses a JSON string and infers its schema in DDL format.
2396 #'
2397 #' @rdname column_collection_functions
2398 #' @aliases schema_of_json schema_of_json,characterOrColumn-method
2399 #' @examples
2400 #'
2401 #' \dontrun{
2402 #' json <- "{\"name\":\"Bob\"}"
2403 #' df <- sql("SELECT * FROM range(1)")
2404 #' head(select(df, schema_of_json(json)))}
2405 #' @note schema_of_json since 3.0.0
setMethod("schema_of_json", signature(x = "characterOrColumn"),
          function(x, ...) {
            # Accept either a literal JSON string or a Column; lift strings into
            # a literal Column. is.character() replaces the fragile
            # class(x) == "character" comparison (class() can return > 1 element).
            if (is.character(x)) {
              col <- callJStatic("org.apache.spark.sql.functions", "lit", x)
            } else {
              col <- x@jc
            }
            # Named args in ... become string parser options.
            options <- varargsToStrEnv(...)
            jc <- callJStatic("org.apache.spark.sql.functions",
                              "schema_of_json",
                              col, options)
            column(jc)
          })
2419
2420 #' @details
2421 #' \code{from_csv}: Parses a column containing a CSV string into a Column of \code{structType}
2422 #' with the specified \code{schema}.
2423 #' If the string is unparseable, the Column will contain the value NA.
2424 #'
2425 #' @rdname column_collection_functions
2426 #' @aliases from_csv from_csv,Column,characterOrstructTypeOrColumn-method
2427 #' @examples
2428 #'
2429 #' \dontrun{
2430 #' csv <- "Amsterdam,2018"
2431 #' df <- sql(paste0("SELECT '", csv, "' as csv"))
2432 #' schema <- "city STRING, year INT"
2433 #' head(select(df, from_csv(df$csv, schema)))
2434 #' head(select(df, from_csv(df$csv, structType(schema))))
2435 #' head(select(df, from_csv(df$csv, schema_of_csv(csv))))}
2436 #' @note from_csv since 3.0.0
setMethod("from_csv", signature(x = "Column", schema = "characterOrstructTypeOrColumn"),
          function(x, schema, ...) {
            # A structType schema is first rendered to its DDL string form.
            # inherits() replaces class(schema) == "structType" because class()
            # may return a vector of length > 1.
            if (inherits(schema, "structType")) {
              schema <- callJMethod(schema$jobj, "toDDL")
            }

            # DDL strings are lifted into a literal Column; Columns pass through.
            if (is.character(schema)) {
              jschema <- callJStatic("org.apache.spark.sql.functions", "lit", schema)
            } else {
              jschema <- schema@jc
            }
            # Named args in ... become string parser options.
            options <- varargsToStrEnv(...)
            jc <- callJStatic("org.apache.spark.sql.functions",
                              "from_csv",
                              x@jc, jschema, options)
            column(jc)
          })
2454
2455 #' @details
2456 #' \code{schema_of_csv}: Parses a CSV string and infers its schema in DDL format.
2457 #'
2458 #' @rdname column_collection_functions
2459 #' @aliases schema_of_csv schema_of_csv,characterOrColumn-method
2460 #' @examples
2461 #'
2462 #' \dontrun{
2463 #' csv <- "Amsterdam,2018"
2464 #' df <- sql("SELECT * FROM range(1)")
2465 #' head(select(df, schema_of_csv(csv)))}
2466 #' @note schema_of_csv since 3.0.0
setMethod("schema_of_csv", signature(x = "characterOrColumn"),
          function(x, ...) {
            # Accept either a literal CSV string or a Column; lift strings into
            # a literal Column. is.character() replaces the fragile
            # class(x) == "character" comparison (class() can return > 1 element).
            if (is.character(x)) {
              col <- callJStatic("org.apache.spark.sql.functions", "lit", x)
            } else {
              col <- x@jc
            }
            # Named args in ... become string parser options.
            options <- varargsToStrEnv(...)
            jc <- callJStatic("org.apache.spark.sql.functions",
                              "schema_of_csv",
                              col, options)
            column(jc)
          })
2480
2481 #' @details
2482 #' \code{from_utc_timestamp}: This is a common function for databases supporting TIMESTAMP WITHOUT
2483 #' TIMEZONE. This function takes a timestamp which is timezone-agnostic, and interprets it as a
2484 #' timestamp in UTC, and renders that timestamp as a timestamp in the given time zone.
2485 #' However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not
2486 #' timezone-agnostic. So in Spark this function just shift the timestamp value from UTC timezone to
2487 #' the given timezone.
2488 #' This function may return confusing result if the input is a string with timezone, e.g.
2489 #' (\code{2018-03-13T06:18:23+00:00}). The reason is that, Spark firstly cast the string to
2490 #' timestamp according to the timezone in the string, and finally display the result by converting
2491 #' the timestamp to string according to the session local timezone.
2492 #'
2493 #' @rdname column_datetime_diff_functions
2494 #'
2495 #' @aliases from_utc_timestamp from_utc_timestamp,Column,character-method
2496 #' @examples
2497 #'
2498 #' \dontrun{
2499 #' tmp <- mutate(df, from_utc = from_utc_timestamp(df$time, "PST"),
2500 #' to_utc = to_utc_timestamp(df$time, "PST"))
2501 #' head(tmp)}
2502 #' @note from_utc_timestamp since 1.5.0
setMethod("from_utc_timestamp", signature(y = "Column", x = "character"),
          function(y, x) {
            # x is the target time zone string passed through to the JVM side.
            column(callJStatic("org.apache.spark.sql.functions", "from_utc_timestamp", y@jc, x))
          })
2508
2509 #' @details
2510 #' \code{instr}: Locates the position of the first occurrence of a substring (\code{x})
2511 #' in the given string column (\code{y}). Returns null if either of the arguments are null.
2512 #' Note: The position is not zero based, but 1 based index. Returns 0 if the substring
2513 #' could not be found in the string column.
2514 #'
2515 #' @rdname column_string_functions
2516 #' @aliases instr instr,Column,character-method
2517 #' @examples
2518 #'
2519 #' \dontrun{
2520 #' tmp <- mutate(df, s1 = instr(df$Sex, "m"), s2 = instr(df$Sex, "M"),
2521 #' s3 = locate("m", df$Sex), s4 = locate("m", df$Sex, pos = 4))
2522 #' head(tmp)}
2523 #' @note instr since 1.5.0
setMethod("instr", signature(y = "Column", x = "character"),
          function(y, x) {
            # x is the substring to locate within string column y (1-based result).
            column(callJStatic("org.apache.spark.sql.functions", "instr", y@jc, x))
          })
2529
2530 #' @details
2531 #' \code{next_day}: Given a date column, returns the first date which is later than the value of
2532 #' the date column that is on the specified day of the week. For example,
2533 #' \code{next_day("2015-07-27", "Sunday")} returns 2015-08-02 because that is the first Sunday
2534 #' after 2015-07-27. Day of the week parameter is case insensitive, and accepts first three or
2535 #' two characters: "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun".
2536 #'
2537 #' @rdname column_datetime_diff_functions
2538 #' @aliases next_day next_day,Column,character-method
2539 #' @note next_day since 1.5.0
setMethod("next_day", signature(y = "Column", x = "character"),
          function(y, x) {
            # x is the day-of-week name (e.g. "Sun") passed through to the JVM side.
            column(callJStatic("org.apache.spark.sql.functions", "next_day", y@jc, x))
          })
2545
2546 #' @details
2547 #' \code{to_utc_timestamp}: This is a common function for databases supporting TIMESTAMP WITHOUT
2548 #' TIMEZONE. This function takes a timestamp which is timezone-agnostic, and interprets it as a
2549 #' timestamp in the given timezone, and renders that timestamp as a timestamp in UTC.
2550 #' However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not
2551 #' timezone-agnostic. So in Spark this function just shift the timestamp value from the given
2552 #' timezone to UTC timezone.
2553 #' This function may return confusing result if the input is a string with timezone, e.g.
2554 #' (\code{2018-03-13T06:18:23+00:00}). The reason is that, Spark firstly cast the string to
2555 #' timestamp according to the timezone in the string, and finally display the result by converting
2556 #' the timestamp to string according to the session local timezone.
2557 #'
2558 #' @rdname column_datetime_diff_functions
2559 #' @aliases to_utc_timestamp to_utc_timestamp,Column,character-method
2560 #' @note to_utc_timestamp since 1.5.0
setMethod("to_utc_timestamp", signature(y = "Column", x = "character"),
          function(y, x) {
            # x is the source time zone string passed through to the JVM side.
            column(callJStatic("org.apache.spark.sql.functions", "to_utc_timestamp", y@jc, x))
          })
2566
2567 #' @details
2568 #' \code{add_months}: Returns the date that is numMonths (\code{x}) after startDate (\code{y}).
2569 #'
2570 #' @rdname column_datetime_diff_functions
2571 #' @aliases add_months add_months,Column,numeric-method
2572 #' @examples
2573 #'
2574 #' \dontrun{
2575 #' tmp <- mutate(df, t1 = add_months(df$time, 1),
2576 #' t2 = date_add(df$time, 2),
2577 #' t3 = date_sub(df$time, 3),
2578 #' t4 = next_day(df$time, "Sun"))
2579 #' head(tmp)}
2580 #' @note add_months since 1.5.0
setMethod("add_months", signature(y = "Column", x = "numeric"),
          function(y, x) {
            # The JVM side expects an integer month count; coerce R doubles.
            column(callJStatic("org.apache.spark.sql.functions", "add_months", y@jc, as.integer(x)))
          })
2586
2587 #' @details
2588 #' \code{date_add}: Returns the date that is \code{x} days after.
2589 #'
2590 #' @rdname column_datetime_diff_functions
2591 #' @aliases date_add date_add,Column,numeric-method
2592 #' @note date_add since 1.5.0
setMethod("date_add", signature(y = "Column", x = "numeric"),
          function(y, x) {
            # The JVM side expects an integer day count; coerce R doubles.
            column(callJStatic("org.apache.spark.sql.functions", "date_add", y@jc, as.integer(x)))
          })
2598
2599 #' @details
2600 #' \code{date_sub}: Returns the date that is \code{x} days before.
2601 #'
2602 #' @rdname column_datetime_diff_functions
2603 #'
2604 #' @aliases date_sub date_sub,Column,numeric-method
2605 #' @note date_sub since 1.5.0
setMethod("date_sub", signature(y = "Column", x = "numeric"),
          function(y, x) {
            # The JVM side expects an integer day count; coerce R doubles.
            column(callJStatic("org.apache.spark.sql.functions", "date_sub", y@jc, as.integer(x)))
          })
2611
2612 #' @details
2613 #' \code{format_number}: Formats numeric column \code{y} to a format like '#,###,###.##',
2614 #' rounded to \code{x} decimal places with HALF_EVEN round mode, and returns the result
2615 #' as a string column.
2616 #' If \code{x} is 0, the result has no decimal point or fractional part.
2617 #' If \code{x} < 0, the result will be null.
2618 #'
2619 #' @rdname column_string_functions
2620 #' @aliases format_number format_number,Column,numeric-method
2621 #' @examples
2622 #'
2623 #' \dontrun{
2624 #' tmp <- mutate(df, v1 = df$Freq/3)
2625 #' head(select(tmp, format_number(tmp$v1, 0), format_number(tmp$v1, 2),
2626 #' format_string("%4.2f %s", tmp$v1, tmp$Sex)), 10)}
2627 #' @note format_number since 1.5.0
setMethod("format_number", signature(y = "Column", x = "numeric"),
          function(y, x) {
            # The JVM side expects an integer number of decimal places.
            column(callJStatic("org.apache.spark.sql.functions",
                               "format_number",
                               y@jc, as.integer(x)))
          })
2635
2636 #' @details
2637 #' \code{sha2}: Calculates the SHA-2 family of hash functions of a binary column and
2638 #' returns the value as a hex string. The second argument \code{x} specifies the number
2639 #' of bits, and is one of 224, 256, 384, or 512.
2640 #'
2641 #' @rdname column_misc_functions
2642 #' @aliases sha2 sha2,Column,numeric-method
2643 #' @note sha2 since 1.5.0
setMethod("sha2", signature(y = "Column", x = "numeric"),
          function(y, x) {
            # x selects the SHA-2 bit width (224/256/384/512); coerce to integer.
            column(callJStatic("org.apache.spark.sql.functions", "sha2", y@jc, as.integer(x)))
          })
2649
2650 #' @details
2651 #' \code{shiftLeft}: Shifts the given value numBits left. If the given value is a long value,
2652 #' this function will return a long value else it will return an integer value.
2653 #'
2654 #' @rdname column_math_functions
2655 #' @aliases shiftLeft shiftLeft,Column,numeric-method
2656 #' @note shiftLeft since 1.5.0
setMethod("shiftLeft", signature(y = "Column", x = "numeric"),
          function(y, x) {
            # numBits is coerced to integer before the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "shiftLeft",
                               y@jc, as.integer(x)))
          })
2664
2665 #' @details
2666 #' \code{shiftRight}: (Signed) shifts the given value numBits right. If the given value is a long
2667 #' value, it will return a long value else it will return an integer value.
2668 #'
2669 #' @rdname column_math_functions
2670 #' @aliases shiftRight shiftRight,Column,numeric-method
2671 #' @note shiftRight since 1.5.0
setMethod("shiftRight", signature(y = "Column", x = "numeric"),
          function(y, x) {
            # numBits is coerced to integer before the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "shiftRight",
                               y@jc, as.integer(x)))
          })
2679
2680 #' @details
#' \code{shiftRightUnsigned}: (Unsigned) shifts the given value numBits right. If the given value is
2682 #' a long value, it will return a long value else it will return an integer value.
2683 #'
2684 #' @rdname column_math_functions
2685 #' @aliases shiftRightUnsigned shiftRightUnsigned,Column,numeric-method
2686 #' @note shiftRightUnsigned since 1.5.0
setMethod("shiftRightUnsigned", signature(y = "Column", x = "numeric"),
          function(y, x) {
            # numBits is coerced to integer before the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "shiftRightUnsigned",
                               y@jc, as.integer(x)))
          })
2694
2695 #' @details
2696 #' \code{concat_ws}: Concatenates multiple input string columns together into a single
2697 #' string column, using the given separator.
2698 #'
2699 #' @param sep separator to use.
2700 #' @rdname column_string_functions
2701 #' @aliases concat_ws concat_ws,character,Column-method
2702 #' @examples
2703 #'
2704 #' \dontrun{
2705 #' # concatenate strings
2706 #' tmp <- mutate(df, s1 = concat_ws("_", df$Class, df$Sex),
2707 #' s2 = concat_ws("+", df$Class, df$Sex, df$Age, df$Survived))
2708 #' head(tmp)}
2709 #' @note concat_ws since 1.5.0
setMethod("concat_ws", signature(sep = "character", x = "Column"),
          function(sep, x, ...) {
            # Collect the Java column references for all input Columns.
            jcols <- lapply(list(x, ...), function(col) col@jc)
            column(callJStatic("org.apache.spark.sql.functions", "concat_ws",
                               sep, jcols))
          })
2716
2717 #' @details
2718 #' \code{conv}: Converts a number in a string column from one base to another.
2719 #'
2720 #' @param fromBase base to convert from.
2721 #' @param toBase base to convert to.
2722 #' @rdname column_math_functions
2723 #' @aliases conv conv,Column,numeric,numeric-method
2724 #' @note conv since 1.5.0
setMethod("conv", signature(x = "Column", fromBase = "numeric", toBase = "numeric"),
          function(x, fromBase, toBase) {
            # Both bases are coerced to integers for the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "conv",
                               x@jc, as.integer(fromBase), as.integer(toBase)))
          })
2734
2735 #' @details
2736 #' \code{expr}: Parses the expression string into the column that it represents, similar to
2737 #' \code{SparkDataFrame.selectExpr}
2738 #'
2739 #' @rdname column_nonaggregate_functions
2740 #' @aliases expr expr,character-method
2741 #' @note expr since 1.5.0
setMethod("expr", signature(x = "character"),
          function(x) {
            # Parse the SQL expression string on the JVM side.
            column(callJStatic("org.apache.spark.sql.functions", "expr", x))
          })
2747
2748 #' @details
2749 #' \code{format_string}: Formats the arguments in printf-style and returns the result
2750 #' as a string column.
2751 #'
2752 #' @param format a character object of format strings.
2753 #' @rdname column_string_functions
2754 #' @aliases format_string format_string,character,Column-method
2755 #' @note format_string since 1.5.0
setMethod("format_string", signature(format = "character", x = "Column"),
          function(format, x, ...) {
            # Collect the Java column references for all input Columns.
            jcols <- lapply(list(x, ...), function(col) col@jc)
            column(callJStatic("org.apache.spark.sql.functions", "format_string",
                               format, jcols))
          })
2764
2765 #' @details
2766 #' \code{from_unixtime}: Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC)
2767 #' to a string representing the timestamp of that moment in the current system time zone in the JVM
2768 #' in the given format.
2769 #' See \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{
2770 #' Datetime Pattern} for available options.
2771 #'
2772 #' @rdname column_datetime_functions
2773 #'
2774 #' @aliases from_unixtime from_unixtime,Column-method
2775 #' @examples
2776 #'
2777 #' \dontrun{
2778 #' tmp <- mutate(df, to_unix = unix_timestamp(df$time),
2779 #' to_unix2 = unix_timestamp(df$time, 'yyyy-MM-dd HH'),
2780 #' from_unix = from_unixtime(unix_timestamp(df$time)),
2781 #' from_unix2 = from_unixtime(unix_timestamp(df$time), 'yyyy-MM-dd HH:mm'))
2782 #' head(tmp)}
2783 #' @note from_unixtime since 1.5.0
setMethod("from_unixtime", signature(x = "Column"),
          function(x, format = "yyyy-MM-dd HH:mm:ss") {
            # format defaults to the full timestamp pattern.
            column(callJStatic("org.apache.spark.sql.functions", "from_unixtime",
                               x@jc, format))
          })
2791
2792 #' @details
2793 #' \code{window}: Bucketizes rows into one or more time windows given a timestamp specifying column.
2794 #' Window starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window
2795 #' [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in
2796 #' the order of months are not supported. It returns an output column of struct called 'window'
2797 #' by default with the nested columns 'start' and 'end'
2798 #'
2799 #' @param windowDuration a string specifying the width of the window, e.g. '1 second',
2800 #' '1 day 12 hours', '2 minutes'. Valid interval strings are 'week',
2801 #' 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'. Note that
2802 #' the duration is a fixed length of time, and does not vary over time
2803 #' according to a calendar. For example, '1 day' always means 86,400,000
2804 #' milliseconds, not a calendar day.
2805 #' @param slideDuration a string specifying the sliding interval of the window. Same format as
2806 #' \code{windowDuration}. A new window will be generated every
2807 #' \code{slideDuration}. Must be less than or equal to
2808 #' the \code{windowDuration}. This duration is likewise absolute, and does not
2809 #' vary according to a calendar.
2810 #' @param startTime the offset with respect to 1970-01-01 00:00:00 UTC with which to start
2811 #' window intervals. For example, in order to have hourly tumbling windows
2812 #' that start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide
2813 #' \code{startTime} as \code{"15 minutes"}.
2814 #' @rdname column_datetime_functions
2815 #' @aliases window window,Column-method
2816 #' @examples
2817 #'
2818 #' \dontrun{
2819 #' # One minute windows every 15 seconds 10 seconds after the minute, e.g. 09:00:10-09:01:10,
2820 #' # 09:00:25-09:01:25, 09:00:40-09:01:40, ...
2821 #' window(df$time, "1 minute", "15 seconds", "10 seconds")
2822 #'
2823 #' # One minute tumbling windows 15 seconds after the minute, e.g. 09:00:15-09:01:15,
2824 #' # 09:01:15-09:02:15...
2825 #' window(df$time, "1 minute", startTime = "15 seconds")
2826 #'
2827 #' # Thirty-second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ...
2828 #' window(df$time, "30 seconds", "10 seconds")}
2829 #' @note window since 2.0.0
setMethod("window", signature(x = "Column"),
          function(x, windowDuration, slideDuration = NULL, startTime = NULL) {
            # Dispatch to the JVM overload matching the optional arguments
            # the caller supplied. windowDuration is always required.
            stopifnot(is.character(windowDuration))
            if (!is.null(slideDuration) && !is.null(startTime)) {
              # Sliding windows with an explicit start offset.
              stopifnot(is.character(slideDuration) && is.character(startTime))
              jc <- callJStatic("org.apache.spark.sql.functions",
                                "window",
                                x@jc, windowDuration, slideDuration, startTime)
            } else if (!is.null(slideDuration)) {
              # Sliding windows with the default start offset.
              stopifnot(is.character(slideDuration))
              jc <- callJStatic("org.apache.spark.sql.functions",
                                "window",
                                x@jc, windowDuration, slideDuration)
            } else if (!is.null(startTime)) {
              # Tumbling windows with a start offset: windowDuration is passed
              # again as the slide duration so windows do not overlap.
              stopifnot(is.character(startTime))
              jc <- callJStatic("org.apache.spark.sql.functions",
                                "window",
                                x@jc, windowDuration, windowDuration, startTime)
            } else {
              # Plain tumbling windows.
              jc <- callJStatic("org.apache.spark.sql.functions",
                                "window",
                                x@jc, windowDuration)
            }
            column(jc)
          })
2855
2856 #' @details
2857 #' \code{locate}: Locates the position of the first occurrence of substr.
2858 #' Note: The position is not zero based, but 1 based index. Returns 0 if substr
2859 #' could not be found in str.
2860 #'
2861 #' @param substr a character string to be matched.
2862 #' @param str a Column where matches are sought for each entry.
2863 #' @rdname column_string_functions
2864 #' @aliases locate locate,character,Column-method
2865 #' @note locate since 1.5.0
setMethod("locate", signature(substr = "character", str = "Column"),
          function(substr, str, pos = 1) {
            # pos is a 1-based start position, coerced to integer for the JVM.
            column(callJStatic("org.apache.spark.sql.functions", "locate",
                               substr, str@jc, as.integer(pos)))
          })
2873
2874 #' @details
2875 #' \code{lpad}: Left-padded with pad to a length of len.
2876 #'
2877 #' @param pad a character string to be padded with.
2878 #' @rdname column_string_functions
2879 #' @aliases lpad lpad,Column,numeric,character-method
2880 #' @note lpad since 1.5.0
setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),
          function(x, len, pad) {
            # Target length is coerced to integer for the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "lpad",
                               x@jc, as.integer(len), pad))
          })
2888
2889 #' @details
2890 #' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.)
2891 #' samples uniformly distributed in [0.0, 1.0).
2892 #' Note: the function is non-deterministic in general case.
2893 #'
2894 #' @rdname column_nonaggregate_functions
2895 #' @param seed a random seed. Can be missing.
2896 #' @aliases rand rand,missing-method
2897 #' @examples
2898 #'
2899 #' \dontrun{
2900 #' tmp <- mutate(df, r1 = rand(), r2 = rand(10), r3 = randn(), r4 = randn(10))
2901 #' head(tmp)}
2902 #' @note rand since 1.5.0
setMethod("rand", signature(seed = "missing"),
          function(seed) {
            # No seed supplied: let Spark pick one.
            column(callJStatic("org.apache.spark.sql.functions", "rand"))
          })
2908
2909 #' @rdname column_nonaggregate_functions
2910 #' @aliases rand,numeric-method
2911 #' @note rand(numeric) since 1.5.0
setMethod("rand", signature(seed = "numeric"),
          function(seed) {
            # The seed is truncated to an integer before the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "rand",
                               as.integer(seed)))
          })
2917
2918 #' @details
2919 #' \code{randn}: Generates a column with independent and identically distributed (i.i.d.) samples
2920 #' from the standard normal distribution.
2921 #' Note: the function is non-deterministic in general case.
2922 #'
2923 #' @rdname column_nonaggregate_functions
2924 #' @aliases randn randn,missing-method
2925 #' @note randn since 1.5.0
setMethod("randn", signature(seed = "missing"),
          function(seed) {
            # No seed supplied: let Spark pick one.
            column(callJStatic("org.apache.spark.sql.functions", "randn"))
          })
2931
2932 #' @rdname column_nonaggregate_functions
2933 #' @aliases randn,numeric-method
2934 #' @note randn(numeric) since 1.5.0
setMethod("randn", signature(seed = "numeric"),
          function(seed) {
            # The seed is truncated to an integer before the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "randn",
                               as.integer(seed)))
          })
2940
2941 #' @details
2942 #' \code{regexp_extract}: Extracts a specific \code{idx} group identified by a Java regex,
2943 #' from the specified string column. If the regex did not match, or the specified group did
2944 #' not match, an empty string is returned.
2945 #'
2946 #' @param pattern a regular expression.
2947 #' @param idx a group index.
2948 #' @rdname column_string_functions
2949 #' @aliases regexp_extract regexp_extract,Column,character,numeric-method
2950 #' @examples
2951 #'
2952 #' \dontrun{
2953 #' tmp <- mutate(df, s1 = regexp_extract(df$Class, "(\\d+)\\w+", 1),
2954 #' s2 = regexp_extract(df$Sex, "^(\\w)\\w+", 1),
2955 #' s3 = regexp_replace(df$Class, "\\D+", ""),
2956 #' s4 = substring_index(df$Sex, "a", 1),
2957 #' s5 = substring_index(df$Sex, "a", -1),
2958 #' s6 = translate(df$Sex, "ale", ""),
2959 #' s7 = translate(df$Sex, "a", "-"))
2960 #' head(tmp)}
2961 #' @note regexp_extract since 1.5.0
setMethod("regexp_extract",
          signature(x = "Column", pattern = "character", idx = "numeric"),
          function(x, pattern, idx) {
            # The group index is coerced to integer for the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "regexp_extract",
                               x@jc, pattern, as.integer(idx)))
          })
2970
2971 #' @details
2972 #' \code{regexp_replace}: Replaces all substrings of the specified string value that
2973 #' match regexp with rep.
2974 #'
2975 #' @param replacement a character string that a matched \code{pattern} is replaced with.
2976 #' @rdname column_string_functions
2977 #' @aliases regexp_replace regexp_replace,Column,character,character-method
2978 #' @note regexp_replace since 1.5.0
setMethod("regexp_replace",
          signature(x = "Column", pattern = "character", replacement = "character"),
          function(x, pattern, replacement) {
            # Delegate the regex replacement to the JVM-side function.
            column(callJStatic("org.apache.spark.sql.functions", "regexp_replace",
                               x@jc, pattern, replacement))
          })
2987
2988 #' @details
2989 #' \code{rpad}: Right-padded with pad to a length of len.
2990 #'
2991 #' @rdname column_string_functions
2992 #' @aliases rpad rpad,Column,numeric,character-method
2993 #' @note rpad since 1.5.0
setMethod("rpad", signature(x = "Column", len = "numeric", pad = "character"),
          function(x, len, pad) {
            # Target length is coerced to integer for the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "rpad",
                               x@jc, as.integer(len), pad))
          })
3001
3002 #' @details
3003 #' \code{substring_index}: Returns the substring from string (\code{x}) before \code{count}
#' occurrences of the delimiter (\code{delim}). If \code{count} is positive, everything to the left
#' of the final delimiter (counting from left) is returned. If \code{count} is negative, everything
#' to the right of the final delimiter (counting from the right) is returned. \code{substring_index}
3007 #' performs a case-sensitive match when searching for the delimiter.
3008 #'
3009 #' @param delim a delimiter string.
3010 #' @param count number of occurrences of \code{delim} before the substring is returned.
3011 #' A positive number means counting from the left, while negative means
3012 #' counting from the right.
3013 #' @rdname column_string_functions
3014 #' @aliases substring_index substring_index,Column,character,numeric-method
3015 #' @note substring_index since 1.5.0
setMethod("substring_index",
          signature(x = "Column", delim = "character", count = "numeric"),
          function(x, delim, count) {
            # The occurrence count is coerced to integer for the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "substring_index",
                               x@jc, delim, as.integer(count)))
          })
3024
3025 #' @details
3026 #' \code{translate}: Translates any character in the src by a character in replaceString.
#' The characters in replaceString correspond to the characters in matchingString.
#' The translation occurs whenever a character in the string matches a character
#' in matchingString.
3030 #'
3031 #' @param matchingString a source string where each character will be translated.
3032 #' @param replaceString a target string where each \code{matchingString} character will
3033 #' be replaced by the character in \code{replaceString}
3034 #' at the same location, if any.
3035 #' @rdname column_string_functions
3036 #' @aliases translate translate,Column,character,character-method
3037 #' @note translate since 1.5.0
setMethod("translate",
          signature(x = "Column", matchingString = "character", replaceString = "character"),
          function(x, matchingString, replaceString) {
            # Delegate the character-by-character translation to the JVM.
            column(callJStatic("org.apache.spark.sql.functions", "translate",
                               x@jc, matchingString, replaceString))
          })
3045
3046 #' @details
3047 #' \code{unix_timestamp}: Gets current Unix timestamp in seconds.
3048 #'
3049 #' @rdname column_datetime_functions
3050 #' @aliases unix_timestamp unix_timestamp,missing,missing-method
3051 #' @note unix_timestamp since 1.5.0
setMethod("unix_timestamp", signature(x = "missing", format = "missing"),
          function(x, format) {
            # No arguments: current Unix timestamp in seconds.
            column(callJStatic("org.apache.spark.sql.functions", "unix_timestamp"))
          })
3057
3058 #' @rdname column_datetime_functions
3059 #' @aliases unix_timestamp,Column,missing-method
3060 #' @note unix_timestamp(Column) since 1.5.0
setMethod("unix_timestamp", signature(x = "Column", format = "missing"),
          function(x, format) {
            # Column only: parse with Spark's default timestamp format.
            column(callJStatic("org.apache.spark.sql.functions", "unix_timestamp",
                               x@jc))
          })
3066
3067 #' @rdname column_datetime_functions
3068 #' @aliases unix_timestamp,Column,character-method
3069 #' @note unix_timestamp(Column, character) since 1.5.0
setMethod("unix_timestamp", signature(x = "Column", format = "character"),
          function(x, format = "yyyy-MM-dd HH:mm:ss") {
            # Column plus an explicit (character) format string.
            column(callJStatic("org.apache.spark.sql.functions", "unix_timestamp",
                               x@jc, format))
          })
3075
3076 #' @details
3077 #' \code{when}: Evaluates a list of conditions and returns one of multiple possible result
3078 #' expressions. For unmatched expressions null is returned.
3079 #'
3080 #' @rdname column_nonaggregate_functions
3081 #' @param condition the condition to test on. Must be a Column expression.
3082 #' @param value result expression.
3083 #' @aliases when when,Column-method
3084 #' @examples
3085 #'
3086 #' \dontrun{
3087 #' tmp <- mutate(df, mpg_na = otherwise(when(df$mpg > 20, df$mpg), lit(NaN)),
3088 #' mpg2 = ifelse(df$mpg > 20 & df$am > 0, 0, 1),
3089 #' mpg3 = ifelse(df$mpg > 20, df$mpg, 20.0))
3090 #' head(tmp)
3091 #' tmp <- mutate(tmp, ind_na1 = is.nan(tmp$mpg_na), ind_na2 = isnan(tmp$mpg_na))
3092 #' head(select(tmp, coalesce(tmp$mpg_na, tmp$mpg)))
3093 #' head(select(tmp, nanvl(tmp$mpg_na, tmp$hp)))}
3094 #' @note when since 1.5.0
setMethod("when", signature(condition = "Column", value = "ANY"),
          function(condition, value) {
            jcondition <- condition@jc
            # Unwrap Column values to their Java references; pass other R values
            # through as-is. inherits() is used instead of class(value) == "Column"
            # because class() may return a vector of length > 1, which makes the
            # == comparison fragile.
            jvalue <- if (inherits(value, "Column")) value@jc else value
            jc <- callJStatic("org.apache.spark.sql.functions", "when",
                              jcondition, jvalue)
            column(jc)
          })
3102
3103 #' @details
3104 #' \code{ifelse}: Evaluates a list of conditions and returns \code{yes} if the conditions are
3105 #' satisfied. Otherwise \code{no} is returned for unmatched conditions.
3106 #'
3107 #' @rdname column_nonaggregate_functions
3108 #' @param test a Column expression that describes the condition.
3109 #' @param yes return values for \code{TRUE} elements of test.
3110 #' @param no return values for \code{FALSE} elements of test.
3111 #' @aliases ifelse ifelse,Column-method
3112 #' @note ifelse since 1.5.0
setMethod("ifelse",
          signature(test = "Column", yes = "ANY", no = "ANY"),
          function(test, yes, no) {
            jtest <- test@jc
            # Unwrap Column arguments to their Java references; pass other R
            # values through as-is. inherits() replaces the fragile
            # class(x) == "Column" check, which breaks when class() returns a
            # vector of length > 1.
            jyes <- if (inherits(yes, "Column")) yes@jc else yes
            jno <- if (inherits(no, "Column")) no@jc else no
            # Build when(test, yes).otherwise(no) on the JVM side.
            jc <- callJMethod(callJStatic("org.apache.spark.sql.functions",
                                          "when",
                                          jtest, jyes),
                              "otherwise", jno)
            column(jc)
          })
3125
###################### Window functions ######################
3127
3128 #' @details
3129 #' \code{cume_dist}: Returns the cumulative distribution of values within a window partition,
3130 #' i.e. the fraction of rows that are below the current row:
3131 #' (number of values before and including x) / (total number of rows in the partition).
3132 #' This is equivalent to the \code{CUME_DIST} function in SQL.
3133 #' The method should be used with no argument.
3134 #'
3135 #' @rdname column_window_functions
3136 #' @aliases cume_dist cume_dist,missing-method
3137 #' @note cume_dist since 1.6.0
setMethod("cume_dist",
          signature("missing"),
          function() {
            # Window function: takes no arguments.
            column(callJStatic("org.apache.spark.sql.functions", "cume_dist"))
          })
3144
3145 #' @details
3146 #' \code{dense_rank}: Returns the rank of rows within a window partition, without any gaps.
3147 #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking
3148 #' sequence when there are ties. That is, if you were ranking a competition using dense_rank
3149 #' and had three people tie for second place, you would say that all three were in second
#' place and that the next person came in third. Rank would give sequential numbers, so
#' the person that came in third place (after the ties) would register as coming in fifth.
3152 #' This is equivalent to the \code{DENSE_RANK} function in SQL.
3153 #' The method should be used with no argument.
3154 #'
3155 #' @rdname column_window_functions
3156 #' @aliases dense_rank dense_rank,missing-method
3157 #' @note dense_rank since 1.6.0
setMethod("dense_rank",
          signature("missing"),
          function() {
            # Window function: takes no arguments.
            column(callJStatic("org.apache.spark.sql.functions", "dense_rank"))
          })
3164
3165 #' @details
3166 #' \code{lag}: Returns the value that is \code{offset} rows before the current row, and
3167 #' \code{defaultValue} if there is less than \code{offset} rows before the current row. For example,
3168 #' an \code{offset} of one will return the previous row at any given point in the window partition.
3169 #' This is equivalent to the \code{LAG} function in SQL.
3170 #'
3171 #' @rdname column_window_functions
3172 #' @aliases lag lag,characterOrColumn-method
3173 #' @note lag since 1.6.0
setMethod("lag",
          signature(x = "characterOrColumn"),
          function(x, offset = 1, defaultValue = NULL) {
            # Accept either a column name (character) or a Column object.
            # inherits() replaces the fragile class(x) == "Column" check, which
            # breaks when class() returns a vector of length > 1.
            col <- if (inherits(x, "Column")) x@jc else x

            # offset is coerced to integer for the JVM call.
            jc <- callJStatic("org.apache.spark.sql.functions",
                              "lag", col, as.integer(offset), defaultValue)
            column(jc)
          })
3187
3188 #' @details
3189 #' \code{lead}: Returns the value that is \code{offset} rows after the current row, and
3190 #' \code{defaultValue} if there is less than \code{offset} rows after the current row.
3191 #' For example, an \code{offset} of one will return the next row at any given point
3192 #' in the window partition.
3193 #' This is equivalent to the \code{LEAD} function in SQL.
3194 #'
3195 #' @rdname column_window_functions
3196 #' @aliases lead lead,characterOrColumn,numeric-method
3197 #' @note lead since 1.6.0
setMethod("lead",
          signature(x = "characterOrColumn", offset = "numeric", defaultValue = "ANY"),
          function(x, offset = 1, defaultValue = NULL) {
            # Accept either a column name (character) or a Column object.
            # inherits() replaces the fragile class(x) == "Column" check, which
            # breaks when class() returns a vector of length > 1.
            col <- if (inherits(x, "Column")) x@jc else x

            # offset is coerced to integer for the JVM call.
            jc <- callJStatic("org.apache.spark.sql.functions",
                              "lead", col, as.integer(offset), defaultValue)
            column(jc)
          })
3211
3212 #' @details
3213 #' \code{ntile}: Returns the ntile group id (from 1 to n inclusive) in an ordered window
3214 #' partition. For example, if n is 4, the first quarter of the rows will get value 1, the second
3215 #' quarter will get 2, the third quarter will get 3, and the last quarter will get 4.
3216 #' This is equivalent to the \code{NTILE} function in SQL.
3217 #'
3218 #' @rdname column_window_functions
3219 #' @aliases ntile ntile,numeric-method
3220 #' @note ntile since 1.6.0
setMethod("ntile",
          signature(x = "numeric"),
          function(x) {
            # The number of buckets is coerced to integer for the JVM call.
            column(callJStatic("org.apache.spark.sql.functions", "ntile",
                               as.integer(x)))
          })
3227
3228 #' @details
3229 #' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window
3230 #' partition.
3231 #' This is computed by: (rank of row in its partition - 1) / (number of rows in the partition - 1).
3232 #' This is equivalent to the \code{PERCENT_RANK} function in SQL.
3233 #' The method should be used with no argument.
3234 #'
3235 #' @rdname column_window_functions
3236 #' @aliases percent_rank percent_rank,missing-method
3237 #' @note percent_rank since 1.6.0
setMethod("percent_rank",
          signature("missing"),
          function() {
            # Window function: takes no arguments.
            column(callJStatic("org.apache.spark.sql.functions", "percent_rank"))
          })
3244
3245 #' @details
3246 #' \code{rank}: Returns the rank of rows within a window partition.
3247 #' The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking
3248 #' sequence when there are ties. That is, if you were ranking a competition using dense_rank
3249 #' and had three people tie for second place, you would say that all three were in second
#' place and that the next person came in third. Rank would give sequential numbers, so
#' the person that came in third place (after the ties) would register as coming in fifth.
3252 #' This is equivalent to the \code{RANK} function in SQL.
3253 #' The method should be used with no argument.
3254 #'
3255 #' @rdname column_window_functions
3256 #' @aliases rank rank,missing-method
3257 #' @note rank since 1.6.0
setMethod("rank",
          signature(x = "missing"),
          function() {
            # Window function: takes no arguments.
            column(callJStatic("org.apache.spark.sql.functions", "rank"))
          })
3264
3265 #' @rdname column_window_functions
3266 #' @aliases rank,ANY-method
setMethod("rank",
          signature(x = "ANY"),
          function(x, ...) {
            # Fall back to base::rank for non-missing arguments so that the
            # standard R rank() behavior is preserved for ordinary vectors.
            base::rank(x, ...)
          })
3272
3273 #' @details
3274 #' \code{row_number}: Returns a sequential number starting at 1 within a window partition.
3275 #' This is equivalent to the \code{ROW_NUMBER} function in SQL.
3276 #' The method should be used with no argument.
3277 #'
3278 #' @rdname column_window_functions
3279 #' @aliases row_number row_number,missing-method
3280 #' @note row_number since 1.6.0
setMethod("row_number",
          signature("missing"),
          function() {
            # Window function: takes no arguments.
            column(callJStatic("org.apache.spark.sql.functions", "row_number"))
          })
3287
###################### Collection functions ######################
3289
3290 #' @details
3291 #' \code{array_contains}: Returns null if the array is null, true if the array contains
3292 #' the value, and false otherwise.
3293 #'
3294 #' @rdname column_collection_functions
3295 #' @aliases array_contains array_contains,Column-method
3296 #' @note array_contains since 1.6.0
setMethod("array_contains",
          signature(x = "Column", value = "ANY"),
          function(x, value) {
            # value is passed through to the JVM unchanged.
            column(callJStatic("org.apache.spark.sql.functions", "array_contains",
                               x@jc, value))
          })
3303
3304 #' @details
3305 #' \code{array_distinct}: Removes duplicate values from the array.
3306 #'
3307 #' @rdname column_collection_functions
3308 #' @aliases array_distinct array_distinct,Column-method
3309 #' @note array_distinct since 2.4.0
setMethod("array_distinct",
          signature(x = "Column"),
          function(x) {
            column(callJStatic("org.apache.spark.sql.functions", "array_distinct",
                               x@jc))
          })
3316
3317 #' @details
3318 #' \code{array_except}: Returns an array of the elements in the first array but not in the second
3319 #' array, without duplicates. The order of elements in the result is not determined.
3320 #'
3321 #' @rdname column_collection_functions
3322 #' @aliases array_except array_except,Column-method
3323 #' @note array_except since 2.4.0
setMethod("array_except",
          signature(x = "Column", y = "Column"),
          function(x, y) {
            column(callJStatic("org.apache.spark.sql.functions", "array_except",
                               x@jc, y@jc))
          })
3330
3331 #' @details
3332 #' \code{array_intersect}: Returns an array of the elements in the intersection of the given two
3333 #' arrays, without duplicates.
3334 #'
3335 #' @rdname column_collection_functions
3336 #' @aliases array_intersect array_intersect,Column-method
3337 #' @note array_intersect since 2.4.0
setMethod("array_intersect",
          signature(x = "Column", y = "Column"),
          function(x, y) {
            column(callJStatic("org.apache.spark.sql.functions", "array_intersect",
                               x@jc, y@jc))
          })
3344
3345 #' @details
3346 #' \code{array_join}: Concatenates the elements of column using the delimiter.
3347 #' Null values are replaced with nullReplacement if set, otherwise they are ignored.
3348 #'
3349 #' @param delimiter a character string that is used to concatenate the elements of column.
3350 #' @param nullReplacement an optional character string that is used to replace the Null values.
3351 #' @rdname column_collection_functions
3352 #' @aliases array_join array_join,Column-method
3353 #' @note array_join since 2.4.0
setMethod("array_join",
          signature(x = "Column", delimiter = "character"),
          function(x, delimiter, nullReplacement = NULL) {
            # Only forward nullReplacement when the caller supplied one;
            # otherwise use the two-argument JVM overload (nulls are ignored).
            if (is.null(nullReplacement)) {
              jc <- callJStatic("org.apache.spark.sql.functions", "array_join",
                                x@jc, delimiter)
            } else {
              jc <- callJStatic("org.apache.spark.sql.functions", "array_join",
                                x@jc, delimiter, as.character(nullReplacement))
            }
            column(jc)
          })
3365
3366 #' @details
3367 #' \code{array_max}: Returns the maximum value of the array.
3368 #'
3369 #' @rdname column_collection_functions
3370 #' @aliases array_max array_max,Column-method
3371 #' @note array_max since 2.4.0
setMethod("array_max",
          signature(x = "Column"),
          function(x) {
            column(callJStatic("org.apache.spark.sql.functions", "array_max", x@jc))
          })
3378
3379 #' @details
3380 #' \code{array_min}: Returns the minimum value of the array.
3381 #'
3382 #' @rdname column_collection_functions
3383 #' @aliases array_min array_min,Column-method
3384 #' @note array_min since 2.4.0
setMethod("array_min",
          signature(x = "Column"),
          function(x) {
            column(callJStatic("org.apache.spark.sql.functions", "array_min", x@jc))
          })
3391
3392 #' @details
3393 #' \code{array_position}: Locates the position of the first occurrence of the given value
3394 #' in the given array. Returns NA if either of the arguments are NA.
3395 #' Note: The position is not zero based, but 1 based index. Returns 0 if the given
3396 #' value could not be found in the array.
3397 #'
3398 #' @rdname column_collection_functions
3399 #' @aliases array_position array_position,Column-method
3400 #' @note array_position since 2.4.0
setMethod("array_position",
          signature(x = "Column", value = "ANY"),
          function(x, value) {
            # value is passed through to the JVM unchanged.
            column(callJStatic("org.apache.spark.sql.functions", "array_position",
                               x@jc, value))
          })
3407
3408 #' @details
3409 #' \code{array_remove}: Removes all elements that equal to element from the given array.
3410 #'
3411 #' @rdname column_collection_functions
3412 #' @aliases array_remove array_remove,Column-method
3413 #' @note array_remove since 2.4.0
setMethod("array_remove",
          signature(x = "Column", value = "ANY"),
          function(x, value) {
            # value is passed through to the JVM unchanged.
            column(callJStatic("org.apache.spark.sql.functions", "array_remove",
                               x@jc, value))
          })
3420
3421 #' @details
3422 #' \code{array_repeat}: Creates an array containing \code{x} repeated the number of times
3423 #' given by \code{count}.
3424 #'
3425 #' @param count a Column or constant determining the number of repetitions.
3426 #' @rdname column_collection_functions
3427 #' @aliases array_repeat array_repeat,Column,numericOrColumn-method
3428 #' @note array_repeat since 2.4.0
setMethod("array_repeat",
          signature(x = "Column", count = "numericOrColumn"),
          function(x, count) {
            # count may be a Column or a plain R number; numbers are truncated
            # to integer. inherits() replaces the fragile class(count) == "Column"
            # check, which breaks when class() returns a vector of length > 1.
            jcount <- if (inherits(count, "Column")) {
              count@jc
            } else {
              as.integer(count)
            }
            column(callJStatic("org.apache.spark.sql.functions", "array_repeat",
                               x@jc, jcount))
          })
3440
3441 #' @details
3442 #' \code{array_sort}: Sorts the input array in ascending order. The elements of the input array
3443 #' must be orderable. NA elements will be placed at the end of the returned array.
3444 #'
3445 #' @rdname column_collection_functions
3446 #' @aliases array_sort array_sort,Column-method
3447 #' @note array_sort since 2.4.0
setMethod("array_sort",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side array_sort on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "array_sort", x@jc))
          })
3454
3455 #' @details
3456 #' \code{arrays_overlap}: Returns true if the input arrays have at least one non-null element in
3457 #' common. If not and both arrays are non-empty and any of them contains a null, it returns null.
3458 #' It returns false otherwise.
3459 #'
3460 #' @rdname column_collection_functions
3461 #' @aliases arrays_overlap arrays_overlap,Column-method
3462 #' @note arrays_overlap since 2.4.0
setMethod("arrays_overlap",
          signature(x = "Column", y = "Column"),
          function(x, y) {
            # Pass both columns' Java references to the JVM-side arrays_overlap.
            column(callJStatic("org.apache.spark.sql.functions", "arrays_overlap", x@jc, y@jc))
          })
3469
3470 #' @details
3471 #' \code{array_union}: Returns an array of the elements in the union of the given two arrays,
3472 #' without duplicates.
3473 #'
3474 #' @rdname column_collection_functions
3475 #' @aliases array_union array_union,Column-method
3476 #' @note array_union since 2.4.0
setMethod("array_union",
          signature(x = "Column", y = "Column"),
          function(x, y) {
            # Pass both columns' Java references to the JVM-side array_union.
            column(callJStatic("org.apache.spark.sql.functions", "array_union", x@jc, y@jc))
          })
3483
3484 #' @details
3485 #' \code{arrays_zip}: Returns a merged array of structs in which the N-th struct contains all N-th
3486 #' values of input arrays.
3487 #'
3488 #' @rdname column_collection_functions
3489 #' @aliases arrays_zip arrays_zip,Column-method
3490 #' @note arrays_zip since 2.4.0
setMethod("arrays_zip",
          signature(x = "Column"),
          function(x, ...) {
            # Validate with inherits() rather than class(arg) == "Column": the
            # "==" comparison is fragile for objects with multiple classes.
            jcols <- lapply(list(x, ...), function(arg) {
              stopifnot(inherits(arg, "Column"))
              arg@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "arrays_zip", jcols)
            column(jc)
          })
3501
3502 #' @details
3503 #' \code{shuffle}: Returns a random permutation of the given array.
3504 #'
3505 #' @rdname column_collection_functions
3506 #' @aliases shuffle shuffle,Column-method
3507 #' @note shuffle since 2.4.0
setMethod("shuffle",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side shuffle on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "shuffle", x@jc))
          })
3514
3515 #' @details
3516 #' \code{flatten}: Creates a single array from an array of arrays.
3517 #' If a structure of nested arrays is deeper than two levels, only one level of nesting is removed.
3518 #'
3519 #' @rdname column_collection_functions
3520 #' @aliases flatten flatten,Column-method
3521 #' @note flatten since 2.4.0
setMethod("flatten",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side flatten on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "flatten", x@jc))
          })
3528
3529 #' @details
3530 #' \code{map_concat}: Returns the union of all the given maps.
3531 #'
3532 #' @rdname column_collection_functions
3533 #' @aliases map_concat map_concat,Column-method
3534 #' @note map_concat since 3.0.0
setMethod("map_concat",
          signature(x = "Column"),
          function(x, ...) {
            # Validate with inherits() rather than class(arg) == "Column": the
            # "==" comparison is fragile for objects with multiple classes.
            jcols <- lapply(list(x, ...), function(arg) {
              stopifnot(inherits(arg, "Column"))
              arg@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "map_concat", jcols)
            column(jc)
          })
3545
3546 #' @details
3547 #' \code{map_entries}: Returns an unordered array of all entries in the given map.
3548 #'
3549 #' @rdname column_collection_functions
3550 #' @aliases map_entries map_entries,Column-method
3551 #' @note map_entries since 3.0.0
setMethod("map_entries",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side map_entries on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "map_entries", x@jc))
          })
3558
3559 #' @details
3560 #' \code{map_from_arrays}: Creates a new map column. The array in the first column is used for
3561 #' keys. The array in the second column is used for values. All elements in the array for key
3562 #' should not be null.
3563 #'
3564 #' @rdname column_collection_functions
3565 #' @aliases map_from_arrays map_from_arrays,Column-method
3566 #' @note map_from_arrays since 2.4.0
setMethod("map_from_arrays",
          signature(x = "Column", y = "Column"),
          function(x, y) {
            # x supplies the keys array, y the values array.
            column(callJStatic("org.apache.spark.sql.functions", "map_from_arrays", x@jc, y@jc))
          })
3573
3574 #' @details
3575 #' \code{map_from_entries}: Returns a map created from the given array of entries.
3576 #'
3577 #' @rdname column_collection_functions
3578 #' @aliases map_from_entries map_from_entries,Column-method
3579 #' @note map_from_entries since 3.0.0
setMethod("map_from_entries",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side map_from_entries on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "map_from_entries", x@jc))
          })
3586
3587 #' @details
3588 #' \code{map_keys}: Returns an unordered array containing the keys of the map.
3589 #'
3590 #' @rdname column_collection_functions
3591 #' @aliases map_keys map_keys,Column-method
3592 #' @note map_keys since 2.3.0
setMethod("map_keys",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side map_keys on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "map_keys", x@jc))
          })
3599
3600 #' @details
3601 #' \code{map_values}: Returns an unordered array containing the values of the map.
3602 #'
3603 #' @rdname column_collection_functions
3604 #' @aliases map_values map_values,Column-method
3605 #' @note map_values since 2.3.0
setMethod("map_values",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side map_values on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "map_values", x@jc))
          })
3612
3613 #' @details
3614 #' \code{element_at}: Returns element of array at given index in \code{extraction} if
3615 #' \code{x} is array. Returns value for the given key in \code{extraction} if \code{x} is map.
3616 #' Note: The position is not zero based, but 1 based index.
3617 #'
3618 #' @param extraction index to check for in array or key to check for in map
3619 #' @rdname column_collection_functions
3620 #' @aliases element_at element_at,Column-method
3621 #' @note element_at since 2.4.0
setMethod("element_at",
          signature(x = "Column", extraction = "ANY"),
          function(x, extraction) {
            # extraction is an array index (1-based) or a map key, forwarded as-is.
            column(callJStatic("org.apache.spark.sql.functions", "element_at", x@jc, extraction))
          })
3628
3629 #' @details
3630 #' \code{explode}: Creates a new row for each element in the given array or map column.
3631 #' Uses the default column name \code{col} for elements in the array and
3632 #' \code{key} and \code{value} for elements in the map unless specified otherwise.
3633 #'
3634 #' @rdname column_collection_functions
3635 #' @aliases explode explode,Column-method
3636 #' @note explode since 1.5.0
setMethod("explode",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side explode on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "explode", x@jc))
          })
3643
3644 #' @details
3645 #' \code{size}: Returns length of array or map.
3646 #'
3647 #' @rdname column_collection_functions
3648 #' @aliases size size,Column-method
3649 #' @note size since 1.5.0
setMethod("size",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side size on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "size", x@jc))
          })
3656
3657 #' @details
3658 #' \code{slice}: Returns an array containing all the elements in x from the index start
3659 #' (array indices start at 1, or from the end if start is negative) with the specified length.
3660 #'
3661 #' @rdname column_collection_functions
3662 #' @param start the starting index
3663 #' @param length the length of the slice
3664 #' @aliases slice slice,Column-method
3665 #' @note slice since 2.4.0
setMethod("slice",
          signature(x = "Column"),
          function(x, start, length) {
            # Coerce start/length to integer: the Scala signature is
            # slice(Column, Int, Int), and sibling wrappers in this file
            # (split_string, repeat_string) apply the same coercion before
            # crossing into the JVM.
            jc <- callJStatic("org.apache.spark.sql.functions", "slice",
                              x@jc, as.integer(start), as.integer(length))
            column(jc)
          })
3672
3673 #' @details
3674 #' \code{sort_array}: Sorts the input array in ascending or descending order according to
3675 #' the natural ordering of the array elements. NA elements will be placed at the beginning of
3676 #' the returned array in ascending order or at the end of the returned array in descending order.
3677 #'
3678 #' @rdname column_collection_functions
3679 #' @param asc a logical flag indicating the sorting order.
3680 #' TRUE, sorting is in ascending order.
3681 #' FALSE, sorting is in descending order.
3682 #' @aliases sort_array sort_array,Column-method
3683 #' @note sort_array since 1.6.0
setMethod("sort_array",
          signature(x = "Column"),
          function(x, asc = TRUE) {
            # asc is a logical flag passed straight through to the JVM side.
            column(callJStatic("org.apache.spark.sql.functions", "sort_array", x@jc, asc))
          })
3690
3691 #' @details
3692 #' \code{posexplode}: Creates a new row for each element with position in the given array
3693 #' or map column. Uses the default column name \code{pos} for position, and \code{col}
3694 #' for elements in the array and \code{key} and \code{value} for elements in the map
3695 #' unless specified otherwise.
3696 #'
3697 #' @rdname column_collection_functions
3698 #' @aliases posexplode posexplode,Column-method
3699 #' @note posexplode since 2.1.0
setMethod("posexplode",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side posexplode on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "posexplode", x@jc))
          })
3706
3707 #' @details
3708 #' \code{create_array}: Creates a new array column. The input columns must all have the same data
3709 #' type.
3710 #'
3711 #' @rdname column_nonaggregate_functions
3712 #' @aliases create_array create_array,Column-method
3713 #' @note create_array since 2.3.0
setMethod("create_array",
          signature(x = "Column"),
          function(x, ...) {
            # Validate with inherits() rather than class(col) == "Column": the
            # "==" comparison is fragile for objects with multiple classes.
            # Also avoid shadowing the outer x inside the lapply closure.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            # The JVM-side function is named "array".
            jc <- callJStatic("org.apache.spark.sql.functions", "array", jcols)
            column(jc)
          })
3724
3725 #' @details
3726 #' \code{create_map}: Creates a new map column. The input columns must be grouped as key-value
3727 #' pairs, e.g. (key1, value1, key2, value2, ...).
3728 #' The key columns must all have the same data type, and can't be null.
3729 #' The value columns must all have the same data type.
3730 #'
3731 #' @rdname column_nonaggregate_functions
3732 #' @aliases create_map create_map,Column-method
3733 #' @note create_map since 2.3.0
setMethod("create_map",
          signature(x = "Column"),
          function(x, ...) {
            # Validate with inherits() rather than class(col) == "Column": the
            # "==" comparison is fragile for objects with multiple classes.
            # Also avoid shadowing the outer x inside the lapply closure.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            # The JVM-side function is named "map".
            jc <- callJStatic("org.apache.spark.sql.functions", "map", jcols)
            column(jc)
          })
3744
3745 #' @details
3746 #' \code{collect_list}: Creates a list of objects with duplicates.
3747 #' Note: the function is non-deterministic because the order of collected results depends
3748 #' on the order of the rows which may be non-deterministic after a shuffle.
3749 #'
3750 #' @rdname column_aggregate_functions
3751 #' @aliases collect_list collect_list,Column-method
3752 #' @examples
3753 #'
3754 #' \dontrun{
#' df2 <- df[df$mpg > 20, ]
3756 #' collect(select(df2, collect_list(df2$gear)))
3757 #' collect(select(df2, collect_set(df2$gear)))}
3758 #' @note collect_list since 2.3.0
setMethod("collect_list",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side collect_list on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "collect_list", x@jc))
          })
3765
3766 #' @details
3767 #' \code{collect_set}: Creates a list of objects with duplicate elements eliminated.
3768 #' Note: the function is non-deterministic because the order of collected results depends
3769 #' on the order of the rows which may be non-deterministic after a shuffle.
3770 #'
3771 #' @rdname column_aggregate_functions
3772 #' @aliases collect_set collect_set,Column-method
3773 #' @note collect_set since 2.3.0
setMethod("collect_set",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side collect_set on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "collect_set", x@jc))
          })
3780
3781 #' @details
3782 #' \code{split_string}: Splits string on regular expression.
3783 #' Equivalent to \code{split} SQL function. Optionally a
3784 #' \code{limit} can be specified
3785 #'
3786 #' @rdname column_string_functions
3787 #' @param limit determines the length of the returned array.
3788 #' \itemize{
3789 #' \item \code{limit > 0}: length of the array will be at most \code{limit}
3790 #' \item \code{limit <= 0}: the returned array can have any length
3791 #' }
3792 #'
3793 #' @aliases split_string split_string,Column-method
3794 #' @examples
3795 #'
3796 #' \dontrun{
3797 #' head(select(df, split_string(df$Class, "\\d", 2)))
3798 #' head(select(df, split_string(df$Sex, "a")))
3799 #' head(select(df, split_string(df$Class, "\\d")))
3800 #' # This is equivalent to the following SQL expression
3801 #' head(selectExpr(df, "split(Class, '\\\\d')"))}
#' @note split_string since 2.3.0
setMethod("split_string",
          signature(x = "Column", pattern = "character"),
          function(x, pattern, limit = -1) {
            # The JVM-side function is named "split"; limit must be an integer.
            column(callJStatic("org.apache.spark.sql.functions", "split",
                               x@jc, pattern, as.integer(limit)))
          })
3810
3811 #' @details
3812 #' \code{repeat_string}: Repeats string n times.
3813 #' Equivalent to \code{repeat} SQL function.
3814 #'
3815 #' @param n number of repetitions.
3816 #' @rdname column_string_functions
3817 #' @aliases repeat_string repeat_string,Column-method
3818 #' @examples
3819 #'
3820 #' \dontrun{
3821 #' head(select(df, repeat_string(df$Class, 3)))
3822 #' # This is equivalent to the following SQL expression
3823 #' head(selectExpr(df, "repeat(Class, 3)"))}
3824 #' @note repeat_string since 2.3.0
setMethod("repeat_string",
          signature(x = "Column", n = "numeric"),
          function(x, n) {
            # The JVM-side function is named "repeat"; n is coerced via numToInt.
            column(callJStatic("org.apache.spark.sql.functions", "repeat", x@jc, numToInt(n)))
          })
3831
3832 #' @details
#' \code{explode_outer}: Creates a new row for each element in the given array or map column.
3834 #' Unlike \code{explode}, if the array/map is \code{null} or empty
3835 #' then \code{null} is produced.
3836 #' Uses the default column name \code{col} for elements in the array and
3837 #' \code{key} and \code{value} for elements in the map unless specified otherwise.
3838 #'
3839 #' @rdname column_collection_functions
3840 #' @aliases explode_outer explode_outer,Column-method
3841 #' @examples
3842 #'
3843 #' \dontrun{
3844 #' df2 <- createDataFrame(data.frame(
3845 #' id = c(1, 2, 3), text = c("a,b,c", NA, "d,e")
3846 #' ))
3847 #'
3848 #' head(select(df2, df2$id, explode_outer(split_string(df2$text, ","))))
3849 #' head(select(df2, df2$id, posexplode_outer(split_string(df2$text, ","))))}
3850 #' @note explode_outer since 2.3.0
setMethod("explode_outer",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side explode_outer on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "explode_outer", x@jc))
          })
3857
3858 #' @details
3859 #' \code{posexplode_outer}: Creates a new row for each element with position in the given
3860 #' array or map column. Unlike \code{posexplode}, if the array/map is \code{null} or empty
3861 #' then the row (\code{null}, \code{null}) is produced.
3862 #' Uses the default column name \code{pos} for position, and \code{col}
3863 #' for elements in the array and \code{key} and \code{value} for elements in the map
3864 #' unless specified otherwise.
3865 #'
3866 #' @rdname column_collection_functions
3867 #' @aliases posexplode_outer posexplode_outer,Column-method
3868 #' @note posexplode_outer since 2.3.0
setMethod("posexplode_outer",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side posexplode_outer on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "posexplode_outer", x@jc))
          })
3875
3876 #' not
3877 #'
3878 #' Inversion of boolean expression.
3879 #'
3880 #' \code{not} and \code{!} cannot be applied directly to numerical column.
#' To achieve R-like truthiness, the column has to be cast to \code{BooleanType}.
3882 #'
3883 #' @param x Column to compute on
3884 #' @rdname not
3885 #' @name not
3886 #' @aliases not,Column-method
3887 #' @family non-aggregate functions
3888 #' @examples
3889 #' \dontrun{
3890 #' df <- createDataFrame(data.frame(
3891 #' is_true = c(TRUE, FALSE, NA),
3892 #' flag = c(1, 0, 1)
3893 #' ))
3894 #'
3895 #' head(select(df, not(df$is_true)))
3896 #'
3897 #' # Explicit cast is required when working with numeric column
3898 #' head(select(df, not(cast(df$flag, "boolean"))))
3899 #' }
3900 #' @note not since 2.3.0
setMethod("not",
          signature(x = "Column"),
          function(x) {
            # Delegate to the JVM-side not on the column's Java reference.
            column(callJStatic("org.apache.spark.sql.functions", "not", x@jc))
          })
3907
3908 #' @details
3909 #' \code{grouping_bit}: Indicates whether a specified column in a GROUP BY list is aggregated or
3910 #' not, returns 1 for aggregated or 0 for not aggregated in the result set. Same as \code{GROUPING}
3911 #' in SQL and \code{grouping} function in Scala.
3912 #'
3913 #' @rdname column_aggregate_functions
3914 #' @aliases grouping_bit grouping_bit,Column-method
3915 #' @examples
3916 #'
3917 #' \dontrun{
3918 #' # With cube
3919 #' agg(
3920 #' cube(df, "cyl", "gear", "am"),
3921 #' mean(df$mpg),
3922 #' grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am)
3923 #' )
3924 #'
3925 #' # With rollup
3926 #' agg(
3927 #' rollup(df, "cyl", "gear", "am"),
3928 #' mean(df$mpg),
3929 #' grouping_bit(df$cyl), grouping_bit(df$gear), grouping_bit(df$am)
3930 #' )}
3931 #' @note grouping_bit since 2.3.0
setMethod("grouping_bit",
          signature(x = "Column"),
          function(x) {
            # The JVM-side function is named "grouping" (grouping_bit is the R alias).
            column(callJStatic("org.apache.spark.sql.functions", "grouping", x@jc))
          })
3938
3939 #' @details
3940 #' \code{grouping_id}: Returns the level of grouping.
3941 #' Equals to \code{
3942 #' grouping_bit(c1) * 2^(n - 1) + grouping_bit(c2) * 2^(n - 2) + ... + grouping_bit(cn)
3943 #' }.
3944 #'
3945 #' @rdname column_aggregate_functions
3946 #' @aliases grouping_id grouping_id,Column-method
3947 #' @examples
3948 #'
3949 #' \dontrun{
3950 #' # With cube
3951 #' agg(
3952 #' cube(df, "cyl", "gear", "am"),
3953 #' mean(df$mpg),
3954 #' grouping_id(df$cyl, df$gear, df$am)
3955 #' )
3956 #'
3957 #' # With rollup
3958 #' agg(
3959 #' rollup(df, "cyl", "gear", "am"),
3960 #' mean(df$mpg),
3961 #' grouping_id(df$cyl, df$gear, df$am)
3962 #' )}
3963 #' @note grouping_id since 2.3.0
setMethod("grouping_id",
          signature(x = "Column"),
          function(x, ...) {
            # Validate with inherits() rather than class(col) == "Column": the
            # "==" comparison is fragile for objects with multiple classes.
            # Also avoid shadowing the outer x inside the lapply closure.
            jcols <- lapply(list(x, ...), function(col) {
              stopifnot(inherits(col, "Column"))
              col@jc
            })
            jc <- callJStatic("org.apache.spark.sql.functions", "grouping_id", jcols)
            column(jc)
          })
3974
3975 #' @details
3976 #' \code{input_file_name}: Creates a string column with the input file name for a given row.
3977 #' The method should be used with no argument.
3978 #'
3979 #' @rdname column_nonaggregate_functions
3980 #' @aliases input_file_name input_file_name,missing-method
3981 #' @examples
3982 #'
3983 #' \dontrun{
3984 #' tmp <- read.text("README.md")
3985 #' head(select(tmp, input_file_name()))}
3986 #' @note input_file_name since 2.3.0
setMethod("input_file_name", signature("missing"),
          function() {
            # No arguments: the JVM side resolves the file name per row.
            column(callJStatic("org.apache.spark.sql.functions", "input_file_name"))
          })
3992
3993 #' @details
3994 #' \code{trunc}: Returns date truncated to the unit specified by the format.
3995 #'
3996 #' @rdname column_datetime_functions
3997 #' @aliases trunc trunc,Column-method
3998 #' @examples
3999 #'
4000 #' \dontrun{
4001 #' head(select(df, df$time, trunc(df$time, "year"), trunc(df$time, "yy"),
4002 #' trunc(df$time, "month"), trunc(df$time, "mon")))}
4003 #' @note trunc since 2.3.0
setMethod("trunc",
          signature(x = "Column"),
          function(x, format) {
            # format is coerced to character before crossing into the JVM.
            column(callJStatic("org.apache.spark.sql.functions", "trunc",
                               x@jc, as.character(format)))
          })
4011
4012 #' @details
4013 #' \code{date_trunc}: Returns timestamp truncated to the unit specified by the format.
4014 #'
4015 #' @rdname column_datetime_functions
4016 #' @aliases date_trunc date_trunc,character,Column-method
4017 #' @examples
4018 #'
4019 #' \dontrun{
4020 #' head(select(df, df$time, date_trunc("hour", df$time), date_trunc("minute", df$time),
4021 #' date_trunc("week", df$time), date_trunc("quarter", df$time)))}
4022 #' @note date_trunc since 2.3.0
setMethod("date_trunc",
          signature(format = "character", x = "Column"),
          function(format, x) {
            # Note the argument order: format first, then the column, matching the JVM side.
            column(callJStatic("org.apache.spark.sql.functions", "date_trunc", format, x@jc))
          })
4029
4030 #' @details
4031 #' \code{current_date}: Returns the current date as a date column.
4032 #'
4033 #' @rdname column_datetime_functions
4034 #' @aliases current_date current_date,missing-method
4035 #' @examples
4036 #' \dontrun{
4037 #' head(select(df, current_date(), current_timestamp()))}
4038 #' @note current_date since 2.3.0
setMethod("current_date",
          signature("missing"),
          function() {
            # No arguments: the JVM side supplies the current date.
            column(callJStatic("org.apache.spark.sql.functions", "current_date"))
          })
4045
4046 #' @details
4047 #' \code{current_timestamp}: Returns the current timestamp as a timestamp column.
4048 #'
4049 #' @rdname column_datetime_functions
4050 #' @aliases current_timestamp current_timestamp,missing-method
4051 #' @note current_timestamp since 2.3.0
setMethod("current_timestamp",
          signature("missing"),
          function() {
            # No arguments: the JVM side supplies the current timestamp.
            column(callJStatic("org.apache.spark.sql.functions", "current_timestamp"))
          })