
Commit a8a6a83

Merge remote-tracking branch 'apache-github/master' into SPARK-16406
2 parents: b4c8cdb + 44c7c62

323 files changed: 11,547 additions, 2,850 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -77,3 +77,4 @@ spark-warehouse/
 # For R session data
 .RData
 .RHistory
+.Rhistory

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions
@@ -69,6 +69,7 @@ exportMethods("arrange",
               "first",
               "freqItems",
               "gapply",
+              "gapplyCollect",
               "group_by",
               "groupBy",
               "head",
@@ -234,6 +235,7 @@ exportMethods("%in%",
               "over",
               "percent_rank",
               "pmod",
+              "posexplode",
               "quarter",
               "rand",
               "randn",

R/pkg/R/DataFrame.R

Lines changed: 112 additions & 14 deletions
@@ -176,8 +176,8 @@ setMethod("isLocal",
 #' @param x A SparkDataFrame
 #' @param numRows The number of rows to print. Defaults to 20.
 #' @param truncate Whether truncate long strings. If true, strings more than 20 characters will be
-#'                 truncated and all cells will be aligned right
-#'
+#'                 truncated. However, if set greater than zero, truncates strings longer than `truncate`
+#'                 characters and all cells will be aligned right.
 #' @family SparkDataFrame functions
 #' @rdname showDF
 #' @name showDF
@@ -193,7 +193,12 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "SparkDataFrame"),
           function(x, numRows = 20, truncate = TRUE) {
-            s <- callJMethod(x@sdf, "showString", numToInt(numRows), truncate)
+            if (is.logical(truncate) && truncate) {
+              s <- callJMethod(x@sdf, "showString", numToInt(numRows), numToInt(20))
+            } else {
+              truncate2 <- as.numeric(truncate)
+              s <- callJMethod(x@sdf, "showString", numToInt(numRows), numToInt(truncate2))
+            }
             cat(s)
           })
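A minimal sketch of the new truncate behavior (assumes a running SparkR session started with sparkR.session(); the data here is hypothetical):

df <- createDataFrame(data.frame(s = c("short",
                                       "a string well over twenty characters"),
                                 stringsAsFactors = FALSE))
showDF(df)                    # truncate = TRUE: cells cut at 20 characters
showDF(df, truncate = 3)      # numeric: cells cut at 3 characters, right-aligned
showDF(df, truncate = FALSE)  # as.numeric(FALSE) is 0, i.e. no truncation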

@@ -466,7 +471,7 @@ setMethod("createOrReplaceTempView",
 
 #' (Deprecated) Register Temporary Table
 #'
-#' Registers a SparkDataFrame as a Temporary Table in the SQLContext
+#' Registers a SparkDataFrame as a Temporary Table in the SparkSession
 #' @param x A SparkDataFrame
 #' @param tableName A character vector containing the name of the table
 #'
@@ -493,7 +498,7 @@ setMethod("registerTempTable",
 
 #' insertInto
 #'
-#' Insert the contents of a SparkDataFrame into a table registered in the current SQL Context.
+#' Insert the contents of a SparkDataFrame into a table registered in the current SparkSession.
 #'
 #' @param x A SparkDataFrame
 #' @param tableName A character vector containing the name of the table
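A short usage sketch of the two APIs whose docs changed above (hedged: assumes a SparkR session and that a saved table "people" already exists, e.g. one created with saveAsTable; all names here are hypothetical):

df <- createDataFrame(data.frame(name = c("Ann", "Bob"), age = c(28L, 35L)))
createOrReplaceTempView(df, "people_view")   # the replacement for registerTempTable
insertInto(df, "people", overwrite = FALSE)  # appends into the existing saved table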
@@ -1339,7 +1344,7 @@ setMethod("dapplyCollect",
 
 #' gapply
 #'
-#' Group the SparkDataFrame using the specified columns and apply the R function to each
+#' Groups the SparkDataFrame using the specified columns and applies the R function to each
 #' group.
 #'
 #' @param x A SparkDataFrame
@@ -1351,9 +1356,11 @@ setMethod("dapplyCollect",
 #' @param schema The schema of the resulting SparkDataFrame after the function is applied.
 #'               The schema must match to output of `func`. It has to be defined for each
 #'               output column with preferred output column name and corresponding data type.
+#' @return a SparkDataFrame
 #' @family SparkDataFrame functions
 #' @rdname gapply
 #' @name gapply
+#' @seealso \link{gapplyCollect}
 #' @export
 #' @examples
 #'
@@ -1369,14 +1376,22 @@ setMethod("dapplyCollect",
 #' columns with data types integer and string and the mean which is a double.
 #' schema <- structType(structField("a", "integer"), structField("c", "string"),
 #'   structField("avg", "double"))
-#' df1 <- gapply(
+#' result <- gapply(
 #'   df,
-#'   list("a", "c"),
+#'   c("a", "c"),
 #'   function(key, x) {
 #'     y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
-#'   },
-#'   schema)
-#' collect(df1)
+#'   }, schema)
+#'
+#' We can also group the data and afterwards call gapply on GroupedData.
+#' For Example:
+#' gdf <- group_by(df, "a", "c")
+#' result <- gapply(
+#'   gdf,
+#'   function(key, x) {
+#'     y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
+#'   }, schema)
+#' collect(result)
 #'
 #' Result
 #' ------
@@ -1394,16 +1409,16 @@ setMethod("dapplyCollect",
 #'   structField("Petal_Width", "double"))
 #' df1 <- gapply(
 #'   df,
-#'   list(df$"Species"),
+#'   df$"Species",
 #'   function(key, x) {
 #'     m <- suppressWarnings(lm(Sepal_Length ~
 #'     Sepal_Width + Petal_Length + Petal_Width, x))
 #'     data.frame(t(coef(m)))
 #'   }, schema)
 #' collect(df1)
 #'
-#'Result
-#'---------
+#' Result
+#' ---------
 #' Model  (Intercept)  Sepal_Width  Petal_Length  Petal_Width
 #' 1        0.699883    0.3303370    0.9455356    -0.1697527
 #' 2        1.895540    0.3868576    0.9083370    -0.6792238
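The roxygen example above, condensed into a standalone runnable sketch (assumes a SparkR session); note the grouping columns are now passed as c("a", "c") rather than list("a", "c"):

df <- createDataFrame(
  list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
  c("a", "b", "c", "d"))
schema <- structType(structField("a", "integer"), structField("c", "string"),
                     structField("avg", "double"))
result <- gapply(
  df,
  c("a", "c"),
  function(key, x) {
    data.frame(key, mean(x$b), stringsAsFactors = FALSE)
  }, schema)
collect(result)  # rows (3, "3", 3.0) and (1, "1", 1.5)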
@@ -1418,6 +1433,89 @@ setMethod("gapply",
             gapply(grouped, func, schema)
           })
 
+#' gapplyCollect
+#'
+#' Groups the SparkDataFrame using the specified columns, applies the R function to each
+#' group and collects the result back to R as data.frame.
+#'
+#' @param x A SparkDataFrame
+#' @param cols Grouping columns
+#' @param func A function to be applied to each group partition specified by grouping
+#'             column of the SparkDataFrame. The function `func` takes as argument
+#'             a key - grouping columns and a data frame - a local R data.frame.
+#'             The output of `func` is a local R data.frame.
+#' @return a data.frame
+#' @family SparkDataFrame functions
+#' @rdname gapplyCollect
+#' @name gapplyCollect
+#' @seealso \link{gapply}
+#' @export
+#' @examples
+#'
+#' \dontrun{
+#' Computes the arithmetic mean of the second column by grouping
+#' on the first and third columns. Output the grouping values and the average.
+#'
+#' df <- createDataFrame (
+#' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
+#'   c("a", "b", "c", "d"))
+#'
+#' result <- gapplyCollect(
+#'   df,
+#'   c("a", "c"),
+#'   function(key, x) {
+#'     y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
+#'     colnames(y) <- c("key_a", "key_c", "mean_b")
+#'     y
+#'   })
+#'
+#' We can also group the data and afterwards call gapply on GroupedData.
+#' For Example:
+#' gdf <- group_by(df, "a", "c")
+#' result <- gapplyCollect(
+#'   gdf,
+#'   function(key, x) {
+#'     y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
+#'     colnames(y) <- c("key_a", "key_c", "mean_b")
+#'     y
+#'   })
+#'
+#' Result
+#' ------
+#' key_a key_c mean_b
+#' 3     3     3.0
+#' 1     1     1.5
+#'
+#' Fits linear models on iris dataset by grouping on the 'Species' column and
+#' using 'Sepal_Length' as a target variable, 'Sepal_Width', 'Petal_Length'
+#' and 'Petal_Width' as training features.
+#'
+#' df <- createDataFrame (iris)
+#' result <- gapplyCollect(
+#'   df,
+#'   df$"Species",
+#'   function(key, x) {
+#'     m <- suppressWarnings(lm(Sepal_Length ~
+#'     Sepal_Width + Petal_Length + Petal_Width, x))
+#'     data.frame(t(coef(m)))
+#'   })
+#'
+#' Result
+#'---------
+#' Model  X.Intercept.  Sepal_Width  Petal_Length  Petal_Width
+#' 1        0.699883    0.3303370    0.9455356    -0.1697527
+#' 2        1.895540    0.3868576    0.9083370    -0.6792238
+#' 3        2.351890    0.6548350    0.2375602     0.2521257
+#'
+#'}
+#' @note gapplyCollect(SparkDataFrame) since 2.0.0
+setMethod("gapplyCollect",
+          signature(x = "SparkDataFrame"),
+          function(x, cols, func) {
+            grouped <- do.call("groupBy", c(x, cols))
+            gapplyCollect(grouped, func)
+          })
+
 ############################## RDD Map Functions ##################################
 # All of the following functions mirror the existing RDD map functions,           #
 # but allow for use with DataFrames by first converting to an RRDD before calling #
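For contrast with gapply, a sketch of what the new method saves the caller (reusing the hypothetical df from the sketch above): no schema argument, because each group's data.frame is serialized back to the driver and rbind-ed there:

means <- gapplyCollect(
  df,
  c("a", "c"),
  function(key, x) {
    y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
    colnames(y) <- c("key_a", "key_c", "mean_b")
    y
  })
class(means)  # "data.frame" - collected locally, no output schema required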

R/pkg/R/functions.R

Lines changed: 17 additions & 0 deletions
@@ -2934,3 +2934,20 @@ setMethod("sort_array",
             jc <- callJStatic("org.apache.spark.sql.functions", "sort_array", x@jc, asc)
             column(jc)
           })
+
+#' posexplode
+#'
+#' Creates a new row for each element with position in the given array or map column.
+#'
+#' @rdname posexplode
+#' @name posexplode
+#' @family collection_funcs
+#' @export
+#' @examples \dontrun{posexplode(df$c)}
+#' @note posexplode since 2.1.0
+setMethod("posexplode",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "posexplode", x@jc)
+            column(jc)
+          })
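A sketch of what posexplode produces (assumes a SparkR session; expr() is used here to build an array column, since df$c in the roxygen example is only schematic):

df  <- createDataFrame(data.frame(id = 1L))
adf <- select(df, df$id, expr("array('a', 'b', 'c') AS arr"))
showDF(select(adf, adf$id, posexplode(adf$arr)))
# Expected shape - one row per element, plus its 0-based position:
# +---+---+---+
# | id|pos|col|
# +---+---+---+
# |  1|  0|  a|
# |  1|  1|  b|
# |  1|  2|  c|
# +---+---+---+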

R/pkg/R/generics.R

Lines changed: 10 additions & 0 deletions
@@ -469,6 +469,10 @@ setGeneric("dapplyCollect", function(x, func) { standardGeneric("dapplyCollect")
 #' @export
 setGeneric("gapply", function(x, ...) { standardGeneric("gapply") })
 
+#' @rdname gapplyCollect
+#' @export
+setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect") })
+
 #' @rdname summary
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
@@ -1050,6 +1054,10 @@ setGeneric("percent_rank", function(x) { standardGeneric("percent_rank") })
 #' @export
 setGeneric("pmod", function(y, x) { standardGeneric("pmod") })
 
+#' @rdname posexplode
+#' @export
+setGeneric("posexplode", function(x) { standardGeneric("posexplode") })
+
 #' @rdname quarter
 #' @export
 setGeneric("quarter", function(x) { standardGeneric("quarter") })
@@ -1247,6 +1255,7 @@ setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.gl
 #' @export
 setGeneric("glm")
 
+#' predict
 #' @rdname predict
 #' @export
 setGeneric("predict", function(object, ...) { standardGeneric("predict") })
@@ -1271,6 +1280,7 @@ setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("s
 #' @export
 setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") })
 
+#' write.ml
 #' @rdname write.ml
 #' @export
 setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") })

R/pkg/R/group.R

Lines changed: 40 additions & 53 deletions
@@ -196,64 +196,51 @@ createMethods()
 
 #' gapply
 #'
-#' Applies a R function to each group in the input GroupedData
-#'
-#' @param x a GroupedData
-#' @param func A function to be applied to each group partition specified by GroupedData.
-#'             The function `func` takes as argument a key - grouping columns and
-#'             a data frame - a local R data.frame.
-#'             The output of `func` is a local R data.frame.
-#' @param schema The schema of the resulting SparkDataFrame after the function is applied.
-#'               The schema must match to output of `func`. It has to be defined for each
-#'               output column with preferred output column name and corresponding data type.
-#' @return a SparkDataFrame
+#' @param x A GroupedData
 #' @rdname gapply
 #' @name gapply
 #' @export
-#' @examples
-#' \dontrun{
-#' Computes the arithmetic mean of the second column by grouping
-#' on the first and third columns. Output the grouping values and the average.
-#'
-#' df <- createDataFrame (
-#' list(list(1L, 1, "1", 0.1), list(1L, 2, "1", 0.2), list(3L, 3, "3", 0.3)),
-#'   c("a", "b", "c", "d"))
-#'
-#' Here our output contains three columns, the key which is a combination of two
-#' columns with data types integer and string and the mean which is a double.
-#' schema <- structType(structField("a", "integer"), structField("c", "string"),
-#'   structField("avg", "double"))
-#' df1 <- gapply(
-#'   df,
-#'   list("a", "c"),
-#'   function(key, x) {
-#'     y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
-#'   },
-#'   schema)
-#' collect(df1)
-#'
-#' Result
-#' ------
-#' a c avg
-#' 3 3 3.0
-#' 1 1 1.5
-#' }
 #' @note gapply(GroupedData) since 2.0.0
 setMethod("gapply",
           signature(x = "GroupedData"),
           function(x, func, schema) {
-            try(if (is.null(schema)) stop("schema cannot be NULL"))
-            packageNamesArr <- serialize(.sparkREnv[[".packages"]],
-                                         connection = NULL)
-            broadcastArr <- lapply(ls(.broadcastNames),
-                                   function(name) { get(name, .broadcastNames) })
-            sdf <- callJStatic(
-                     "org.apache.spark.sql.api.r.SQLUtils",
-                     "gapply",
-                     x@sgd,
-                     serialize(cleanClosure(func), connection = NULL),
-                     packageNamesArr,
-                     broadcastArr,
-                     schema$jobj)
-            dataFrame(sdf)
+            if (is.null(schema)) stop("schema cannot be NULL")
+            gapplyInternal(x, func, schema)
+          })
+
+#' gapplyCollect
+#'
+#' @param x A GroupedData
+#' @rdname gapplyCollect
+#' @name gapplyCollect
+#' @export
+#' @note gapplyCollect(GroupedData) since 2.0.0
+setMethod("gapplyCollect",
+          signature(x = "GroupedData"),
+          function(x, func) {
+            gdf <- gapplyInternal(x, func, NULL)
+            content <- callJMethod(gdf@sdf, "collect")
+            # content is a list of items of struct type. Each item has a single field
+            # which is a serialized data.frame corresponds to one group of the
+            # SparkDataFrame.
+            ldfs <- lapply(content, function(x) { unserialize(x[[1]]) })
+            ldf <- do.call(rbind, ldfs)
+            row.names(ldf) <- NULL
+            ldf
           })
+
+gapplyInternal <- function(x, func, schema) {
+  packageNamesArr <- serialize(.sparkREnv[[".packages"]],
+                               connection = NULL)
+  broadcastArr <- lapply(ls(.broadcastNames),
+                         function(name) { get(name, .broadcastNames) })
+  sdf <- callJStatic(
+           "org.apache.spark.sql.api.r.SQLUtils",
+           "gapply",
+           x@sgd,
+           serialize(cleanClosure(func), connection = NULL),
+           packageNamesArr,
+           broadcastArr,
+           if (class(schema) == "structType") { schema$jobj } else { NULL })
+  dataFrame(sdf)
+}
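A usage sketch of the refactored path (reusing the hypothetical df and schema from the DataFrame.R sketches above): both methods now funnel through gapplyInternal, and gapplyCollect passes schema = NULL so SQLUtils.gapply returns serialized per-group data.frames instead of typed rows:

gdf <- group_by(df, "a", "c")

# Distributed result; schema describes the output SparkDataFrame.
out_df <- gapply(gdf, function(key, x) {
  data.frame(key, mean(x$b), stringsAsFactors = FALSE)
}, schema)

# Local result; no schema - groups are unserialized and rbind-ed on the driver.
out_local <- gapplyCollect(gdf, function(key, x) {
  y <- data.frame(key, mean(x$b), stringsAsFactors = FALSE)
  colnames(y) <- c("key_a", "key_c", "mean_b")
  y
})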
