
Commit 70a7268 (merge commit, 2 parents: f41811f + 964b507)

Commit message:
# Conflicts:
#   sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala

Files changed

769 files changed: 23,285 additions and 8,002 deletions. Only a subset of the changed files is shown below.

.gitignore
Lines changed: 2 additions & 0 deletions

@@ -47,6 +47,8 @@ dev/pr-deps/
 dist/
 docs/_site
 docs/api
+sql/docs
+sql/site
 lib_managed/
 lint-r-report.log
 log/

R/pkg/DESCRIPTION
Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@ Package: SparkR
 Type: Package
 Version: 2.3.0
 Title: R Frontend for Apache Spark
-Description: The SparkR package provides an R Frontend for Apache Spark.
+Description: Provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
                     email = "[email protected]"),
              person("Xiangrui", "Meng", role = "aut",

R/pkg/NAMESPACE
Lines changed: 2 additions & 0 deletions

@@ -286,6 +286,8 @@ exportMethods("%<=>%",
               "lower",
               "lpad",
               "ltrim",
+              "map_keys",
+              "map_values",
               "max",
               "md5",
               "mean",

R/pkg/R/DataFrame.R
Lines changed: 40 additions & 4 deletions

@@ -2930,7 +2930,7 @@ setMethod("saveAsTable",
             invisible(callJMethod(write, "saveAsTable", tableName))
           })

-#' summary
+#' describe
 #'
 #' Computes statistics for numeric and string columns.
 #' If no columns are given, this function computes statistics for all numerical or string columns.
@@ -2941,7 +2941,7 @@ setMethod("saveAsTable",
 #' @return A SparkDataFrame.
 #' @family SparkDataFrame functions
 #' @aliases describe,SparkDataFrame,character-method describe,SparkDataFrame,ANY-method
-#' @rdname summary
+#' @rdname describe
 #' @name describe
 #' @export
 #' @examples
@@ -2953,6 +2953,7 @@ setMethod("saveAsTable",
 #' describe(df, "col1")
 #' describe(df, "col1", "col2")
 #' }
+#' @seealso See \link{summary} for expanded statistics and control over which statistics to compute.
 #' @note describe(SparkDataFrame, character) since 1.4.0
 setMethod("describe",
           signature(x = "SparkDataFrame", col = "character"),
@@ -2962,7 +2963,7 @@ setMethod("describe",
             dataFrame(sdf)
           })

-#' @rdname summary
+#' @rdname describe
 #' @name describe
 #' @aliases describe,SparkDataFrame-method
 #' @note describe(SparkDataFrame) since 1.4.0
@@ -2973,15 +2974,50 @@ setMethod("describe",
             dataFrame(sdf)
           })

+#' summary
+#'
+#' Computes specified statistics for numeric and string columns. Available statistics are:
+#' \itemize{
+#' \item count
+#' \item mean
+#' \item stddev
+#' \item min
+#' \item max
+#' \item arbitrary approximate percentiles specified as a percentage (eg, "75%")
+#' }
+#' If no statistics are given, this function computes count, mean, stddev, min,
+#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
+#' This function is meant for exploratory data analysis, as we make no guarantee about the
+#' backward compatibility of the schema of the resulting Dataset. If you want to
+#' programmatically compute summary statistics, use the \code{agg} function instead.
+#'
+#'
 #' @param object a SparkDataFrame to be summarized.
+#' @param ... (optional) statistics to be computed for all columns.
+#' @return A SparkDataFrame.
+#' @family SparkDataFrame functions
 #' @rdname summary
 #' @name summary
 #' @aliases summary,SparkDataFrame-method
+#' @export
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' summary(df)
+#' summary(df, "min", "25%", "75%", "max")
+#' summary(select(df, "age", "height"))
+#' }
 #' @note summary(SparkDataFrame) since 1.5.0
+#' @note The statistics provided by \code{summary} were change in 2.3.0 use \link{describe} for previous defaults.
+#' @seealso \link{describe}
 setMethod("summary",
           signature(object = "SparkDataFrame"),
           function(object, ...) {
-            describe(object)
+            statisticsList <- list(...)
+            sdf <- callJMethod(object@sdf, "summary", statisticsList)
+            dataFrame(sdf)
           })

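For readers of this diff, a minimal SparkR sketch of the behavioral change above. The data frame built from R's built-in faithful dataset is illustrative and not part of the commit:

library(SparkR)
sparkR.session()
df <- createDataFrame(faithful)

# Old default behaviour, still available via describe(): count, mean, stddev, min, max.
head(describe(df))

# New in 2.3.0: summary() accepts specific statistics, including approximate percentiles.
head(summary(df, "min", "25%", "50%", "75%", "max"))

# With no arguments, summary() now also reports the 25%/50%/75% quartiles by default.
head(summary(df))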

R/pkg/R/functions.R
Lines changed: 32 additions & 1 deletion

@@ -195,7 +195,10 @@ NULL
 #' head(tmp2)
 #' head(select(tmp, posexplode(tmp$v1)))
 #' head(select(tmp, sort_array(tmp$v1)))
-#' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))}
+#' head(select(tmp, sort_array(tmp$v1, asc = FALSE)))
+#' tmp3 <- mutate(df, v3 = create_map(df$model, df$cyl))
+#' head(select(tmp3, map_keys(tmp3$v3)))
+#' head(select(tmp3, map_values(tmp3$v3)))}
 NULL

 #' Window functions for Column operations
@@ -3055,6 +3058,34 @@
             column(jc)
           })

+#' @details
+#' \code{map_keys}: Returns an unordered array containing the keys of the map.
+#'
+#' @rdname column_collection_functions
+#' @aliases map_keys map_keys,Column-method
+#' @export
+#' @note map_keys since 2.3.0
+setMethod("map_keys",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "map_keys", x@jc)
+            column(jc)
+          })
+
+#' @details
+#' \code{map_values}: Returns an unordered array containing the values of the map.
+#'
+#' @rdname column_collection_functions
+#' @aliases map_values map_values,Column-method
+#' @export
+#' @note map_values since 2.3.0
+setMethod("map_values",
+          signature(x = "Column"),
+          function(x) {
+            jc <- callJStatic("org.apache.spark.sql.functions", "map_values", x@jc)
+            column(jc)
+          })
+
 #' @details
 #' \code{explode}: Creates a new row for each element in the given array or map column.
 #'
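A small usage sketch of the two new collection functions added above; the mtcars-derived data frame and the choice of key/value columns are illustrative, not taken from the commit:

library(SparkR)
sparkR.session()

df <- createDataFrame(mtcars)
# Build a map column from two existing columns, then pull its keys and values back out.
withMap <- mutate(df, props = create_map(df$gear, df$carb))
head(select(withMap, map_keys(withMap$props), map_values(withMap$props)))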

R/pkg/R/generics.R
Lines changed: 11 additions & 1 deletion

@@ -521,7 +521,7 @@ setGeneric("gapplyCollect", function(x, ...) { standardGeneric("gapplyCollect")
 # @export
 setGeneric("getNumPartitions", function(x) { standardGeneric("getNumPartitions") })

-#' @rdname summary
+#' @rdname describe
 #' @export
 setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })

@@ -1213,6 +1213,16 @@ setGeneric("lpad", function(x, len, pad) { standardGeneric("lpad") })
 #' @name NULL
 setGeneric("ltrim", function(x) { standardGeneric("ltrim") })

+#' @rdname column_collection_functions
+#' @export
+#' @name NULL
+setGeneric("map_keys", function(x) { standardGeneric("map_keys") })
+
+#' @rdname column_collection_functions
+#' @export
+#' @name NULL
+setGeneric("map_values", function(x) { standardGeneric("map_values") })
+
 #' @rdname column_misc_functions
 #' @export
 #' @name NULL

R/pkg/R/install.R
Lines changed: 5 additions & 1 deletion

@@ -270,7 +270,11 @@ sparkCachePath <- function() {
   if (is_windows()) {
     winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA)
     if (is.na(winAppPath)) {
-      stop(paste("%LOCALAPPDATA% not found.",
+      message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
+      winAppPath <- Sys.getenv("USERPROFILE", unset = NA)
+    }
+    if (is.na(winAppPath)) {
+      stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.",
                  "Please define the environment variable",
                  "or restart and enter an installation path in localDir."))
     } else {
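The change above boils down to an environment-variable fallback. A standalone sketch of the same pattern, runnable on its own; the function name is illustrative and not the one used in install.R:

# Resolve a Windows cache root: prefer %LOCALAPPDATA%, fall back to %USERPROFILE%.
resolveWinCacheRoot <- function() {
  path <- Sys.getenv("LOCALAPPDATA", unset = NA)
  if (is.na(path)) {
    message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.")
    path <- Sys.getenv("USERPROFILE", unset = NA)
  }
  if (is.na(path)) {
    stop("Neither %LOCALAPPDATA% nor %USERPROFILE% is set.")
  }
  path
}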

R/pkg/R/mllib_classification.R
Lines changed: 40 additions & 9 deletions

@@ -69,6 +69,11 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
 #'                         or the number of partitions are large, this param could be adjusted to a larger size.
 #'                         This is an expert parameter. Default value should be good for most cases.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.svmLinear} returns a fitted linear SVM model.
 #' @rdname spark.svmLinear
@@ -98,7 +103,8 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @note spark.svmLinear since 2.2.0
 setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, regParam = 0.0, maxIter = 100, tol = 1E-6, standardization = TRUE,
-                   threshold = 0.0, weightCol = NULL, aggregationDepth = 2) {
+                   threshold = 0.0, weightCol = NULL, aggregationDepth = 2,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")

             if (!is.null(weightCol) && weightCol == "") {
@@ -107,10 +113,12 @@ setMethod("spark.svmLinear", signature(data = "SparkDataFrame", formula = "formu
               weightCol <- as.character(weightCol)
             }

+            handleInvalid <- match.arg(handleInvalid)
+
             jobj <- callJStatic("org.apache.spark.ml.r.LinearSVCWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam), as.integer(maxIter),
                                 as.numeric(tol), as.logical(standardization), as.numeric(threshold),
-                                weightCol, as.integer(aggregationDepth))
+                                weightCol, as.integer(aggregationDepth), handleInvalid)
             new("LinearSVCModel", jobj = jobj)
           })

@@ -218,6 +226,11 @@ function(object, path, overwrite = FALSE) {
 #' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
 #'                                The bound vector size must be equal to 1 for binomial regression, or the number
 #'                                of classes for multinomial regression.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.logit} returns a fitted logistic regression model.
 #' @rdname spark.logit
@@ -257,7 +270,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
                    tol = 1E-6, family = "auto", standardization = TRUE,
                    thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
                    lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
-                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
+                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
             row <- 0
             col <- 0
@@ -304,6 +318,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
               upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
             }

+            handleInvalid <- match.arg(handleInvalid)
+
             jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam),
                                 as.numeric(elasticNetParam), as.integer(maxIter),
@@ -312,7 +328,8 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula")
                                 weightCol, as.integer(aggregationDepth),
                                 as.integer(row), as.integer(col),
                                 lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
-                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
+                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts,
+                                handleInvalid)
             new("LogisticRegressionModel", jobj = jobj)
           })

@@ -394,7 +411,12 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char
 #' @param stepSize stepSize parameter.
 #' @param seed seed parameter for weights initialization.
 #' @param initialWeights initialWeights parameter for weights initialization, it should be a
-#'        numeric vector.
+#'                       numeric vector.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model.
 #' @rdname spark.mlp
@@ -426,7 +448,8 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char
 #' @note spark.mlp since 2.1.0
 setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, layers, blockSize = 128, solver = "l-bfgs", maxIter = 100,
-                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL) {
+                   tol = 1E-6, stepSize = 0.03, seed = NULL, initialWeights = NULL,
+                   handleInvalid = c("error", "keep", "skip")) {
             formula <- paste(deparse(formula), collapse = "")
             if (is.null(layers)) {
               stop ("layers must be a integer vector with length > 1.")
@@ -441,10 +464,11 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"),
             if (!is.null(initialWeights)) {
               initialWeights <- as.array(as.numeric(na.omit(initialWeights)))
             }
+            handleInvalid <- match.arg(handleInvalid)
             jobj <- callJStatic("org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper",
                                 "fit", data@sdf, formula, as.integer(blockSize), as.array(layers),
                                 as.character(solver), as.integer(maxIter), as.numeric(tol),
-                                as.numeric(stepSize), seed, initialWeights)
+                                as.numeric(stepSize), seed, initialWeights, handleInvalid)
             new("MultilayerPerceptronClassificationModel", jobj = jobj)
           })

@@ -514,6 +538,11 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode
 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
 #'                operators are supported, including '~', '.', ':', '+', and '-'.
 #' @param smoothing smoothing parameter.
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label
+#'                      column of string type.
+#'                      Supported options: "skip" (filter out rows with invalid data),
+#'                      "error" (throw an error), "keep" (put invalid data in a special additional
+#'                      bucket, at index numLabels). Default is "error".
 #' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}.
 #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model.
 #' @rdname spark.naiveBayes
@@ -543,10 +572,12 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode
 #' }
 #' @note spark.naiveBayes since 2.0.0
 setMethod("spark.naiveBayes", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula, smoothing = 1.0) {
+          function(data, formula, smoothing = 1.0,
+                   handleInvalid = c("error", "keep", "skip")) {
            formula <- paste(deparse(formula), collapse = "")
+            handleInvalid <- match.arg(handleInvalid)
             jobj <- callJStatic("org.apache.spark.ml.r.NaiveBayesWrapper", "fit",
-                                formula, data@sdf, smoothing)
+                                formula, data@sdf, smoothing, handleInvalid)
             new("NaiveBayesModel", jobj = jobj)
           })

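A hedged end-to-end sketch of how the new handleInvalid argument is passed from user code. The Titanic-based training frame mirrors the existing spark.naiveBayes documentation examples rather than anything added in this commit:

library(SparkR)
sparkR.session()

# String label and string features; rows whose label or string feature value is
# invalid (unseen or NULL) can now be skipped, kept in an extra bucket, or rejected.
titanic <- as.data.frame(Titanic)
df <- createDataFrame(titanic[titanic$Freq > 0, -5])

model <- spark.naiveBayes(df, Survived ~ Class + Sex + Age, handleInvalid = "keep")
head(predict(model, df))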
