Skip to content

Commit f9c09a1

Browse files
committed
Merge remote-tracking branch 'upstream/master' into remove-history-master-SPARK-12299
Conflicts: core/src/main/scala/org/apache/spark/deploy/master/Master.scala
2 parents fadd392 + e896336 commit f9c09a1

File tree

807 files changed

+28753
-12332
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

807 files changed

+28753
-12332
lines changed

LICENSE

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -257,9 +257,8 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
257257
(BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org)
258258
(BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org)
259259
(BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - http://spire-math.org)
260-
(New BSD License) Kryo (com.esotericsoftware.kryo:kryo:2.21 - http://code.google.com/p/kryo/)
261-
(New BSD License) MinLog (com.esotericsoftware.minlog:minlog:1.2 - http://code.google.com/p/minlog/)
262-
(New BSD License) ReflectASM (com.esotericsoftware.reflectasm:reflectasm:1.07 - http://code.google.com/p/reflectasm/)
260+
(New BSD License) Kryo (com.esotericsoftware:kryo:3.0.3 - https://github.com/EsotericSoftware/kryo)
261+
(New BSD License) MinLog (com.esotericsoftware:minlog:1.3.0 - https://github.com/EsotericSoftware/minlog)
263262
(New BSD license) Protocol Buffer Java API (com.google.protobuf:protobuf-java:2.5.0 - http://code.google.com/p/protobuf)
264263
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
265264
(The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)

R/pkg/NAMESPACE

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ exportMethods("%in%",
265265
"var_samp",
266266
"weekofyear",
267267
"when",
268+
"window",
268269
"year")
269270

270271
exportClasses("GroupedData")
@@ -291,7 +292,8 @@ export("as.DataFrame",
291292
"tableToDF",
292293
"tableNames",
293294
"tables",
294-
"uncacheTable")
295+
"uncacheTable",
296+
"print.summary.GeneralizedLinearRegressionModel")
295297

296298
export("structField",
297299
"structField.jobj",

R/pkg/R/functions.R

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2131,6 +2131,69 @@ setMethod("from_unixtime", signature(x = "Column"),
21312131
column(jc)
21322132
})
21332133

2134+
#' window
2135+
#'
2136+
#' Bucketize rows into one or more time windows given a timestamp specifying column. Window
2137+
#' starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window
2138+
#' [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in
2139+
#' the order of months are not supported.
2140+
#'
2141+
#' The time column must be of TimestampType.
2142+
#'
2143+
#' Durations are provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid
2144+
#' interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'.
2145+
#' If the `slideDuration` is not provided, the windows will be tumbling windows.
2146+
#'
2147+
#' The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start
2148+
#' window intervals. For example, in order to have hourly tumbling windows that start 15 minutes
2149+
#' past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`.
2150+
#'
2151+
#' The output column will be a struct called 'window' by default with the nested columns 'start'
2152+
#' and 'end'.
2153+
#'
2154+
#' @family datetime_funcs
2155+
#' @rdname window
2156+
#' @name window
2157+
#' @export
2158+
#' @examples
2159+
#'\dontrun{
2160+
#' # One minute windows every 15 seconds 10 seconds after the minute, e.g. 09:00:10-09:01:10,
2161+
#' # 09:00:25-09:01:25, 09:00:40-09:01:40, ...
2162+
#' window(df$time, "1 minute", "15 seconds", "10 seconds")
2163+
#'
2164+
#' # One minute tumbling windows 15 seconds after the minute, e.g. 09:00:15-09:01:15,
2165+
#' # 09:01:15-09:02:15...
2166+
#' window(df$time, "1 minute", startTime = "15 seconds")
2167+
#'
2168+
#' # Thirty second windows every 10 seconds, e.g. 09:00:00-09:00:30, 09:00:10-09:00:40, ...
2169+
#' window(df$time, "30 seconds", "10 seconds")
2170+
#'}
2171+
setMethod("window", signature(x = "Column"),
2172+
function(x, windowDuration, slideDuration = NULL, startTime = NULL) {
2173+
stopifnot(is.character(windowDuration))
2174+
if (!is.null(slideDuration) && !is.null(startTime)) {
2175+
stopifnot(is.character(slideDuration) && is.character(startTime))
2176+
jc <- callJStatic("org.apache.spark.sql.functions",
2177+
"window",
2178+
x@jc, windowDuration, slideDuration, startTime)
2179+
} else if (!is.null(slideDuration)) {
2180+
stopifnot(is.character(slideDuration))
2181+
jc <- callJStatic("org.apache.spark.sql.functions",
2182+
"window",
2183+
x@jc, windowDuration, slideDuration)
2184+
} else if (!is.null(startTime)) {
2185+
stopifnot(is.character(startTime))
2186+
jc <- callJStatic("org.apache.spark.sql.functions",
2187+
"window",
2188+
x@jc, windowDuration, windowDuration, startTime)
2189+
} else {
2190+
jc <- callJStatic("org.apache.spark.sql.functions",
2191+
"window",
2192+
x@jc, windowDuration)
2193+
}
2194+
column(jc)
2195+
})
2196+
21342197
#' locate
21352198
#'
21362199
#' Locate the position of the first occurrence of substr.

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1152,6 +1152,10 @@ setGeneric("var_samp", function(x) { standardGeneric("var_samp") })
11521152
#' @export
11531153
setGeneric("weekofyear", function(x) { standardGeneric("weekofyear") })
11541154

1155+
#' @rdname window
1156+
#' @export
1157+
setGeneric("window", function(x, ...) { standardGeneric("window") })
1158+
11551159
#' @rdname year
11561160
#' @export
11571161
setGeneric("year", function(x) { standardGeneric("year") })

R/pkg/R/mllib.R

Lines changed: 104 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@
1717

1818
# mllib.R: Provides methods for MLlib integration
1919

20-
#' @title S4 class that represents a PipelineModel
21-
#' @param model A Java object reference to the backing Scala PipelineModel
20+
#' @title S4 class that represents a generalized linear model
21+
#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper
2222
#' @export
23-
setClass("PipelineModel", representation(model = "jobj"))
23+
setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))
2424

2525
#' @title S4 class that represents a NaiveBayesModel
2626
#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
@@ -39,21 +39,18 @@ setClass("KMeansModel", representation(jobj = "jobj"))
3939

4040
#' Fits a generalized linear model
4141
#'
42-
#' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
42+
#' Fits a generalized linear model, similarly to R's glm().
4343
#'
4444
#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
4545
#' operators are supported, including '~', '.', ':', '+', and '-'.
46-
#' @param data DataFrame for training
47-
#' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
48-
#' @param lambda Regularization parameter
49-
#' @param alpha Elastic-net mixing parameter (see glmnet's documentation for details)
50-
#' @param standardize Whether to standardize features before training
51-
#' @param solver The solver algorithm used for optimization, this can be "l-bfgs", "normal" and
52-
#' "auto". "l-bfgs" denotes Limited-memory BFGS which is a limited-memory
53-
#' quasi-Newton optimization method. "normal" denotes using Normal Equation as an
54-
#' analytical solution to the linear regression problem. The default value is "auto"
55-
#' which means that the solver algorithm is selected automatically.
56-
#' @return a fitted MLlib model
46+
#' @param data DataFrame for training.
47+
#' @param family A description of the error distribution and link function to be used in the model.
48+
#' This can be a character string naming a family function, a family function or
49+
#' the result of a call to a family function. Refer R family at
50+
#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
51+
#' @param epsilon Positive convergence tolerance of iterations.
52+
#' @param maxit Integer giving the maximal number of IRLS iterations.
53+
#' @return a fitted generalized linear model
5754
#' @rdname glm
5855
#' @export
5956
#' @examples
@@ -64,36 +61,113 @@ setClass("KMeansModel", representation(jobj = "jobj"))
6461
#' df <- createDataFrame(sqlContext, iris)
6562
#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
6663
#' summary(model)
67-
#'}
64+
#' }
6865
setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"),
69-
function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0,
70-
standardize = TRUE, solver = "auto") {
71-
family <- match.arg(family)
66+
function(formula, family = gaussian, data, epsilon = 1e-06, maxit = 25) {
67+
if (is.character(family)) {
68+
family <- get(family, mode = "function", envir = parent.frame())
69+
}
70+
if (is.function(family)) {
71+
family <- family()
72+
}
73+
if (is.null(family$family)) {
74+
print(family)
75+
stop("'family' not recognized")
76+
}
77+
7278
formula <- paste(deparse(formula), collapse = "")
73-
model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
74-
"fitRModelFormula", formula, data@sdf, family, lambda,
75-
alpha, standardize, solver)
76-
return(new("PipelineModel", model = model))
79+
80+
jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
81+
"fit", formula, data@sdf, family$family, family$link,
82+
epsilon, as.integer(maxit))
83+
return(new("GeneralizedLinearRegressionModel", jobj = jobj))
7784
})
7885

79-
#' Make predictions from a model
86+
#' Get the summary of a generalized linear model
8087
#'
81-
#' Makes predictions from a model produced by glm(), similarly to R's predict().
88+
#' Returns the summary of a model produced by glm(), similarly to R's summary().
8289
#'
83-
#' @param object A fitted MLlib model
90+
#' @param object A fitted generalized linear model
91+
#' @return coefficients the model's coefficients, intercept
92+
#' @rdname summary
93+
#' @export
94+
#' @examples
95+
#' \dontrun{
96+
#' model <- glm(y ~ x, trainingData)
97+
#' summary(model)
98+
#' }
99+
setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
100+
function(object, ...) {
101+
jobj <- object@jobj
102+
features <- callJMethod(jobj, "rFeatures")
103+
coefficients <- callJMethod(jobj, "rCoefficients")
104+
deviance.resid <- callJMethod(jobj, "rDevianceResiduals")
105+
dispersion <- callJMethod(jobj, "rDispersion")
106+
null.deviance <- callJMethod(jobj, "rNullDeviance")
107+
deviance <- callJMethod(jobj, "rDeviance")
108+
df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
109+
df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
110+
aic <- callJMethod(jobj, "rAic")
111+
iter <- callJMethod(jobj, "rNumIterations")
112+
family <- callJMethod(jobj, "rFamily")
113+
114+
deviance.resid <- dataFrame(deviance.resid)
115+
coefficients <- matrix(coefficients, ncol = 4)
116+
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
117+
rownames(coefficients) <- unlist(features)
118+
ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
119+
dispersion = dispersion, null.deviance = null.deviance,
120+
deviance = deviance, df.null = df.null, df.residual = df.residual,
121+
aic = aic, iter = iter, family = family)
122+
class(ans) <- "summary.GeneralizedLinearRegressionModel"
123+
return(ans)
124+
})
125+
126+
#' Print the summary of GeneralizedLinearRegressionModel
127+
#'
128+
#' @rdname print
129+
#' @name print.summary.GeneralizedLinearRegressionModel
130+
#' @export
131+
print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
132+
x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals",
133+
c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max"))
134+
x$deviance.resid <- zapsmall(x$deviance.resid, 5L)
135+
cat("\nDeviance Residuals: \n")
136+
cat("(Note: These are approximate quantiles with relative error <= 0.01)\n")
137+
print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L)
138+
139+
cat("\nCoefficients:\n")
140+
print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L)
141+
142+
cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion),
143+
")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"),
144+
format(unlist(x[c("null.deviance", "deviance")]), digits = 5L),
145+
" on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"),
146+
1L, paste, collapse = " "), sep = "")
147+
cat("AIC: ", format(x$aic, digits = 4L), "\n\n",
148+
"Number of Fisher Scoring iterations: ", x$iter, "\n", sep = "")
149+
cat("\n")
150+
invisible(x)
151+
}
152+
153+
#' Make predictions from a generalized linear model
154+
#'
155+
#' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict().
156+
#'
157+
#' @param object A fitted generalized linear model
84158
#' @param newData DataFrame for testing
85-
#' @return DataFrame containing predicted values
159+
#' @return DataFrame containing predicted labels in a column named "prediction"
86160
#' @rdname predict
87161
#' @export
88162
#' @examples
89163
#' \dontrun{
90164
#' model <- glm(y ~ x, trainingData)
91165
#' predicted <- predict(model, testData)
92166
#' showDF(predicted)
93-
#'}
94-
setMethod("predict", signature(object = "PipelineModel"),
167+
#' }
168+
setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"),
95169
function(object, newData) {
96-
return(dataFrame(callJMethod(object@model, "transform", newData@sdf)))
170+
return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
97171
})
98172

99173
#' Make predictions from a naive Bayes model
@@ -116,54 +190,6 @@ setMethod("predict", signature(object = "NaiveBayesModel"),
116190
return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
117191
})
118192

119-
#' Get the summary of a model
120-
#'
121-
#' Returns the summary of a model produced by glm(), similarly to R's summary().
122-
#'
123-
#' @param object A fitted MLlib model
124-
#' @return a list with 'devianceResiduals' and 'coefficients' components for gaussian family
125-
#' or a list with 'coefficients' component for binomial family. \cr
126-
#' For gaussian family: the 'devianceResiduals' gives the min/max deviance residuals
127-
#' of the estimation, the 'coefficients' gives the estimated coefficients and their
128-
#' estimated standard errors, t values and p-values. (It only available when model
129-
#' fitted by normal solver.) \cr
130-
#' For binomial family: the 'coefficients' gives the estimated coefficients.
131-
#' See summary.glm for more information. \cr
132-
#' @rdname summary
133-
#' @export
134-
#' @examples
135-
#' \dontrun{
136-
#' model <- glm(y ~ x, trainingData)
137-
#' summary(model)
138-
#'}
139-
setMethod("summary", signature(object = "PipelineModel"),
140-
function(object, ...) {
141-
modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
142-
"getModelName", object@model)
143-
features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
144-
"getModelFeatures", object@model)
145-
coefficients <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
146-
"getModelCoefficients", object@model)
147-
if (modelName == "LinearRegressionModel") {
148-
devianceResiduals <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
149-
"getModelDevianceResiduals", object@model)
150-
devianceResiduals <- matrix(devianceResiduals, nrow = 1)
151-
colnames(devianceResiduals) <- c("Min", "Max")
152-
rownames(devianceResiduals) <- rep("", times = 1)
153-
coefficients <- matrix(coefficients, ncol = 4)
154-
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
155-
rownames(coefficients) <- unlist(features)
156-
return(list(devianceResiduals = devianceResiduals, coefficients = coefficients))
157-
} else if (modelName == "LogisticRegressionModel") {
158-
coefficients <- as.matrix(unlist(coefficients))
159-
colnames(coefficients) <- c("Estimate")
160-
rownames(coefficients) <- unlist(features)
161-
return(list(coefficients = coefficients))
162-
} else {
163-
stop(paste("Unsupported model", modelName, sep = " "))
164-
}
165-
})
166-
167193
#' Get the summary of a naive Bayes model
168194
#'
169195
#' Returns the summary of a naive Bayes model produced by naiveBayes(), similarly to R's summary().

R/pkg/inst/tests/testthat/test_context.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ test_that("Check masked functions", {
2626
maskedBySparkR <- masked[funcSparkROrEmpty]
2727
namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
2828
"colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
29-
"summary", "transform", "drop")
29+
"summary", "transform", "drop", "window")
3030
expect_equal(length(maskedBySparkR), length(namesOfMasked))
3131
expect_equal(sort(maskedBySparkR), sort(namesOfMasked))
3232
# above are those reported as masked when `library(SparkR)`

0 commit comments

Comments
 (0)