Skip to content

Commit 5f31d31

Browse files
author
Wayne Zhang
committed
change param name and update doc
1 parent 147311b commit 5f31d31

File tree

2 files changed

+26
-20
lines changed

2 files changed

+26
-20
lines changed

mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,29 +38,35 @@ import org.apache.spark.sql.types._
3838
private[feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
3939

4040
/**
41-
* Param for how to order labels of string column. The first label after ordering is assigned
42-
* an index of 0.
43-
* Options are:
44-
* - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0)
45-
* - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0)
46-
* - 'alphabetDesc': descending alphabetical order
47-
* - 'alphabetAsc': ascending alphabetical order
48-
* Default is 'frequencyDesc'.
49-
* When the ordering is set to 'alphabetDesc', `RFormula` drops the same category as R
50-
* when encoding strings.
41+
* Param for how to order categories of a FEATURE string column used by `StringIndexer`.
42+
* The last category after ordering is dropped when encoding strings.
43+
* The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b'
44+
* |
45+
* | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula
46+
* | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c')
47+
* | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b')
48+
* | 'alphabetDesc' | last alphabetical category ('c') | first alphabetical category ('a')
49+
* | 'alphabetAsc' | first alphabetical category ('a') | last alphabetical category ('c')
50+
* |
51+
* The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula`
52+
* drops the same category as R when encoding strings.
53+
* Note that this ordering option is NOT used for the label column. When the label column is
54+
* indexed, it uses the default descending frequency ordering in `StringIndexer`.
5155
*
5256
* @group param
5357
*/
5458
@Since("2.3.0")
55-
final val stringOrderType: Param[String] = new Param(this, "stringOrderType",
56-
"How to order labels of string column. " +
57-
"The first label after ordering is assigned an index of 0. " +
59+
final val stringIndexerOrderType: Param[String] = new Param(this, "stringIndexerOrderType",
60+
"How to order categories of a FEATURE string column used by StringIndexer. " +
61+
"The last category after ordering is dropped when encoding strings. " +
62+
"The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " +
63+
"RFormula drops the same category as R when encoding strings. " +
5864
s"Supported options: ${StringIndexer.supportedStringOrderType.mkString(", ")}.",
5965
ParamValidators.inArray(StringIndexer.supportedStringOrderType))
6066

6167
/** @group getParam */
6268
@Since("2.3.0")
63-
def getStringOrderType: String = $(stringOrderType)
69+
def getStringIndexerOrderType: String = $(stringIndexerOrderType)
6470

6571
protected def hasLabelCol(schema: StructType): Boolean = {
6672
schema.map(_.name).contains($(labelCol))
@@ -152,8 +158,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
152158

153159
/** @group setParam */
154160
@Since("2.3.0")
155-
def setStringOrderType(value: String): this.type = set(stringOrderType, value)
156-
setDefault(stringOrderType, StringIndexer.frequencyDesc)
161+
def setStringIndexerOrderType(value: String): this.type = set(stringIndexerOrderType, value)
162+
setDefault(stringIndexerOrderType, StringIndexer.frequencyDesc)
157163

158164
/** Whether the formula specifies fitting an intercept. */
159165
private[ml] def hasIntercept: Boolean = {
@@ -185,7 +191,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
185191
encoderStages += new StringIndexer()
186192
.setInputCol(term)
187193
.setOutputCol(indexCol)
188-
.setStringOrderType($(stringOrderType))
194+
.setStringOrderType($(stringIndexerOrderType))
189195
prefixesToRewrite(indexCol + "_") = term + "_"
190196
(term, indexCol)
191197
case _ =>

mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
129129
assert(result.collect() === expected.collect())
130130
}
131131

132-
test("encodes string terms with string order type") {
132+
test("encodes string terms with string indexer order type") {
133133
val formula = new RFormula().setFormula("id ~ a + b")
134134
val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
135135
.toDF("id", "a", "b")
@@ -163,7 +163,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
163163

164164
var idx = 0
165165
for (orderType <- StringIndexer.supportedStringOrderType) {
166-
val model = formula.setStringOrderType(orderType).fit(original)
166+
val model = formula.setStringIndexerOrderType(orderType).fit(original)
167167
val result = model.transform(original)
168168
val resultSchema = model.transformSchema(original.schema)
169169
assert(result.schema.toString == resultSchema.toString)
@@ -190,7 +190,7 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
190190
val original = Seq((1, "foo", 4), (2, "bar", 4), (3, "bar", 5), (4, "aaz", 5))
191191
.toDF("id", "a", "b")
192192
val formula = new RFormula().setFormula("id ~ a + b")
193-
.setStringOrderType(StringIndexer.alphabetDesc)
193+
.setStringIndexerOrderType(StringIndexer.alphabetDesc)
194194

195195
/*
196196
Note that the category dropped after encoding is the same between R and Spark

0 commit comments

Comments
 (0)