@@ -38,29 +38,35 @@ import org.apache.spark.sql.types._
3838private [feature] trait RFormulaBase extends HasFeaturesCol with HasLabelCol {
3939
4040 /**
41- * Param for how to order labels of string column. The first label after ordering is assigned
42- * an index of 0.
43- * Options are:
44- * - 'frequencyDesc': descending order by label frequency (most frequent label assigned 0)
45- * - 'frequencyAsc': ascending order by label frequency (least frequent label assigned 0)
46- * - 'alphabetDesc': descending alphabetical order
47- * - 'alphabetAsc': ascending alphabetical order
48- * Default is 'frequencyDesc'.
49- * When the ordering is set to 'alphabetDesc', `RFormula` drops the same category as R
50- * when encoding strings.
41+ * Param for how to order categories of a FEATURE string column used by `StringIndexer`.
42+ * The last category after ordering is dropped when encoding strings.
43+ * The options are explained using an example string: 'b', 'a', 'b', 'a', 'c', 'b'
44+ * |
45+ * | Option | Category mapped to 0 by StringIndexer | Category dropped by RFormula
46+ * | 'frequencyDesc' | most frequent category ('b') | least frequent category ('c')
47+ * | 'frequencyAsc' | least frequent category ('c') | most frequent category ('b')
48+ * | 'alphabetDesc' | last alphabetical category ('c') | first alphabetical category ('a')
49+ * | 'alphabetAsc' | first alphabetical category ('a') | last alphabetical category ('c')
50+ * |
51+ * The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', `RFormula`
52+ * drops the same category as R when encoding strings.
53+ * Note that this ordering option is NOT used for the label column. When the label column is
54+ * indexed, it uses the default descending frequency ordering in `StringIndexer`.
5155 *
5256 * @group param
5357 */
5458 @ Since (" 2.3.0" )
55- final val stringOrderType : Param [String ] = new Param (this , " stringOrderType" ,
56- " How to order labels of string column. " +
57- " The first label after ordering is assigned an index of 0. " +
59+ final val stringIndexerOrderType : Param [String ] = new Param (this , " stringIndexerOrderType" ,
60+ " How to order categories of a FEATURE string column used by StringIndexer. " +
61+ " The last category after ordering is dropped when encoding strings. " +
62+ " The default value is 'frequencyDesc'. When the ordering is set to 'alphabetDesc', " +
63+ " RFormula drops the same category as R when encoding strings. " +
5864 s " Supported options: ${StringIndexer .supportedStringOrderType.mkString(" , " )}. " ,
5965 ParamValidators .inArray(StringIndexer .supportedStringOrderType))
6066
6167 /** @group getParam */
6268 @ Since (" 2.3.0" )
63- def getStringOrderType : String = $(stringOrderType )
69+ def getStringIndexerOrderType : String = $(stringIndexerOrderType )
6470
6571 protected def hasLabelCol (schema : StructType ): Boolean = {
6672 schema.map(_.name).contains($(labelCol))
@@ -152,8 +158,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
152158
153159 /** @group setParam */
154160 @ Since (" 2.3.0" )
155- def setStringOrderType (value : String ): this .type = set(stringOrderType , value)
156- setDefault(stringOrderType , StringIndexer .frequencyDesc)
161+ def setStringIndexerOrderType (value : String ): this .type = set(stringIndexerOrderType , value)
162+ setDefault(stringIndexerOrderType , StringIndexer .frequencyDesc)
157163
158164 /** Whether the formula specifies fitting an intercept. */
159165 private [ml] def hasIntercept : Boolean = {
@@ -185,7 +191,7 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String)
185191 encoderStages += new StringIndexer ()
186192 .setInputCol(term)
187193 .setOutputCol(indexCol)
188- .setStringOrderType($(stringOrderType ))
194+ .setStringOrderType($(stringIndexerOrderType ))
189195 prefixesToRewrite(indexCol + " _" ) = term + " _"
190196 (term, indexCol)
191197 case _ =>
0 commit comments