Skip to content

Commit e4d799b

Browse files
committed
Addresses indentation and doc comments
1 parent b48a70f commit e4d799b

File tree

4 files changed

+43
-45
lines changed

4 files changed

+43
-45
lines changed

docs/mllib-ensembles.md

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -429,16 +429,15 @@ We omit some decision tree parameters since those are covered in the [decision t
429429

430430
#### Validation while training
431431

432-
Gradient boosting can overfit when trained with more number of trees. In order to prevent overfitting, it might
433-
be useful to validate while training. The method **`runWithValidation`** has been provided to make use of this
434-
option. It takes a pair of RDD's as arguments, the first one being the training dataset and the second being the validation dataset.
432+
Gradient boosting can overfit when trained with more trees. In order to prevent overfitting, it is useful to validate while
433+
training. The method runWithValidation has been provided to make use of this option. It takes a pair of RDDs as arguments, the
434+
first one being the training dataset and the second being the validation dataset.
435435

436436
The training is stopped when the improvement in the validation error is not more than a certain tolerance
437-
(supplied by the **`validationTol`** argument in **`BoostingStrategy`**). In practice, the validation error
438-
decreases with the increase in number of trees and then increases as the model starts to overfit. There might
439-
be cases, in which the validation error does not change monotonically, and the user is advised to set a large
440-
enough negative tolerance and examine the validation curve to make further inference.
441-
437+
(supplied by the validationTol argument in BoostingStrategy). In practice, the validation error
438+
decreases initially and later increases. There might be cases in which the validation error does not change monotonically,
439+
and the user is advised to set a large enough negative tolerance and examine the validation curve to tune the number of
440+
iterations.
442441

443442
### Examples
444443

mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -80,26 +80,28 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
8080

8181
/**
8282
* Method to validate a gradient boosting model
83-
* @param trainInput Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
84-
* @param validateInput Validation dataset:
83+
* @param input Training dataset: RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
84+
* @param validationInput Validation dataset:
8585
RDD of [[org.apache.spark.mllib.regression.LabeledPoint]].
86-
Should follow same distribution as trainInput.
86+
Should be different from and follow the same distribution as input.
87+
e.g., these two datasets could be created from an original dataset
88+
by using [[org.apache.spark.rdd.RDD.randomSplit()]]
8789
* @return a gradient boosted trees model that can be used for prediction
8890
*/
8991
def runWithValidation(
90-
trainInput: RDD[LabeledPoint],
91-
validateInput: RDD[LabeledPoint]): GradientBoostedTreesModel = {
92+
input: RDD[LabeledPoint],
93+
validationInput: RDD[LabeledPoint]): GradientBoostedTreesModel = {
9294
val algo = boostingStrategy.treeStrategy.algo
9395
algo match {
9496
case Regression => GradientBoostedTrees.boost(
95-
trainInput, validateInput, boostingStrategy, validate=true)
97+
input, validationInput, boostingStrategy, validate=true)
9698
case Classification =>
9799
// Map labels to -1, +1 so binary classification can be treated as regression.
98-
val remappedTrainInput = trainInput.map(
100+
val remappedInput = input.map(
99101
x => new LabeledPoint((x.label * 2) - 1, x.features))
100-
val remappedValidateInput = trainInput.map(
102+
val remappedValidationInput = validationInput.map(
101103
x => new LabeledPoint((x.label * 2) - 1, x.features))
102-
GradientBoostedTrees.boost(remappedTrainInput, remappedValidateInput, boostingStrategy,
104+
GradientBoostedTrees.boost(remappedInput, remappedValidationInput, boostingStrategy,
103105
validate=true)
104106
case _ =>
105107
throw new IllegalArgumentException(s"$algo is not supported by the gradient boosting.")
@@ -110,9 +112,9 @@ class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
110112
* Java-friendly API for [[org.apache.spark.mllib.tree.GradientBoostedTrees!#runWithValidation]].
111113
*/
112114
def runWithValidation(
113-
trainInput: JavaRDD[LabeledPoint],
114-
validateInput: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = {
115-
runWithValidation(trainInput.rdd, validateInput.rdd)
115+
input: JavaRDD[LabeledPoint],
116+
validationInput: JavaRDD[LabeledPoint]): GradientBoostedTreesModel = {
117+
runWithValidation(input.rdd, validationInput.rdd)
116118
}
117119
}
118120

@@ -145,16 +147,16 @@ object GradientBoostedTrees extends Logging {
145147
/**
146148
* Internal method for performing regression using trees as base learners.
147149
* @param input training dataset
148-
* @param validateInput validation dataset, ignored if validate is set to false.
150+
* @param validationInput validation dataset, ignored if validate is set to false.
149151
* @param boostingStrategy boosting parameters
150152
* @param validate whether or not to use the validation dataset.
151153
* @return a gradient boosted trees model that can be used for prediction
152154
*/
153155
private def boost(
154156
input: RDD[LabeledPoint],
155-
validateInput: RDD[LabeledPoint],
157+
validationInput: RDD[LabeledPoint],
156158
boostingStrategy: BoostingStrategy,
157-
validate: Boolean = false): GradientBoostedTreesModel = {
159+
validate: Boolean): GradientBoostedTreesModel = {
158160

159161
val timer = new TimeTracker()
160162
timer.start("total")
@@ -198,7 +200,7 @@ object GradientBoostedTrees extends Logging {
198200
// Note: A model of type regression is used since we require raw prediction
199201
timer.stop("building tree 0")
200202

201-
var bestValidateError = if (validate) loss.computeError(startingModel, validateInput) else 0.0
203+
var bestValidateError = if (validate) loss.computeError(startingModel, validationInput) else 0.0
202204
var bestM = 1
203205

204206
// pseudo-residual for second iteration
@@ -225,19 +227,18 @@ object GradientBoostedTrees extends Logging {
225227

226228
if (validate) {
227229
// Stop training early if
228-
// 1. Reduction in error is lesser than the validationTol or
230+
// 1. Reduction in error is less than the validationTol or
229231
// 2. If the error increases, that is if the model is overfit.
230232
// We want the model returned corresponding to the best validation error.
231-
val currentValidateError = loss.computeError(partialModel, validateInput)
233+
val currentValidateError = loss.computeError(partialModel, validationInput)
232234
if (bestValidateError - currentValidateError < validationTol) {
233235
return new GradientBoostedTreesModel(
234236
boostingStrategy.treeStrategy.algo,
235237
baseLearners.slice(0, bestM),
236238
baseLearnerWeights.slice(0, bestM))
237-
}
238-
else if (currentValidateError < bestValidateError){
239-
bestValidateError = currentValidateError
240-
bestM = m + 1
239+
} else if (currentValidateError < bestValidateError){
240+
bestValidateError = currentValidateError
241+
bestM = m + 1
241242
}
242243
}
243244
// Update data with pseudo-residuals

mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,11 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss}
3434
* weak hypotheses used in the final model.
3535
* @param learningRate Learning rate for shrinking the contribution of each estimator. The
3636
* learning rate should be in the interval (0, 1]
37-
* @param validationTol Useful when runWithValidation is used. If the error rate between two
38-
iterations is lesser than the validationTol, then stop. If run
39-
is used, then this parameter is ignored.
40-
41-
a pair of RDD's are supplied to run. If the error rate
42-
* between two iterations is lesser than convergenceTol, then training stops.
37+
* @param validationTol Useful when runWithValidation is used. If the error rate on the
38+
* validation input between two iterations is less than the validationTol
39+
* then stop. Ignored when [[run]] is used.
4340
*/
41+
4442
@Experimental
4543
case class BoostingStrategy(
4644
// Required boosting parameters

mllib/src/test/scala/org/apache/spark/mllib/tree/GradientBoostedTreesSuite.scala

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
160160
}
161161

162162
test("runWithValidation performs better on a validation dataset (Regression)") {
163-
// Set numIterations large enough so that it early stops.
163+
// Set numIterations large enough so that it stops early.
164164
val numIterations = 20
165165
val trainRdd = sc.parallelize(GradientBoostedTreesSuite.trainData, 2)
166166
val validateRdd = sc.parallelize(GradientBoostedTreesSuite.validateData, 2)
@@ -171,9 +171,9 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
171171
val boostingStrategy =
172172
new BoostingStrategy(treeStrategy, error, numIterations, validationTol = 0.0)
173173

174-
val gbtValidate = new GradientBoostedTrees(boostingStrategy).runWithValidation(
175-
trainRdd, validateRdd)
176-
assert(gbtValidate.numTrees != numIterations)
174+
val gbtValidate = new GradientBoostedTrees(boostingStrategy).
175+
runWithValidation(trainRdd, validateRdd)
176+
assert(gbtValidate.numTrees !== numIterations)
177177

178178
val gbt = GradientBoostedTrees.train(trainRdd, boostingStrategy)
179179
val errorWithoutValidation = error.computeError(gbt, validateRdd)
@@ -183,7 +183,7 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
183183
}
184184

185185
test("runWithValidation performs better on a validation dataset (Classification)") {
186-
// Set numIterations large enough so that it early stops.
186+
// Set numIterations large enough so that it stops early.
187187
val numIterations = 20
188188
val trainRdd = sc.parallelize(GradientBoostedTreesSuite.trainData, 2)
189189
val validateRdd = sc.parallelize(GradientBoostedTreesSuite.validateData, 2)
@@ -194,9 +194,9 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
194194
new BoostingStrategy(treeStrategy, LogLoss, numIterations, validationTol = 0.0)
195195

196196
// Test that it stops early.
197-
val gbtValidate = new GradientBoostedTrees(boostingStrategy).runWithValidation(
198-
trainRdd, validateRdd)
199-
assert(gbtValidate.numTrees != numIterations)
197+
val gbtValidate = new GradientBoostedTrees(boostingStrategy).
198+
runWithValidation(trainRdd, validateRdd)
199+
assert(gbtValidate.numTrees !== numIterations)
200200

201201
// Remap labels to {-1, 1}
202202
val remappedInput = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
@@ -213,7 +213,7 @@ class GradientBoostedTreesSuite extends FunSuite with MLlibTestSparkContext {
213213
val errorWithoutValidation = LogLoss.computeError(gbtRegressor, remappedInput)
214214

215215
assert(errorWithValidation < errorWithoutValidation)
216-
}
216+
}
217217

218218
}
219219

0 commit comments

Comments
 (0)