@@ -903,8 +903,11 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
903903 informative than features that occur in a small fraction of the training
904904 corpus.
905905
906- In the SMART notation used in IR, this class implements several tf-idf
907- variants:
906+ The actual formula used for tf-idf is tf * (idf + 1) = tf + tf * idf,
907+ instead of tf * idf. The effect of this is that terms with zero idf, i.e.
908+ that occur in all documents of a training set, will not be entirely
909+ ignored. The formulas used to compute tf and idf depend on parameter
910+ settings that correspond to the SMART notation used in IR, as follows:
908911
909912 Tf is "n" (natural) by default, "l" (logarithmic) when sublinear_tf=True.
910913 Idf is "t" when use_idf is given, "n" (none) otherwise.
@@ -962,7 +965,8 @@ def fit(self, X, y=None):
962965         df += int(self.smooth_idf)
963966         n_samples += int(self.smooth_idf)
964967
965- # avoid division by zeros for features that occur in all documents
968+         # the addition of 1.0 to the idf (rather than plain log) makes
969+         # sure terms with zero idf don't get suppressed entirely
966970         idf = np.log(float(n_samples) / df) + 1.0
967971         self._idf_diag = sp.spdiags(idf,
968972                                     diags=0, m=n_features, n=n_features)
0 commit comments