address @amueller comments · scikit-learn/scikit-learn@3a3a230 · GitHub

Commit 3a3a230

address @amueller comments
1 parent ada333d commit 3a3a230

5 files changed: +40 additions, −25 deletions

doc/modules/linear_model.rst

Lines changed: 4 additions & 4 deletions
@@ -909,7 +909,7 @@ in these settings.
 
 * :ref:`HuberRegressor <huber_regression>` should be faster than
   :ref:`RANSAC <ransac_regression>` and :ref:`Theil Sen <theil_sen_regression>`
-  unless the number of samples are very large, i.e n_samples >> n_features.
+  unless the number of samples are very large, i.e ``n_samples`` >> ``n_features``.
   This is because :ref:`RANSAC <ransac_regression>` and :ref:`Theil Sen <theil_sen_regression>`
   fit on smaller subsets of the data. However, both :ref:`Theil Sen <theil_sen_regression>`
   and :ref:`RANSAC <ransac_regression>` are unlikely to be as robust as
@@ -1064,8 +1064,8 @@ considering only a random subset of all possible combinations.
 Huber Regression
 ----------------
 
-The :class:`HuberRegressor` is similar to :class:`Ridge` that it also applies
-L2 regularization and a squared loss for samples which are classified as inliers.
+The :class:`HuberRegressor` is different to :class:`Ridge` because it applies a
+linear loss to samples that are classified as outliers.
 A sample is classified as an inlier if the absolute error of that sample is
 lesser than a certain threshold. It differs from :class:`TheilSenRegressor`
 and :class:`RANSACRegressor` because it does not ignore the effect of the outliers
@@ -1100,7 +1100,7 @@ in the following ways.
 
 - :class:`HuberRegressor` is scaling invariant. Once ``epsilon`` is set, scaling ``X`` and ``y``
   down or up by different values would produce the same robustness to outliers as before.
-  as compared to :class:`SGDRegessor` where `epsilon` has to be set again when ``X`` and ``y`` are
+  as compared to :class:`SGDRegressor` where ``epsilon`` has to be set again when ``X`` and ``y`` are
   scaled.
 
 - :class:`HuberRegressor` should be more efficient to use on data with small number of
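
(Not part of the commit.) As a quick illustration of the documented behaviour, a minimal sketch comparing HuberRegressor against Ridge when a few targets are grossly corrupted; the dataset and corruption values are arbitrary:

# Illustrative sketch only: a handful of corrupted targets should barely
# move HuberRegressor's coefficients, while Ridge's squared loss lets
# the outliers drag its fit away.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import HuberRegressor, Ridge

rng = np.random.RandomState(0)
X, y, coef = make_regression(n_samples=200, n_features=2, noise=4.0,
                             coef=True, random_state=0)
y[:4] += 500 * rng.normal(size=4)  # corrupt four targets

huber = HuberRegressor(epsilon=1.35, alpha=0.0001).fit(X, y)
ridge = Ridge(alpha=0.0001).fit(X, y)
print(coef, huber.coef_, ridge.coef_)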

examples/linear_model/plot_robust_fit.py

Lines changed: 3 additions & 2 deletions
@@ -24,8 +24,9 @@
 - TheilSen is good for small outliers, both in direction X and y, but has
   a break point above which it performs worse than OLS.
 
-- HuberRegressor should not differ much in performance to both RANSAC
-  and TheilSen due to outliers in both X and y direction, since it checks if
+- HuberRegressor may not be compared directly to both TheilSen and RANSAC
+  because it does not attempt to completely filter the outliers but
+  lessen their effect. The higher the deviance of the outliers
   the mean absolute error is lesser than a certain threshold.
 
 """

sklearn/linear_model/huber.py

Lines changed: 18 additions & 15 deletions
@@ -18,33 +18,33 @@ def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None):
 
     Parameters
     ----------
-    w: ndarray, shape (n_features + 1,) or (n_features + 2,)
+    w : ndarray, shape (n_features + 1,) or (n_features + 2,)
         Feature vector.
-        w[:n_features] gives the feature vector
+        w[:n_features] gives the coefficients
         w[-1] gives the scale factor and if the intercept is fit w[-2]
         gives the intercept factor.
 
-    X: ndarray, shape (n_samples, n_features)
+    X : ndarray, shape (n_samples, n_features)
         Input data.
 
-    y: ndarray, shape (n_samples,)
+    y : ndarray, shape (n_samples,)
         Target vector.
 
-    epsilon: float
+    epsilon : float
        Robustness of the Huber estimator.
 
-    alpha: float
+    alpha : float
        Regularization parameter.
 
-    sample_weight: ndarray, shape (n_samples,), optional
+    sample_weight : ndarray, shape (n_samples,), optional
        Weight assigned to each sample.
 
    Returns
    -------
    loss: float
        Huber loss.
 
-    gradient: ndarray, shape (n_features + 1,) or (n_features + 2,)
+    gradient: ndarray, shape (len(w))
        Returns the derivative of the Huber loss with respect to each
        coefficient, intercept and the scale as a vector.
    """
@@ -129,8 +129,9 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator):
     ``|(y - X'w) / sigma| < epsilon`` and the absolute loss for the samples
     where ``|(y - X'w) / sigma| > epsilon``, where w and sigma are parameters
     to be optimized. The parameter sigma makes sure that if y is scaled up
-    or down by a certain factor, one does not need to rescale epsilon to acheive
-    the same robustness.
+    or down by a certain factor, one does not need to rescale epsilon to
+    achieve the same robustness. Note that this does not take into account
+    the fact that the different features of X may be of different scales.
 
     This makes sure that the loss function is not heavily influenced by the
     outliers while not completely ignoring their effect.
@@ -141,11 +142,12 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator):
     ----------
     epsilon : float, greater than 1.0, default 1.35
         The parameter epsilon controls the number of samples that should be
-        classified as outliers. The lesser the epsilon, the more robust it is
+        classified as outliers. The smaller the epsilon, the more robust it is
         to outliers.
 
     max_iter : int, default 100
-        Number of iterations that scipy.optimize.fmin_l_bfgs_b should run for.
+        Maximum number of iterations that scipy.optimize.fmin_l_bfgs_b
+        should run for.
 
     alpha : float, default 0.0001
         Regularization parameter.
@@ -174,7 +176,7 @@ class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator):
     scale_ : float
         The value by which ``|y - X'w - c|`` is scaled down.
 
-    n_iter_: int
+    n_iter_ : int
        Number of iterations that fmin_l_bfgs_b has run for.
        Not available if SciPy version is 0.9 and below.
 
@@ -207,7 +209,7 @@ def fit(self, X, y, sample_weight=None):
         y : array-like, shape (n_samples,)
             Target vector relative to X.
 
-        sample_weight: array-like, shape (n_samples,)
+        sample_weight : array-like, shape (n_samples,)
            Weight given to each sample.
 
        Returns
@@ -225,7 +227,8 @@ def fit(self, X, y, sample_weight=None):
 
         if self.epsilon < 1.0:
             raise ValueError(
-                "epsilon should be greater than 1.0, got %f" % self.epsilon)
+                "epsilon should be greater than or equal to 1.0, got %f"
+                % self.epsilon)
 
         if self.warm_start and hasattr(self, 'coef_'):
             parameters = np.concatenate(
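
(Not part of the commit.) For reference, a hedged sketch of the objective that _huber_loss_and_gradient minimizes, reconstructed from the docstring above; it is simplified to omit sample weights, the intercept, and the gradient, so treat it as a reading aid rather than the implementation:

# Huber objective with concomitant scale: sigma is optimized jointly
# with the coefficients, so rescaling y merely rescales sigma and the
# same epsilon keeps working.
import numpy as np

def huber_objective(w, X, y, epsilon=1.35, alpha=0.0001):
    coef, sigma = w[:-1], w[-1]
    residual = np.abs(y - X.dot(coef))
    outliers = residual > epsilon * sigma
    squared_loss = np.sum(residual[~outliers] ** 2) / sigma
    linear_loss = (2 * epsilon * np.sum(residual[outliers])
                   - sigma * np.count_nonzero(outliers) * epsilon ** 2)
    return (X.shape[0] * sigma + squared_loss + linear_loss
            + alpha * np.dot(coef, coef))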

sklearn/linear_model/tests/test_huber.py

Lines changed: 14 additions & 3 deletions
@@ -83,14 +83,24 @@ def test_huber_sample_weights():
     assert_array_almost_equal(huber.coef_, huber_coef, 3)
     assert_array_almost_equal(huber.intercept_, huber_intercept, 3)
 
-    # Test sparse implementation with sparse weights.
-    # Checking sparse=non_sparse should be covered in the common tests.
+    # Test sparse implementation with sample weights.
     X_csr = sparse.csr_matrix(X)
     huber_sparse = HuberRegressor(fit_intercept=True, alpha=0.1)
     huber_sparse.fit(X_csr, y, sample_weight=[1, 3, 1, 2, 1])
     assert_array_almost_equal(huber_sparse.coef_, huber_coef, 3)
 
 
+def test_huber_sparse():
+    X, y = make_regression_with_outliers()
+    huber = HuberRegressor(fit_intercept=True, alpha=0.1)
+    huber.fit(X, y)
+
+    X_csr = sparse.csr_matrix(X)
+    huber_sparse = HuberRegressor(fit_intercept=True, alpha=0.1)
+    huber_sparse.fit(X_csr, y)
+    assert_array_almost_equal(huber_sparse.coef_, huber.coef_)
+
+
 def return_outliers(X, y, huber):
     """Return the number of outliers."""
     return np.abs(huber.predict(X) - y) > huber.epsilon * huber.scale_
@@ -165,6 +175,8 @@ def test_huber_better_r2_score():
     huber_score = huber.score(X[mask], y[mask])
     huber_outlier_score = huber.score(X[~mask], y[~mask])
 
+    # The Ridge regressor should be influenced by the outliers and hence
+    # give a worse score on the non-outliers as compared to the huber regressor.
     ridge = Ridge(fit_intercept=True, alpha=0.01)
     ridge.fit(X, y)
     ridge_score = ridge.score(X[mask], y[mask])
@@ -173,4 +185,3 @@ def test_huber_better_r2_score():
 
     # The huber model should also fit poorly on the outliers.
     assert_greater(ridge_outlier_score, huber_outlier_score)
-
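
(Not part of the commit.) A usage sketch mirroring the new test_huber_sparse; make_regression stands in for the test module's make_regression_with_outliers helper:

# Dense and CSR inputs should produce (nearly) identical coefficients.
import numpy as np
from scipy import sparse
from sklearn.datasets import make_regression
from sklearn.linear_model import HuberRegressor

X, y = make_regression(n_samples=50, n_features=20, random_state=0)
dense = HuberRegressor(fit_intercept=True, alpha=0.1).fit(X, y)
csr = HuberRegressor(fit_intercept=True, alpha=0.1).fit(
    sparse.csr_matrix(X), y)
print(np.allclose(dense.coef_, csr.coef_, atol=1e-6))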
sklearn/utils/estimator_checks.py

Lines changed: 1 addition & 1 deletion
@@ -1488,7 +1488,7 @@ def check_non_transformer_estimators_n_iter(name, estimator,
     estimator.fit(X, y_)
 
     # HuberRegressor depends on scipy.optimize.fmin_l_bfgs_b
-    # which does return a n_iter for old versions of SciPy.
+    # which doesn't return a n_iter for old versions of SciPy.
     if not (name == 'HuberRegressor' and estimator.n_iter_ is None):
         assert_greater(estimator.n_iter_, 0)
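
(Not part of the commit.) The corrected comment refers to the info dict returned by fmin_l_bfgs_b; a small sketch of where n_iter_ comes from, with an arbitrary quadratic as the objective:

# info['nit'] only exists in newer SciPy versions, hence n_iter_ can be
# None when running on old SciPy.
import numpy as np
from scipy.optimize import fmin_l_bfgs_b

def f(w):
    return np.sum((w - 3.0) ** 2)

def fprime(w):
    return 2.0 * (w - 3.0)

w_opt, f_min, info = fmin_l_bfgs_b(f, np.zeros(2), fprime=fprime)
print(w_opt, info.get('nit'))  # .get() guards against the missing key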

0 commit comments
