From 7e1db03753f2a40cb0f525a3b0188a2117bfebb6 Mon Sep 17 00:00:00 2001
From: ankishb
Date: Tue, 24 Dec 2019 16:34:12 +0530
Subject: [PATCH 1/2] Fix default value in doc of SGD

---
 sklearn/linear_model/_stochastic_gradient.py | 83 ++++++++++----------
 1 file changed, 40 insertions(+), 43 deletions(-)

diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py
index eb1e9e7b545e7..dc0ad85bffac3 100644
--- a/sklearn/linear_model/_stochastic_gradient.py
+++ b/sklearn/linear_model/_stochastic_gradient.py
@@ -362,11 +362,11 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
     sample_weight : numpy array of shape [n_samples, ]
         The weight of each sample

-    validation_mask : numpy array of shape [n_samples, ] or None
+    validation_mask : numpy array of shape [n_samples, ], default=None
         Precomputed validation mask in case _fit_binary is called in the
         context of a one-vs-rest reduction.

-    random_state : int, RandomState instance or None, optional (default=None)
+    random_state : int, RandomState instance, default=None
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
         If None, the random number generator is the RandomState instance used
@@ -644,7 +644,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
         y : numpy array, shape (n_samples,)
             Subset of the target values.

-        classes : array, shape (n_classes,)
+        classes : array, shape (n_classes,), default=None
            Classes across all calls to partial_fit.
            Can be obtained via `np.unique(y_all)`, where y_all is the
            target vector of the entire dataset.
            This argument is required for the first call to partial_fit
            and can be omitted in the subsequent calls.
            Note that y doesn't need to contain all labels in `classes`.

-        sample_weight : array-like, shape (n_samples,), optional
+        sample_weight : array-like, shape (n_samples,), default=None
            Weights applied to individual samples.
            If not provided, uniform weights are assumed.

@@ -688,13 +688,13 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
         y : numpy array, shape (n_samples,)
             Target values.

-        coef_init : array, shape (n_classes, n_features)
+        coef_init : array, shape (n_classes, n_features), default=None
             The initial coefficients to warm-start the optimization.

-        intercept_init : array, shape (n_classes,)
+        intercept_init : array, shape (n_classes,), default=None
             The initial intercept to warm-start the optimization.

-        sample_weight : array-like, shape (n_samples,), optional
+        sample_weight : array-like, shape (n_samples,), default=None
             Weights applied to individual samples.
             If not provided, uniform weights are assumed. These weights will
             be multiplied with class_weight (passed through the
@@ -738,7 +738,7 @@ class SGDClassifier(BaseSGDClassifier):

     Parameters
     ----------
-    loss : str, default: 'hinge'
+    loss : str, default='hinge'
         The loss function to be used. Defaults to 'hinge', which gives a
         linear SVM.

         The 'log' loss gives logistic regression, a probabilistic classifier.
         'modified_huber' is another smooth loss that brings tolerance to
         outliers as well as probability estimates.
         'squared_hinge' is like hinge but is quadratically penalized.
         'perceptron' is the linear loss used by the perceptron algorithm.

         The other losses are designed for regression but can be useful in
         classification as well; see SGDRegressor for a description.

-    penalty : str, 'none', 'l2', 'l1', or 'elasticnet'
+    penalty : str, 'none', 'l2', 'l1', or 'elasticnet', default='l2'
         The penalty (aka regularization term) to be used. Defaults to 'l2'
         which is the standard regularizer for linear SVM models. 'l1' and
         'elasticnet' might bring sparsity to the model (feature selection)
         not achievable with 'l2'.
-    alpha : float
+    alpha : float, default=0.0001
         Constant that multiplies the regularization term. Defaults to 0.0001.
         Also used to compute learning_rate when set to 'optimal'.

-    l1_ratio : float
+    l1_ratio : float, default=0.15
         The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
         l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
         Defaults to 0.15.

-    fit_intercept : bool
+    fit_intercept : bool, default=True
         Whether the intercept should be estimated or not. If False, the
         data is assumed to be already centered. Defaults to True.

-    max_iter : int, optional (default=1000)
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         :meth:`partial_fit` method.

         .. versionadded:: 0.19

-    tol : float or None, optional (default=1e-3)
+    tol : float, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
         when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive
         epochs.

         .. versionadded:: 0.19

-    shuffle : bool, optional
+    shuffle : bool, default=True
         Whether or not the training data should be shuffled after each epoch.
-        Defaults to True.

     verbose : int, default=0
         The verbosity level.
@@ -802,21 +801,21 @@ class SGDClassifier(BaseSGDClassifier):
         For epsilon-insensitive, any differences between the current
         prediction and the correct label are ignored if they are less than
         this threshold.

-    n_jobs : int or None, optional (default=None)
+    n_jobs : int, default=None
         The number of CPUs to use to do the OVA (One Versus All, for
         multi-class problems) computation.
         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.

-    random_state : int, RandomState instance or None, optional (default=None)
+    random_state : int, RandomState instance, default=None
         The seed of the pseudo random number generator to use when shuffling
         the data. If int, random_state is the seed used by the random number
         generator; If RandomState instance, random_state is the random number
         generator; If None, the random number generator is the RandomState
         instance used by `np.random`.

-    learning_rate : str, optional
+    learning_rate : str, default='optimal'
         The learning rate schedule:

         'constant':
             eta = eta0
         'optimal':
             eta = 1.0 / (alpha * (t + t0))
             where t0 is chosen by a heuristic proposed by Leon Bottou.
         'invscaling':
             eta = eta0 / pow(t, power_t)
         'adaptive':
             eta = eta0, as long as the training keeps decreasing.
             Each time n_iter_no_change consecutive epochs fail to decrease the
             training loss by tol or fail to increase validation score by tol
             if early_stopping is True, the current learning rate is divided
             by 5.

-    eta0 : double
+    eta0 : double, default=0.0
         The initial learning rate for the 'constant', 'invscaling' or
         'adaptive' schedules. The default value is 0.0 as eta0 is not used by
         the default schedule 'optimal'.

-    power_t : double
+    power_t : double, default=0.5
         The exponent for inverse scaling learning rate [default 0.5].

     early_stopping : bool, default=False
         Whether to use early stopping to terminate training when validation
@@ -861,7 +860,7 @@ class SGDClassifier(BaseSGDClassifier):

         .. versionadded:: 0.20

-    class_weight : dict, {class_label: weight} or "balanced" or None, optional
+    class_weight : dict, {class_label: weight} or "balanced", default=None
         Preset for the class_weight fit parameter.

         Weights associated with classes. If not given, all classes
@@ -1140,7 +1139,7 @@ def partial_fit(self, X, y, sample_weight=None):
         y : numpy array of shape (n_samples,)
             Subset of target values

-        sample_weight : array-like, shape (n_samples,), optional
+        sample_weight : array-like, shape (n_samples,), default=None
            Weights applied to individual samples.
            If not provided, uniform weights are assumed.
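The defaults documented above can be exercised directly. A minimal sketch, not part of the patch, with synthetic data; the keyword values simply restate the SGDClassifier defaults the docstrings now spell out:

    import numpy as np
    from sklearn.linear_model import SGDClassifier

    # Illustrative binary data, not from the patch.
    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)

    # loss='hinge', penalty='l2', alpha=0.0001, max_iter=1000, tol=1e-3
    # and shuffle=True are exactly the defaults documented above.
    clf = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                        max_iter=1000, tol=1e-3, shuffle=True,
                        random_state=0)
    clf.fit(X, y)
    print(clf.coef_.shape)       # (1, n_features) for a binary problem
    print(clf.intercept_.shape)  # (1,)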
@@ -1201,13 +1200,13 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
         y : numpy array, shape (n_samples,)
             Target values

-        coef_init : array, shape (n_features,)
+        coef_init : array, shape (n_features,), default=None
             The initial coefficients to warm-start the optimization.

-        intercept_init : array, shape (1,)
+        intercept_init : array, shape (1,), default=None
             The initial intercept to warm-start the optimization.

-        sample_weight : array-like, shape (n_samples,), optional
+        sample_weight : array-like, shape (n_samples,), default=None
             Weights applied to individual samples (1. for unweighted).

         Returns
@@ -1359,7 +1358,7 @@ class SGDRegressor(BaseSGDRegressor):

     Parameters
     ----------
-    loss : str, default: 'squared_loss'
+    loss : str, default='squared_loss'
         The loss function to be used. The possible values are 'squared_loss',
         'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'

@@ -1371,42 +1370,40 @@ class SGDRegressor(BaseSGDRegressor):
         The 'squared_loss' refers to the ordinary least squares fit.
         'huber' modifies 'squared_loss' to focus less on getting outliers
         correct by switching from squared to linear loss past a distance of
         epsilon. 'epsilon_insensitive' ignores errors less than epsilon and is
         linear past that; this is the loss function used in SVR.
         'squared_epsilon_insensitive' is the same but becomes squared loss
         past a tolerance of epsilon.

-    penalty : str, 'none', 'l2', 'l1', or 'elasticnet'
+    penalty : str, 'none', 'l2', 'l1', or 'elasticnet', default='l2'
         The penalty (aka regularization term) to be used. Defaults to 'l2'
         which is the standard regularizer for linear SVM models. 'l1' and
         'elasticnet' might bring sparsity to the model (feature selection)
         not achievable with 'l2'.

-    alpha : float
-        Constant that multiplies the regularization term. Defaults to 0.0001
+    alpha : float, default=0.0001
+        Constant that multiplies the regularization term.
         Also used to compute learning_rate when set to 'optimal'.

-    l1_ratio : float
+    l1_ratio : float, default=0.15
         The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
         l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
-        Defaults to 0.15.

-    fit_intercept : bool
+    fit_intercept : bool, default=True
         Whether the intercept should be estimated or not. If False, the
-        data is assumed to be already centered. Defaults to True.
+        data is assumed to be already centered.

-    max_iter : int, optional (default=1000)
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         :meth:`partial_fit` method.

         .. versionadded:: 0.19

-    tol : float or None, optional (default=1e-3)
+    tol : float, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
         when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive
         epochs.

         .. versionadded:: 0.19

-    shuffle : bool, optional
+    shuffle : bool, default=True
         Whether or not the training data should be shuffled after each epoch.
-        Defaults to True.

     verbose : integer, default=0
         The verbosity level.
@@ -1419,14 +1416,14 @@ class SGDRegressor(BaseSGDRegressor):
         For epsilon-insensitive, any differences between the current
         prediction and the correct label are ignored if they are less than
         this threshold.

-    random_state : int, RandomState instance or None, optional (default=None)
+    random_state : int, RandomState instance, default=None
         The seed of the pseudo random number generator to use when shuffling
         the data. If int, random_state is the seed used by the random number
         generator; If RandomState instance, random_state is the random number
         generator; If None, the random number generator is the RandomState
         instance used by `np.random`.
-    learning_rate : string, optional
+    learning_rate : string, default='invscaling'
         The learning rate schedule:

         'constant':
             eta = eta0
         'optimal':
             eta = 1.0 / (alpha * (t + t0))
             where t0 is chosen by a heuristic proposed by Leon Bottou.
         'invscaling':
             eta = eta0 / pow(t, power_t)
         'adaptive':
             eta = eta0, as long as the training keeps decreasing.
             Each time n_iter_no_change consecutive epochs fail to decrease the
@@ -1442,12 +1439,12 @@ class SGDRegressor(BaseSGDRegressor):
             training loss by tol or fail to increase validation score by tol
             if early_stopping is True, the current learning rate is divided
             by 5.

-    eta0 : double
+    eta0 : double, default=0.01
         The initial learning rate for the 'constant', 'invscaling' or
         'adaptive' schedules. The default value is 0.01.

-    power_t : double
-        The exponent for inverse scaling learning rate [default 0.25].
+    power_t : double, default=0.25
+        The exponent for inverse scaling learning rate.

     early_stopping : bool, default=False
         Whether to use early stopping to terminate training when validation

From 459ee56fafe6307c378dce20887e3cf85a82793a Mon Sep 17 00:00:00 2001
From: ankishb
Date: Tue, 24 Dec 2019 17:07:41 +0530
Subject: [PATCH 2/2] Fix default value in sgd in master branch

---
 sklearn/linear_model/_stochastic_gradient.py | 40 ++++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py
index eb1e9e7b545e7..0cd3c9e076dab 100644
--- a/sklearn/linear_model/_stochastic_gradient.py
+++ b/sklearn/linear_model/_stochastic_gradient.py
@@ -244,12 +244,12 @@ def _make_validation_split(self, y):

         Parameters
         ----------
-        y : array, shape (n_samples, )
+        y : ndarray of shape (n_samples, )
             Target values.

         Returns
         -------
-        validation_mask : array, shape (n_samples, )
+        validation_mask : ndarray of shape (n_samples, )
             Equal to 1 on the validation set, 0 on the training set.
         """
         n_samples = y.shape[0]
@@ -641,10 +641,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Subset of the training data.

-        y : numpy array, shape (n_samples,)
+        y : ndarray of shape (n_samples,)
             Subset of the target values.

-        classes : array, shape (n_classes,)
+        classes : ndarray of shape (n_classes,)
             Classes across all calls to partial_fit.
             Can be obtained via `np.unique(y_all)`, where y_all is the
             target vector of the entire dataset.
@@ -685,13 +685,13 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Training data.

-        y : numpy array, shape (n_samples,)
+        y : ndarray of shape (n_samples,)
             Target values.

-        coef_init : array, shape (n_classes, n_features)
+        coef_init : ndarray of shape (n_classes, n_features)
             The initial coefficients to warm-start the optimization.

-        intercept_init : array, shape (n_classes,)
+        intercept_init : ndarray of shape (n_classes,)
             The initial intercept to warm-start the optimization.

         sample_weight : array-like, shape (n_samples,), optional
@@ -893,11 +893,11 @@ class SGDClassifier(BaseSGDClassifier):

     Attributes
     ----------
-    coef_ : array, shape (1, n_features) if n_classes == 2 else (n_classes,\
-            n_features)
+    coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
+            (n_classes, n_features)
         Weights assigned to the features.

-    intercept_ : array, shape (1,) if n_classes == 2 else (n_classes,)
+    intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
         Constants in decision function.

     n_iter_ : int
@@ -979,7 +979,7 @@ def predict_proba(self):

         Returns
         -------
-        array, shape (n_samples, n_classes)
+        ndarray of shape (n_samples, n_classes)
             Returns the probability of the sample for each class in the
             model, where classes are ordered as they are in `self.classes_`.
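The `classes` contract documented above (required on the first `partial_fit` call, omissible afterwards) and the `predict_proba` return shape can be checked with a short sketch, not part of the patch, on synthetic data; `loss='log'` is the probabilistic loss of this release (spelled 'log_loss' in later scikit-learn versions):

    import numpy as np
    from sklearn.linear_model import SGDClassifier

    rng = np.random.RandomState(0)
    X = rng.randn(300, 4)
    y = rng.randint(0, 3, size=300)  # three classes

    clf = SGDClassifier(loss='log', random_state=0)
    classes = np.unique(y)  # must cover every label seen across calls
    for start in range(0, 300, 100):
        # `classes` is required on the first call; later calls may omit it.
        clf.partial_fit(X[start:start + 100], y[start:start + 100],
                        classes=classes)

    proba = clf.predict_proba(X[:2])
    print(proba.shape)  # (n_samples, n_classes), ordered as clf.classes_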
@@ -1198,13 +1198,13 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Training data

-        y : numpy array, shape (n_samples,)
+        y : ndarray of shape (n_samples,)
             Target values

-        coef_init : array, shape (n_features,)
+        coef_init : ndarray of shape (n_features,)
             The initial coefficients to warm-start the optimization.

-        intercept_init : array, shape (1,)
+        intercept_init : ndarray of shape (1,)
             The initial intercept to warm-start the optimization.

         sample_weight : array-like, shape (n_samples,), optional
@@ -1229,7 +1229,7 @@ def _decision_function(self, X):

         Returns
         -------
-        array, shape (n_samples,)
+        ndarray of shape (n_samples,)
             Predicted target values per element in X.
         """
         check_is_fitted(self)
@@ -1249,7 +1249,7 @@ def predict(self, X):

         Returns
         -------
-        array, shape (n_samples,)
+        ndarray of shape (n_samples,)
             Predicted target values per element in X.
         """
         return self._decision_function(X)
@@ -1492,16 +1492,16 @@ class SGDRegressor(BaseSGDRegressor):

     Attributes
     ----------
-    coef_ : array, shape (n_features,)
+    coef_ : ndarray of shape (n_features,)
         Weights assigned to the features.

-    intercept_ : array, shape (1,)
+    intercept_ : ndarray of shape (1,)
         The intercept term.

-    average_coef_ : array, shape (n_features,)
+    average_coef_ : ndarray of shape (n_features,)
         Averaged weights assigned to the features.

-    average_intercept_ : array, shape (1,)
+    average_intercept_ : ndarray of shape (1,)
         The averaged intercept term.

     n_iter_ : int
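A closing sketch, not part of the patch, tying together the SGDRegressor defaults and attributes documented above. Synthetic data; the keyword values restate the documented defaults, and `average=True` is assumed to be what populates the `average_coef_` / `average_intercept_` attributes:

    import numpy as np
    from sklearn.linear_model import SGDRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(200, 4)
    y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + 0.1 * rng.randn(200)

    # learning_rate='invscaling', eta0=0.01 and power_t=0.25 restate the
    # documented regressor defaults.
    reg = SGDRegressor(learning_rate='invscaling', eta0=0.01, power_t=0.25,
                       average=True, random_state=0)
    reg.fit(X, y)

    print(reg.coef_.shape)          # (n_features,)
    print(reg.intercept_.shape)     # (1,)
    print(reg.average_coef_.shape)  # (n_features,); set because average=True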