DOC Improve default values in SGD documentation (#15967) · scikit-learn/scikit-learn@3339d80

Commit 3339d80

ankishbrth authored and committed

DOC Improve default values in SGD documentation (#15967)

1 parent 725ca8f commit 3339d80

File tree: 1 file changed, +56 −59 lines changed


sklearn/linear_model/_stochastic_gradient.py

+56 −59
@@ -244,12 +244,12 @@ def _make_validation_split(self, y):
 
         Parameters
         ----------
-        y : array, shape (n_samples, )
+        y : ndarray of shape (n_samples, )
             Target values.
 
         Returns
         -------
-        validation_mask : array, shape (n_samples, )
+        validation_mask : ndarray of shape (n_samples, )
             Equal to 1 on the validation set, 0 on the training set.
         """
         n_samples = y.shape[0]
@@ -362,11 +362,11 @@ def fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter,
     sample_weight : numpy array of shape [n_samples, ]
         The weight of each sample
 
-    validation_mask : numpy array of shape [n_samples, ] or None
+    validation_mask : numpy array of shape [n_samples, ], default=None
        Precomputed validation mask in case _fit_binary is called in the
        context of a one-vs-rest reduction.
 
-    random_state : int, RandomState instance or None, optional (default=None)
+    random_state : int, RandomState instance, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
@@ -641,18 +641,18 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Subset of the training data.
 
-        y : numpy array, shape (n_samples,)
+        y : ndarray of shape (n_samples,)
             Subset of the target values.
 
-        classes : array, shape (n_classes,)
+        classes : ndarray of shape (n_classes,), default=None
             Classes across all calls to partial_fit.
             Can be obtained by via `np.unique(y_all)`, where y_all is the
             target vector of the entire dataset.
             This argument is required for the first call to partial_fit
             and can be omitted in the subsequent calls.
             Note that y doesn't need to contain all labels in `classes`.
 
-        sample_weight : array-like, shape (n_samples,), optional
+        sample_weight : array-like, shape (n_samples,), default=None
             Weights applied to individual samples.
             If not provided, uniform weights are assumed.
 
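
Reviewer note: the partial_fit contract documented here (classes required on the first call, optional afterwards) can be exercised as below. A minimal sketch with made-up toy data, not part of this commit:

import numpy as np
from sklearn.linear_model import SGDClassifier

# Two hypothetical mini-batches, just for illustration.
X1, y1 = np.array([[0., 0.], [1., 1.]]), np.array([0, 1])
X2, y2 = np.array([[2., 2.], [3., 3.]]), np.array([1, 0])

clf = SGDClassifier(random_state=0)
# The first call must list every class, even if this batch lacks some of them.
clf.partial_fit(X1, y1, classes=np.array([0, 1]))
# Later calls may omit `classes`.
clf.partial_fit(X2, y2)
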
@@ -685,16 +685,16 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Training data.
 
-        y : numpy array, shape (n_samples,)
+        y : ndarray of shape (n_samples,)
             Target values.
 
-        coef_init : array, shape (n_classes, n_features)
+        coef_init : ndarray of shape (n_classes, n_features), default=None
             The initial coefficients to warm-start the optimization.
 
-        intercept_init : array, shape (n_classes,)
+        intercept_init : ndarray of shape (n_classes,), default=None
             The initial intercept to warm-start the optimization.
 
-        sample_weight : array-like, shape (n_samples,), optional
+        sample_weight : array-like, shape (n_samples,), default=None
             Weights applied to individual samples.
             If not provided, uniform weights are assumed. These weights will
             be multiplied with class_weight (passed through the
@@ -738,7 +738,7 @@ class SGDClassifier(BaseSGDClassifier):
 
     Parameters
     ----------
-    loss : str, default: 'hinge'
+    loss : str, default='hinge'
         The loss function to be used. Defaults to 'hinge', which gives a
         linear SVM.
 
@@ -754,42 +754,41 @@ class SGDClassifier(BaseSGDClassifier):
         The other losses are designed for regression but can be useful in
         classification as well; see SGDRegressor for a description.
 
-    penalty : str, 'none', 'l2', 'l1', or 'elasticnet'
+    penalty : {'l2', 'l1', 'elasticnet'}, default='l2'
         The penalty (aka regularization term) to be used. Defaults to 'l2'
         which is the standard regularizer for linear SVM models. 'l1' and
         'elasticnet' might bring sparsity to the model (feature selection)
         not achievable with 'l2'.
 
-    alpha : float
+    alpha : float, default=0.0001
         Constant that multiplies the regularization term. Defaults to 0.0001.
         Also used to compute learning_rate when set to 'optimal'.
 
-    l1_ratio : float
+    l1_ratio : float, default=0.15
         The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
         l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
         Defaults to 0.15.
 
-    fit_intercept : bool
+    fit_intercept : bool, default=True
         Whether the intercept should be estimated or not. If False, the
         data is assumed to be already centered. Defaults to True.
 
-    max_iter : int, optional (default=1000)
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         :meth:`partial_fit` method.
 
         .. versionadded:: 0.19
 
-    tol : float or None, optional (default=1e-3)
+    tol : float, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
         when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive
         epochs.
 
         .. versionadded:: 0.19
 
-    shuffle : bool, optional
+    shuffle : bool, default=True
         Whether or not the training data should be shuffled after each epoch.
-        Defaults to True.
 
     verbose : int, default=0
         The verbosity level.
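
Reviewer note: with the defaults now spelled out in the docstring, the following two constructions should be equivalent. A sketch for illustration only:

from sklearn.linear_model import SGDClassifier

# Relying on the documented defaults ...
clf_default = SGDClassifier()
# ... is the same as writing them out explicitly.
clf_explicit = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                             l1_ratio=0.15, fit_intercept=True,
                             max_iter=1000, tol=1e-3, shuffle=True)
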
@@ -802,21 +801,21 @@ class SGDClassifier(BaseSGDClassifier):
         For epsilon-insensitive, any differences between the current prediction
         and the correct label are ignored if they are less than this threshold.
 
-    n_jobs : int or None, optional (default=None)
+    n_jobs : int, default=None
         The number of CPUs to use to do the OVA (One Versus All, for
         multi-class problems) computation.
         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
         for more details.
 
-    random_state : int, RandomState instance or None, optional (default=None)
+    random_state : int, RandomState instance, default=None
         The seed of the pseudo random number generator to use when shuffling
         the data. If int, random_state is the seed used by the random number
         generator; If RandomState instance, random_state is the random number
         generator; If None, the random number generator is the RandomState
         instance used by `np.random`.
 
-    learning_rate : str, optional
+    learning_rate : str, default='optimal'
         The learning rate schedule:
 
         'constant':
@@ -832,12 +831,12 @@ class SGDClassifier(BaseSGDClassifier):
         training loss by tol or fail to increase validation score by tol if
         early_stopping is True, the current learning rate is divided by 5.
 
-    eta0 : double
+    eta0 : double, default=0.0
         The initial learning rate for the 'constant', 'invscaling' or
         'adaptive' schedules. The default value is 0.0 as eta0 is not used by
         the default schedule 'optimal'.
 
-    power_t : double
+    power_t : double, default=0.5
         The exponent for inverse scaling learning rate [default 0.5].
 
     early_stopping : bool, default=False
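
Reviewer note: eta0 and power_t only come into play once the schedule is changed away from the classifier's default 'optimal'. A hedged sketch:

from sklearn.linear_model import SGDClassifier

# The default 'optimal' schedule ignores eta0, hence the 0.0 default.
clf_optimal = SGDClassifier(learning_rate='optimal')
# 'constant', 'invscaling' and 'adaptive' need a positive eta0;
# power_t is used only by 'invscaling'.
clf_invscaling = SGDClassifier(learning_rate='invscaling', eta0=0.01, power_t=0.5)
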
@@ -861,7 +860,7 @@ class SGDClassifier(BaseSGDClassifier):
 
         .. versionadded:: 0.20
 
-    class_weight : dict, {class_label: weight} or "balanced" or None, optional
+    class_weight : dict, {class_label: weight} or "balanced", default=None
         Preset for the class_weight fit parameter.
 
         Weights associated with classes. If not given, all classes
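
Reviewer note: a short usage sketch for the class_weight presets described above; the weights are illustrative values only:

from sklearn.linear_model import SGDClassifier

# Explicit per-class weights ...
clf_weighted = SGDClassifier(class_weight={0: 1.0, 1: 5.0})
# ... or weights inversely proportional to class frequencies.
clf_balanced = SGDClassifier(class_weight='balanced')
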
@@ -893,11 +892,11 @@ class SGDClassifier(BaseSGDClassifier):
 
     Attributes
     ----------
-    coef_ : array, shape (1, n_features) if n_classes == 2 else (n_classes,\
-            n_features)
+    coef_ : ndarray of shape (1, n_features) if n_classes == 2 else \
+            (n_classes, n_features)
         Weights assigned to the features.
 
-    intercept_ : array, shape (1,) if n_classes == 2 else (n_classes,)
+    intercept_ : ndarray of shape (1,) if n_classes == 2 else (n_classes,)
         Constants in decision function.
 
     n_iter_ : int
@@ -979,7 +978,7 @@ def predict_proba(self):
 
         Returns
         -------
-        array, shape (n_samples, n_classes)
+        ndarray of shape (n_samples, n_classes)
             Returns the probability of the sample for each class in the model,
             where classes are ordered as they are in `self.classes_`.
 
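
Reviewer note: the documented return shape can be checked directly; predict_proba is only available for probabilistic losses such as 'log' or 'modified_huber'. A sketch with toy data:

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.]])
y = np.array([0, 0, 1, 1])

clf = SGDClassifier(loss='log', random_state=0).fit(X, y)
print(clf.predict_proba(X).shape)  # (n_samples, n_classes) -> (4, 2)
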
@@ -1140,7 +1139,7 @@ def partial_fit(self, X, y, sample_weight=None):
         y : numpy array of shape (n_samples,)
             Subset of target values
 
-        sample_weight : array-like, shape (n_samples,), optional
+        sample_weight : array-like, shape (n_samples,), default=None
             Weights applied to individual samples.
             If not provided, uniform weights are assumed.
 
@@ -1198,16 +1197,16 @@ def fit(self, X, y, coef_init=None, intercept_init=None,
         X : {array-like, sparse matrix}, shape (n_samples, n_features)
             Training data
 
-        y : numpy array, shape (n_samples,)
+        y : ndarray of shape (n_samples,)
             Target values
 
-        coef_init : array, shape (n_features,)
+        coef_init : ndarray of shape (n_features,), default=None
             The initial coefficients to warm-start the optimization.
 
-        intercept_init : array, shape (1,)
+        intercept_init : ndarray of shape (1,), default=None
             The initial intercept to warm-start the optimization.
 
-        sample_weight : array-like, shape (n_samples,), optional
+        sample_weight : array-like, shape (n_samples,), default=None
             Weights applied to individual samples (1. for unweighted).
 
         Returns
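
Reviewer note: coef_init and intercept_init warm-start the optimizer from an earlier solution. A hedged sketch with synthetic data:

import numpy as np
from sklearn.linear_model import SGDRegressor

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = X @ np.array([1.0, 2.0, 3.0]) + 0.5

reg = SGDRegressor(max_iter=1000, tol=1e-3).fit(X, y)
# Refit, starting from the previously learned coefficients.
reg2 = SGDRegressor(max_iter=1000, tol=1e-3)
reg2.fit(X, y, coef_init=reg.coef_, intercept_init=reg.intercept_)
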
@@ -1229,7 +1228,7 @@ def _decision_function(self, X):
 
         Returns
         -------
-        array, shape (n_samples,)
+        ndarray of shape (n_samples,)
             Predicted target values per element in X.
         """
         check_is_fitted(self)
@@ -1249,7 +1248,7 @@ def predict(self, X):
 
         Returns
         -------
-        array, shape (n_samples,)
+        ndarray of shape (n_samples,)
             Predicted target values per element in X.
         """
         return self._decision_function(X)
@@ -1359,7 +1358,7 @@ class SGDRegressor(BaseSGDRegressor):
 
     Parameters
     ----------
-    loss : str, default: 'squared_loss'
+    loss : str, default='squared_loss'
         The loss function to be used. The possible values are 'squared_loss',
         'huber', 'epsilon_insensitive', or 'squared_epsilon_insensitive'
 
@@ -1371,44 +1370,42 @@ class SGDRegressor(BaseSGDRegressor):
         'squared_epsilon_insensitive' is the same but becomes squared loss past
         a tolerance of epsilon.
 
-    penalty : str, 'none', 'l2', 'l1', or 'elasticnet'
+    penalty : {'l2', 'l1', 'elasticnet'}, default='l2'
         The penalty (aka regularization term) to be used. Defaults to 'l2'
         which is the standard regularizer for linear SVM models. 'l1' and
         'elasticnet' might bring sparsity to the model (feature selection)
         not achievable with 'l2'.
 
-    alpha : float
-        Constant that multiplies the regularization term. Defaults to 0.0001
+    alpha : float, default=0.0001
+        Constant that multiplies the regularization term.
         Also used to compute learning_rate when set to 'optimal'.
 
-    l1_ratio : float
+    l1_ratio : float, default=0.15
         The Elastic Net mixing parameter, with 0 <= l1_ratio <= 1.
         l1_ratio=0 corresponds to L2 penalty, l1_ratio=1 to L1.
-        Defaults to 0.15.
 
-    fit_intercept : bool
+    fit_intercept : bool, default=True
         Whether the intercept should be estimated or not. If False, the
-        data is assumed to be already centered. Defaults to True.
+        data is assumed to be already centered.
 
-    max_iter : int, optional (default=1000)
+    max_iter : int, default=1000
         The maximum number of passes over the training data (aka epochs).
         It only impacts the behavior in the ``fit`` method, and not the
         :meth:`partial_fit` method.
 
         .. versionadded:: 0.19
 
-    tol : float or None, optional (default=1e-3)
+    tol : float, default=1e-3
         The stopping criterion. If it is not None, the iterations will stop
         when (loss > best_loss - tol) for ``n_iter_no_change`` consecutive
         epochs.
 
         .. versionadded:: 0.19
 
-    shuffle : bool, optional
+    shuffle : bool, default=True
         Whether or not the training data should be shuffled after each epoch.
-        Defaults to True.
 
-    verbose : integer, default=0
+    verbose : int, default=0
         The verbosity level.
 
     epsilon : float, default=0.1
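
Reviewer note: mirroring the classifier example above, the regressor defaults named in this hunk can be written out explicitly. A sketch for illustration only:

from sklearn.linear_model import SGDRegressor

reg_default = SGDRegressor()
reg_explicit = SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001,
                            l1_ratio=0.15, fit_intercept=True,
                            max_iter=1000, tol=1e-3, shuffle=True, verbose=0)
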
@@ -1419,14 +1416,14 @@ class SGDRegressor(BaseSGDRegressor):
         For epsilon-insensitive, any differences between the current prediction
         and the correct label are ignored if they are less than this threshold.
 
-    random_state : int, RandomState instance or None, optional (default=None)
+    random_state : int, RandomState instance, default=None
         The seed of the pseudo random number generator to use when shuffling
         the data. If int, random_state is the seed used by the random number
         generator; If RandomState instance, random_state is the random number
         generator; If None, the random number generator is the RandomState
         instance used by `np.random`.
 
-    learning_rate : string, optional
+    learning_rate : string, default='invscaling'
         The learning rate schedule:
 
         'constant':
@@ -1442,12 +1439,12 @@ class SGDRegressor(BaseSGDRegressor):
         training loss by tol or fail to increase validation score by tol if
         early_stopping is True, the current learning rate is divided by 5.
 
-    eta0 : double
+    eta0 : double, default=0.01
         The initial learning rate for the 'constant', 'invscaling' or
         'adaptive' schedules. The default value is 0.01.
 
-    power_t : double
-        The exponent for inverse scaling learning rate [default 0.25].
+    power_t : double, default=0.25
+        The exponent for inverse scaling learning rate.
 
     early_stopping : bool, default=False
         Whether to use early stopping to terminate training when validation
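
Reviewer note: unlike the classifier, SGDRegressor defaults to the 'invscaling' schedule, where the learning rate at step t is eta0 / pow(t, power_t) per the scikit-learn docs. A tiny illustrative computation with the documented defaults:

# Learning rate decay under 'invscaling' with eta0=0.01, power_t=0.25.
eta0, power_t = 0.01, 0.25
for t in (1, 10, 100, 1000):
    print(t, eta0 / pow(t, power_t))
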
@@ -1492,16 +1489,16 @@ class SGDRegressor(BaseSGDRegressor):
 
     Attributes
     ----------
-    coef_ : array, shape (n_features,)
+    coef_ : ndarray of shape (n_features,)
         Weights assigned to the features.
 
-    intercept_ : array, shape (1,)
+    intercept_ : ndarray of shape (1,)
         The intercept term.
 
-    average_coef_ : array, shape (n_features,)
+    average_coef_ : ndarray of shape (n_features,)
         Averaged weights assigned to the features.
 
-    average_intercept_ : array, shape (1,)
+    average_intercept_ : ndarray of shape (1,)
         The averaged intercept term.
 
     n_iter_ : int
