Merge branch 'master' into fix-stop-words-validation · scikit-learn/scikit-learn@4ff060e · GitHub

Commit 4ff060e

Merge branch 'master' into fix-stop-words-validation
2 parents 9874994 + c676981 commit 4ff060e

90 files changed, +1980 -687 lines changed

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import scipy.sparse as sparse
+from sklearn.preprocessing import PolynomialFeatures
+from time import time
+
+degree = 2
+trials = 3
+num_rows = 1000
+dimensionalities = np.array([1, 2, 8, 16, 32, 64])
+densities = np.array([0.01, 0.1, 1.0])
+csr_times = {d: np.zeros(len(dimensionalities)) for d in densities}
+dense_times = {d: np.zeros(len(dimensionalities)) for d in densities}
+transform = PolynomialFeatures(degree=degree, include_bias=False,
+                               interaction_only=False)
+
+for trial in range(trials):
+    for density in densities:
+        for dim_index, dim in enumerate(dimensionalities):
+            print(trial, density, dim)
+            X_csr = sparse.random(num_rows, dim, density).tocsr()
+            X_dense = X_csr.toarray()
+            # CSR
+            t0 = time()
+            transform.fit_transform(X_csr)
+            csr_times[density][dim_index] += time() - t0
+            # Dense
+            t0 = time()
+            transform.fit_transform(X_dense)
+            dense_times[density][dim_index] += time() - t0
+
+csr_linestyle = (0, (3, 1, 1, 1, 1, 1))  # densely dashdotdotted
+dense_linestyle = (0, ())  # solid
+
+fig, axes = plt.subplots(nrows=len(densities), ncols=1, figsize=(8, 10))
+for density, ax in zip(densities, axes):
+
+    ax.plot(dimensionalities, csr_times[density] / trials,
+            label='csr', linestyle=csr_linestyle)
+    ax.plot(dimensionalities, dense_times[density] / trials,
+            label='dense', linestyle=dense_linestyle)
+    ax.set_title("density %0.2f, degree=%d, n_samples=%d" %
+                 (density, degree, num_rows))
+    ax.legend()
+    ax.set_xlabel('Dimensionality')
+    ax.set_ylabel('Time (seconds)')
+
+plt.tight_layout()
+plt.show()

conftest.py

Lines changed: 3 additions & 2 deletions
@@ -16,8 +16,9 @@
 PYTEST_MIN_VERSION = '3.3.0'
 
 if LooseVersion(pytest.__version__) < PYTEST_MIN_VERSION:
-    raise('Your version of pytest is too old, you should have at least '
-          'pytest >= {} installed.'.format(PYTEST_MIN_VERSION))
+    raise ImportError('Your version of pytest is too old, you should have '
+                      'at least pytest >= {} installed.'
+                      .format(PYTEST_MIN_VERSION))
 
 
 def pytest_addoption(parser):
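A side note on the fix above: in Python 3, ``raise`` only accepts ``BaseException`` subclasses or instances, so raising a bare string fails with a ``TypeError`` instead of reporting the version problem. A minimal sketch (not part of the commit) of the failure mode:

# Sketch only: why the old pattern is broken.
try:
    raise('pytest is too old')   # raising a plain str is not allowed in Python 3
except TypeError as exc:
    print(exc)                   # "exceptions must derive from BaseException"

# The fix wraps the message in a proper exception type instead:
# raise ImportError('pytest >= {} is required.'.format(PYTEST_MIN_VERSION))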

doc/developers/contributing.rst

Lines changed: 13 additions & 4 deletions
@@ -1143,6 +1143,16 @@ data dependent. A tolerance stopping criterion ``tol`` is not directly
 data dependent (although the optimal value according to some scoring
 function probably is).
 
+When ``fit`` is called, any previous call to ``fit`` should be ignored. In
+general, calling ``estimator.fit(X1)`` and then ``estimator.fit(X2)`` should
+be the same as only calling ``estimator.fit(X2)``. However, this may not be
+true in practice when ``fit`` depends on some random process, see
+:term:`random_state`. Another exception to this rule is when the
+hyper-parameter ``warm_start`` is set to ``True`` for estimators that
+support it. ``warm_start=True`` means that the previous state of the
+trainable parameters of the estimator are reused instead of using the
+default initialization strategy.
+
 Estimated Attributes
 ^^^^^^^^^^^^^^^^^^^^
 

@@ -1151,9 +1161,8 @@ ending with trailing underscore, for example the coefficients of
 some regression estimator would be stored in a ``coef_`` attribute after
 ``fit`` has been called.
 
-The last-mentioned attributes are expected to be overridden when
-you call ``fit`` a second time without taking any previous value into
-account: **fit should be idempotent**.
+The estimated attributes are expected to be overridden when you call ``fit``
+a second time.
 
 Optional Arguments
 ^^^^^^^^^^^^^^^^^^

@@ -1209,7 +1218,7 @@ the correct interface more easily.
 and optionally the mixin classes in ``sklearn.base``.
 For example, below is a custom classifier, with more examples included
 in the scikit-learn-contrib
-`project template <https://github.com/scikit-learn-contrib/project-template/blob/master/skltemplate/template.py>`__.
+`project template <https://github.com/scikit-learn-contrib/project-template/blob/master/skltemplate/_template.py>`__.
 
 >>> import numpy as np
 >>> from sklearn.base import BaseEstimator, ClassifierMixin
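The ``fit``/``warm_start`` convention documented above can be illustrated with a small sketch. This is not part of the commit; it only assumes an estimator that supports ``warm_start``, such as ``SGDClassifier``, and made-up data:

# Sketch: refitting vs. warm-starting an estimator.
import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
X1, y1 = rng.randn(50, 3), rng.randint(0, 2, 50)
X2, y2 = rng.randn(50, 3), rng.randint(0, 2, 50)

# Default behaviour: the second fit ignores the first one entirely, so this
# is equivalent to fitting a fresh estimator on X2 (up to the randomness
# controlled by random_state).
clf = SGDClassifier(random_state=0, max_iter=5)
clf.fit(X1, y1)
clf.fit(X2, y2)

# warm_start=True: the coefficients learned on (X1, y1) are reused as the
# starting point for the fit on (X2, y2) instead of a fresh initialization.
clf_warm = SGDClassifier(random_state=0, max_iter=5, warm_start=True)
clf_warm.fit(X1, y1)
clf_warm.fit(X2, y2)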

doc/modules/classes.rst

Lines changed: 12 additions & 2 deletions
@@ -846,6 +846,7 @@ details.
    metrics.jaccard_similarity_score
    metrics.log_loss
    metrics.matthews_corrcoef
+   metrics.multilabel_confusion_matrix
    metrics.precision_recall_curve
    metrics.precision_recall_fscore_support
    metrics.precision_score

@@ -904,7 +905,7 @@ details.
 
    metrics.adjusted_mutual_info_score
    metrics.adjusted_rand_score
-   metrics.calinski_harabaz_score
+   metrics.calinski_harabasz_score
    metrics.davies_bouldin_score
    metrics.completeness_score
    metrics.cluster.contingency_matrix

@@ -1496,6 +1497,15 @@ Utilities from joblib:
 Recently deprecated
 ===================
 
+To be removed in 0.23
+---------------------
+
+.. autosummary::
+   :toctree: generated/
+   :template: deprecated_function.rst
+
+   metrics.calinski_harabaz_score
+
 
 To be removed in 0.22
 ---------------------

@@ -1513,4 +1523,4 @@ To be removed in 0.22
    :template: deprecated_function.rst
 
    covariance.graph_lasso
-   datasets.fetch_mldata
+   datasets.fetch_mldata

doc/modules/clustering.rst

Lines changed: 11 additions & 13 deletions
@@ -387,7 +387,7 @@ is updated according to the following equation:
 
 .. math::
 
-    x_i^{t+1} = x_i^t + m(x_i^t)
+    x_i^{t+1} = m(x_i^t)
 
 Where :math:`N(x_i)` is the neighborhood of samples within a given distance
 around :math:`x_i` and :math:`m` is the *mean shift* vector that is computed for each

@@ -1551,7 +1551,7 @@ Advantages
 - **Upper-bounded at 1**: Values close to zero indicate two label
   assignments that are largely independent, while values close to one
   indicate significant agreement. Further, values of exactly 0 indicate
-  **purely** independent label assignments and a AMI of exactly 1 indicates
+  **purely** independent label assignments and a FMI of exactly 1 indicates
   that the two label assignments are equal (with or without permutation).
 
 - **No assumption is made on the cluster structure**: can be used

@@ -1652,17 +1652,16 @@ Drawbacks
 * :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py` : In this example
   the silhouette analysis is used to choose an optimal value for n_clusters.
 
-.. _calinski_harabaz_index:
+.. _calinski_harabasz_index:
 
-Calinski-Harabaz Index
+Calinski-Harabasz Index
 ----------------------
-
-If the ground truth labels are not known, the Calinski-Harabaz index
-(:func:`sklearn.metrics.calinski_harabaz_score`) - also known as the Variance
+If the ground truth labels are not known, the Calinski-Harabasz index
+(:func:`sklearn.metrics.calinski_harabasz_score`) - also known as the Variance
 Ratio Criterion - can be used to evaluate the model, where a higher
-Calinski-Harabaz score relates to a model with better defined clusters.
+Calinski-Harabasz score relates to a model with better defined clusters.
 
-For :math:`k` clusters, the Calinski-Harabaz score :math:`s` is given as the
+For :math:`k` clusters, the Calinski-Harabasz score :math:`s` is given as the
 ratio of the between-clusters dispersion mean and the within-cluster
 dispersion:
 

@@ -1689,17 +1688,16 @@ points in cluster :math:`q`.
 >>> X = dataset.data
 >>> y = dataset.target
 
-In normal usage, the Calinski-Harabaz index is applied to the results of a
+In normal usage, the Calinski-Harabasz index is applied to the results of a
 cluster analysis.
 
 >>> import numpy as np
 >>> from sklearn.cluster import KMeans
 >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
 >>> labels = kmeans_model.labels_
->>> metrics.calinski_harabaz_score(X, labels)  # doctest: +ELLIPSIS
+>>> metrics.calinski_harabasz_score(X, labels)  # doctest: +ELLIPSIS
 561.62...
 
-
 Advantages
 ~~~~~~~~~~
 

@@ -1712,7 +1710,7 @@ Advantages
 Drawbacks
 ~~~~~~~~~
 
-- The Calinski-Harabaz index is generally higher for convex clusters than other
+- The Calinski-Harabasz index is generally higher for convex clusters than other
   concepts of clusters, such as density based clusters like those obtained
   through DBSCAN.
 
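For the corrected mean shift equation above, each point is replaced by the (kernel-weighted) mean of its neighborhood rather than shifted by it. A toy sketch of one update step with a flat kernel (not part of the commit; the data and bandwidth are made up):

import numpy as np

X = np.array([[1.0, 1.0], [1.2, 0.9], [0.9, 1.1], [5.0, 5.0]])
bandwidth = 1.0   # hypothetical neighborhood radius
x = X[0]

# N(x): samples within the bandwidth of x (flat kernel).
neighborhood = X[np.linalg.norm(X - x, axis=1) < bandwidth]

# m(x): mean of the neighborhood; the update sets x^{t+1} = m(x^t).
x_next = neighborhood.mean(axis=0)
print(x_next)     # close to the centroid of the first three points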
doc/modules/compose.rst

Lines changed: 7 additions & 5 deletions
@@ -107,10 +107,10 @@ This is particularly important for doing grid searches::
 >>> grid_search = GridSearchCV(pipe, param_grid=param_grid)
 
 Individual steps may also be replaced as parameters, and non-final steps may be
-ignored by setting them to ``None``::
+ignored by setting them to ``'passthrough'``::
 
 >>> from sklearn.linear_model import LogisticRegression
->>> param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
+>>> param_grid = dict(reduce_dim=['passthrough', PCA(5), PCA(10)],
 ...                   clf=[SVC(), LogisticRegression()],
 ...                   clf__C=[0.1, 10, 100])
 >>> grid_search = GridSearchCV(pipe, param_grid=param_grid)

@@ -486,17 +486,19 @@ the transformation::
        [0.5, 0.5],
        [1. , 0. ]])
 
-The :func:`~sklearn.compose.make_columntransformer` function is available
+The :func:`~sklearn.compose.make_column_transformer` function is available
 to more easily create a :class:`~sklearn.compose.ColumnTransformer` object.
 Specifically, the names will be given automatically. The equivalent for the
 above example would be::
 
   >>> from sklearn.compose import make_column_transformer
   >>> column_trans = make_column_transformer(
   ...     ('city', CountVectorizer(analyzer=lambda x: [x])),
-  ...     ('title', CountVectorizer()))
+  ...     ('title', CountVectorizer()),
+  ...     remainder=MinMaxScaler())
   >>> column_trans  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
-  ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
+  ColumnTransformer(n_jobs=None, remainder=MinMaxScaler(copy=True, ...),
+                    sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('countvectorizer-1', ...)
 
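The ``'passthrough'`` value documented above also works outside a grid search, for example via ``set_params``. A small sketch (not part of the commit; the pipeline and data are made up):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC

X = np.random.RandomState(0).randn(20, 5)
y = np.array([0, 1] * 10)

pipe = Pipeline([('reduce_dim', PCA(n_components=2)), ('clf', SVC())])

# Setting a non-final step to 'passthrough' skips it: the data is handed to
# the next step unchanged, while the step keeps its name in the pipeline.
pipe.set_params(reduce_dim='passthrough')
pipe.fit(X, y)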
doc/modules/ensemble.rst

Lines changed: 7 additions & 8 deletions
@@ -594,21 +594,20 @@ learners. Decision trees have a number of abilities that make them
 valuable for boosting, namely the ability to handle data of mixed type
 and the ability to model complex functions.
 
-Similar to other boosting algorithms GBRT builds the additive model in
-a forward stagewise fashion:
+Similar to other boosting algorithms, GBRT builds the additive model in
+a greedy fashion:
 
 .. math::
 
-  F_m(x) = F_{m-1}(x) + \gamma_m h_m(x)
+  F_m(x) = F_{m-1}(x) + \gamma_m h_m(x),
 
-At each stage the decision tree :math:`h_m(x)` is chosen to
-minimize the loss function :math:`L` given the current model
-:math:`F_{m-1}` and its fit :math:`F_{m-1}(x_i)`
+where the newly added tree :math:`h_m` tries to minimize the loss :math:`L`,
+given the previous ensemble :math:`F_{m-1}`:
 
 .. math::
 
-  F_m(x) = F_{m-1}(x) + \arg\min_{h} \sum_{i=1}^{n} L(y_i,
-  F_{m-1}(x_i) + h(x))
+  h_m = \arg\min_{h} \sum_{i=1}^{n} L(y_i,
+  F_{m-1}(x_i) + h(x_i)).
 
 The initial model :math:`F_{0}` is problem specific, for least-squares
 regression one usually chooses the mean of the target values.
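To make the corrected formula concrete: with a least-squares loss, the tree added at stage m is fit to the residuals of the previous ensemble. A short, purely illustrative sketch (not part of the commit; it uses made-up data and a fixed learning rate in place of the line-search step gamma_m):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(-3, 3, size=(100, 1))
y = np.sin(X).ravel() + 0.1 * rng.randn(100)

# F_0: for least-squares regression, the mean of the targets.
F = np.full_like(y, y.mean())

# A few boosting stages: each new tree h_m minimizes the squared loss of
# F_{m-1}(x_i) + h(x_i), i.e. it is fit to the current residuals y - F.
learning_rate = 0.1
for m in range(10):
    h = DecisionTreeRegressor(max_depth=2).fit(X, y - F)
    F += learning_rate * h.predict(X)

print("training MSE:", np.mean((y - F) ** 2))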

doc/modules/metrics.rst

Lines changed: 28 additions & 0 deletions
@@ -33,6 +33,34 @@ the kernel:
     2. ``S = 1. / (D / np.max(D))``
 
 
+.. currentmodule:: sklearn.metrics
+
+The distances between the row vectors of ``X`` and the row vectors of ``Y``
+can be evaluated using :func:`pairwise_distances`. If ``Y`` is omitted the
+pairwise distances of the row vectors of ``X`` are calculated. Similarly,
+:func:`pairwise.pairwise_kernels` can be used to calculate the kernel between `X`
+and `Y` using different kernel functions. See the API reference for more
+details.
+
+    >>> import numpy as np
+    >>> from sklearn.metrics import pairwise_distances
+    >>> from sklearn.metrics.pairwise import pairwise_kernels
+    >>> X = np.array([[2, 3], [3, 5], [5, 8]])
+    >>> Y = np.array([[1, 0], [2, 1]])
+    >>> pairwise_distances(X, Y, metric='manhattan')
+    array([[ 4.,  2.],
+           [ 7.,  5.],
+           [12., 10.]])
+    >>> pairwise_distances(X, metric='manhattan')
+    array([[0., 3., 8.],
+           [3., 0., 5.],
+           [8., 5., 0.]])
+    >>> pairwise_kernels(X, Y, metric='linear')
+    array([[ 2.,  7.],
+           [ 3., 11.],
+           [ 5., 18.]])
+
+
 .. currentmodule:: sklearn.metrics.pairwise
 
 .. _cosine_similarity:
