Commit 85e4a5f

Merge branch 'master' of github.com:scikit-learn/scikit-learn into select_categorical
2 parents 017afab + 63df99d commit 85e4a5f

75 files changed (+19813 lines, -838 lines)

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 *.so
 *~
 .#*
+*.lprof
 *.swp
 *.swo
 .DS_Store
@@ -40,6 +41,7 @@ nips2010_pdf/
 *.tgz

 examples/cluster/joblib
+reuters/
 benchmarks/bench_covertype_data/

 *.prefs

benchmarks/bench_covertype.py

Lines changed: 1 addition & 1 deletion
@@ -109,7 +109,7 @@ def load_data(dtype=np.float32, order='F'):
     print("Loading dataset...")
     data = fetch_covtype(download_if_missing=True, shuffle=True,
                          random_state=opts.random_seed)
-    X, y = data.data, data.target
+    X, y = data['data'], data['target']
     if order.lower() == 'f':
         X = np.asfortranarray(X)

doc/datasets/covtype.rst

Lines changed: 2 additions & 1 deletion
@@ -14,6 +14,7 @@ Some of the features are boolean indicators,
 while others are discrete or continuous measurements.

 ``sklearn.datasets.fetch_covtype`` will load the covertype dataset;
-it returns a ``Bunch`` object with the feature matrix in the ``data`` member
+it returns a dictionary-like object
+with the feature matrix in the ``data`` member
 and the target values in ``target``.
 The dataset will be downloaded from the web if necessary.
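As a quick aside (not part of the commit), a minimal sketch of the dictionary-like access the updated docs describe; note that the first call downloads the full covertype dataset:

    from sklearn.datasets import fetch_covtype

    # fetch_covtype returns a dictionary-like Bunch; the keys match the
    # members named above
    data = fetch_covtype(download_if_missing=True, shuffle=True, random_state=0)
    X, y = data['data'], data['target']
    print(X.shape, y.shape)  # (581012, 54) and (581012,)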

doc/datasets/index.rst

Lines changed: 5 additions & 5 deletions
@@ -32,11 +32,11 @@ numpy array X and an array of length n_samples containing the targets y.

 The toy datasets as well as the 'real world' datasets and the datasets
 fetched from mldata.org have more sophisticated structure.
-These functions return a ``bunch`` (which is a dictionary that is
-accessible with the 'dict.key' syntax).
-All datasets have at least two keys, ``data``, containg an array of shape
-``n_samples x n_features`` (except for 20newsgroups) and ``target``, a numpy
-array of length ``n_features``, containing the targets.
+These functions return a dictionary-like object holding at least two items:
+an array of shape ``n_samples`` * ``n_features`` with key ``data``
+(except for 20newsgroups)
+and a NumPy array of length ``n_features``, containing the target values,
+with key ``target``.

 The datasets also contain a description in ``DESCR`` and some contain
 ``feature_names`` and ``target_names``.
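The same access pattern holds for the toy loaders; a minimal editorial sketch with the iris loader (not part of the commit):

    from sklearn.datasets import load_iris

    iris = load_iris()
    print(iris['data'].shape)     # (150, 4)
    print(iris['target'].shape)   # (150,)
    print(iris['feature_names'])  # names of the four measurements
    print(iris['DESCR'][:60])     # start of the free-text description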

doc/modules/classes.rst

Lines changed: 1 addition & 0 deletions
@@ -217,6 +217,7 @@ Samples generator
    decomposition.KernelPCA
    decomposition.FactorAnalysis
    decomposition.FastICA
+   decomposition.TruncatedSVD
    decomposition.NMF
    decomposition.SparsePCA
    decomposition.MiniBatchSparsePCA

doc/modules/clustering.rst

Lines changed: 28 additions & 27 deletions
@@ -55,20 +55,20 @@ Overview of clustering methods
      - number of clusters
      - Very large `n_samples`, medium `n_clusters` with
        :ref:`MiniBatch code <mini_batch_kmeans>`
-     - General-purpose, even cluster size, flat geometry, not too many clusters
-     - Distances between points
+     - General-purpose, even cluster size, flat geometry, not too many clusters
+     - Distances between points

    * - :ref:`Affinity propagation <affinity_propagation>`
-     - damping, sample preference
+     - damping, sample preference
      - Not scalable with n_samples
      - Many clusters, uneven cluster size, non-flat geometry
      - Graph distance (e.g. nearest-neighbor graph)

    * - :ref:`Mean-shift <mean_shift>`
-     - bandwidth
+     - bandwidth
      - Not scalable with n_samples
      - Many clusters, uneven cluster size, non-flat geometry
-     - Distances between points
+     - Distances between points

    * - :ref:`Spectral clustering <spectral_clustering>`
      - number of clusters
@@ -80,13 +80,13 @@ Overview of clustering methods
      - number of clusters
      - Large `n_samples` and `n_clusters`
      - Many clusters, possibly connectivity constraints
-     - Distances between points
+     - Distances between points

    * - :ref:`DBSCAN <dbscan>`
      - neighborhood size
      - Very large `n_samples`, medium `n_clusters`
      - Non-flat geometry, uneven cluster sizes
-     - Distances between nearest points
+     - Distances between nearest points

    * - :ref:`Gaussian mixtures <mixture>`
      - many
@@ -116,7 +116,7 @@ be specified. It scales well to large number of samples and has been used
 across a large range of application areas in many different fields. It is
 also equivalent to the expectation-maximization algorithm when setting the
 covariance matrix to be diagonal, equal and small. The K-means algorithm
-aims to choose centroids :math:`C` that minimise the within cluster sum of
+aims to choose centroids :math:`C` that minimise the within cluster sum of
 squares objective function with a dataset :math:`X` with :math:`n` samples:

 .. math:: J(X, C) = \sum_{i=0}^{n}\min_{\mu_j \in C}(||x_j - \mu_i||^2)
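As an editorial aside on the objective shown in this hunk (not part of the commit), a minimal NumPy sketch that evaluates J(X, C) for a fitted model and compares it with the estimator's inertia_ attribute:

    import numpy as np
    from sklearn.cluster import KMeans

    rng = np.random.RandomState(0)
    X = rng.rand(100, 2)

    km = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X)

    # J(X, C): squared distance from each sample to its nearest centroid, summed
    sq_dists = ((X[:, None, :] - km.cluster_centers_[None, :, :]) ** 2).sum(axis=2)
    J = sq_dists.min(axis=1).sum()

    print(J, km.inertia_)  # the two numbers should agree up to floating point error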
@@ -156,8 +156,8 @@ centroids to be (generally) distant from each other, leading to provably better
 results than random initialisation.

 A parameter can be given to allow K-means to be run in parallel, called
-`n_jobs`. Giving this parameter a positive value uses that many processors
-(default=1). A value of -1 uses all processors, with -2 using one less, and so
+`n_jobs`. Giving this parameter a positive value uses that many processors
+(default=1). A value of -1 uses all processors, with -2 using one less, and so
 on. Parallelization generally speeds up computation at the cost of memory (in
 this case, multiple copies of centroids need to be stored, one for each job).

@@ -500,16 +500,17 @@ separated by areas of low density. Due to this rather generic view, clusters
 found by DBSCAN can be any shape, as opposed to k-means which assumes that
 clusters are convex shaped. The central component to the DBSCAN is the concept
 of *core samples*, which are samples that are in areas of high density. A
-cluster is therefore a set of core samples, each highly similar to each other
-and a set of non-core samples that are similar to a core sample (but are not
+cluster is therefore a set of core samples, each close to each other
+(measured by some distance measure)
+and a set of non-core samples that are close to a core sample (but are not
 themselves core samples). There are two parameters to the algorithm,
-`min_points` and `eps`, which define formally what we mean when we say *dense*.
-A higher `min_points` or lower `eps` indicate higher density necessary to form
+`min_samples` and `eps`, which define formally what we mean when we say *dense*.
+A higher `min_samples` or lower `eps` indicate higher density necessary to form
 a cluster.

 More formally, we define a core sample as being a sample in the dataset such
-that there exists `min_samples` other samples with a similarity higher than
-`eps` to it, which are defined as *neighbors* of the core sample. This tells
+that there exist `min_samples` other samples within a distance of
+`eps`, which are defined as *neighbors* of the core sample. This tells
 us that the core sample is in a dense area of the vector space. A cluster
 is a set of core samples, that can be built by recursively by taking a core
 sample, finding all of its neighbors that are core samples, finding all of
@@ -520,24 +521,24 @@ are on the fringes of a cluster.

 Any core sample is part of a cluster, by definition. Further, any cluster has
 at least `min_samples` points in it, following the definition of a core
-sample. For any sample that is not a core sample, and does not have a
-similarity higher than `eps` to a core sample, it is considered an outlier by
+sample. For any sample that is not a core sample, and does have a
+distance higher than `eps` to any core sample, it is considered an outlier by
 the algorithm.

 The algorithm is non-deterministic, however the core samples themselves will
 always belong to the same clusters (although the labels themselves may be
 different). The non-determinism comes from deciding on which cluster a
-non-core sample belongs to. A non-core sample can be have a similarity higher
+non-core sample belongs to. A non-core sample can have a distance lower
 than `eps` to two core samples in different classes. Following from the
-triangular inequality, those two core samples would be less similar than
+triangular inequality, those two core samples would be more distant than
 `eps` from each other -- else they would be in the same class. The non-core
 sample is simply assigned to which ever cluster is generated first, where
 the order is determined randomly within the code. Other than the ordering of,
 the dataset, the algorithm is deterministic, making the results relatively
 stable between iterations on the same data.

 In the figure below, the color indicates cluster membership, with large circles
-indicating core samples found by the algorithm. Smaller circles are non-core
+indicating core samples found by the algorithm. Smaller circles are non-core
 samples that are still part of a cluster. Moreover, the outliers are indicated
 by black points below.
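To make `eps` and `min_samples` concrete, a minimal sketch (not part of the commit; the parameter values are illustrative only):

    from sklearn.cluster import DBSCAN
    from sklearn.datasets import make_blobs

    # two well-separated, fairly dense blobs
    X, _ = make_blobs(n_samples=100, centers=2, cluster_std=0.4, random_state=0)

    db = DBSCAN(eps=0.5, min_samples=5).fit(X)

    print(set(db.labels_))               # cluster ids; -1, if present, marks outliers
    print(len(db.core_sample_indices_))  # how many samples qualified as core samples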

@@ -819,7 +820,7 @@ Drawbacks

 * :ref:`example_cluster_plot_adjusted_for_chance_measures.py`: Analysis of
   the impact of the dataset size on the value of clustering measures
-  for random assignments. This example also includes the Adjusted Rand
+  for random assignments. This example also includes the Adjusted Rand
   Index.


@@ -864,7 +865,7 @@ following equation, from Vinh, Epps, and Bailey, (2009). In this equation,
        \frac{a_i!b_j!(N-a_i)!(N-b_j)!}{N!n_{ij}!(a_i-n_{ij})!(b_j-n_{ij})!
        (N-a_i-b_j+n_{ij})!}

-Using the expected value, the adjusted mutual information can then be
+Using the expected value, the adjusted mutual information can then be
 calculated using a similar form to that of the adjusted Rand index:

 .. math:: \text{AMI} = \frac{\text{MI} - E[\text{MI}]}{\max(H(U), H(V)) - E[\text{MI}]}
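A minimal sketch (not part of the commit) contrasting raw mutual information with the chance-adjusted AMI discussed in this hunk:

    from sklearn.metrics import adjusted_mutual_info_score, mutual_info_score

    labels_true = [0, 0, 0, 1, 1, 1]
    labels_pred = [1, 1, 0, 0, 2, 2]

    # raw MI is not corrected for chance; AMI subtracts the expected MI
    print(mutual_info_score(labels_true, labels_pred))
    print(adjusted_mutual_info_score(labels_true, labels_pred))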
@@ -875,7 +876,7 @@ calculated using a similar form to that of the adjusted Rand index:
    knowledge reuse framework for combining multiple partitions". Journal of
    Machine Learning Research 3: 583–617. doi:10.1162/153244303321897735

- * Vinh, Epps, and Bailey, (2009). "Information theoretic measures
+ * Vinh, Epps, and Bailey, (2009). "Information theoretic measures
    for clusterings comparison". Proceedings of the 26th Annual International
    Conference on Machine Learning - ICML '09.
    doi:10.1145/1553374.1553511. ISBN 9781605585161.
@@ -1045,7 +1046,7 @@ mean of homogeneity and completeness**:

   .. [B2011] `Identication and Characterization of Events in Social Media
      <http://www.cs.columbia.edu/~hila/hila-thesis-distributed.pdf>`_, Hila
-     Becker, PhD Thesis.
+     Becker, PhD Thesis.

 .. _silhouette_coefficient:

@@ -1073,7 +1074,7 @@ The Silhoeutte Coefficient *s* for a single sample is then given as:

 .. math:: s = \frac{b - a}{max(a, b)}

-The Silhouette Coefficient for a set of samples is given as the mean of the
+The Silhouette Coefficient for a set of samples is given as the mean of the
 Silhouette Coefficient for each sample.


@@ -1091,7 +1092,7 @@ cluster analysis.
   >>> from sklearn.cluster import KMeans
   >>> kmeans_model = KMeans(n_clusters=3, random_state=1).fit(X)
   >>> labels = kmeans_model.labels_
-  >>> metrics.silhouette_score(X, labels, metric='euclidean')
+  >>> metrics.silhouette_score(X, labels, metric='euclidean')
   ... # doctest: +ELLIPSIS
   0.55...
doc/modules/cross_validation.rst

Lines changed: 5 additions & 2 deletions
@@ -277,8 +277,11 @@ not waste much data as only one sample is removed from the learning set::
 Leave-P-Out - LPO
 -----------------

-:class:`LeavePOut` is very similar to *Leave-One-Out*, as it creates all the
-possible training/test sets by removing :math:`P` samples from the complete set.
+:class:`LeavePOut` is very similar to :class:`LeaveOneOut` as it creates all
+the possible training/test sets by removing :math:`p` samples from the complete
+set. For :math:`n` samples, this produces :math:`{n \choose p}` train-test
+pairs. Unlike :class:`LeaveOneOut` and :class:`KFold`, the test sets will
+overlap for :math:`p > 1`.

 Example of Leave-2-Out::
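The Leave-2-Out listing itself lies outside this hunk; to make the "n choose p" count concrete, a small pure-Python sketch (not part of the commit, written with itertools so it does not depend on any particular scikit-learn version):

    from itertools import combinations

    n, p = 4, 2
    samples = range(n)

    # every p-sized subset of the indices is used once as the test set
    splits = [(sorted(set(samples) - set(test)), list(test))
              for test in combinations(samples, p)]

    for train, test in splits:
        print(train, test)

    print(len(splits))  # 6, i.e. "4 choose 2"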
doc/modules/decomposition.rst

Lines changed: 80 additions & 0 deletions
@@ -232,6 +232,86 @@ factorization, while larger values shrink many coefficients to zero.
      R. Jenatton, G. Obozinski, F. Bach, 2009


+.. _LSA:
+
+Truncated singular value decomposition and latent semantic analysis
+===================================================================
+
+:class:`TruncatedSVD` implements a variant of singular value decomposition
+(SVD) that only computes the :math:`k` largest singular values,
+where :math:`k` is a user-specified parameter.
+
+When truncated SVD is applied to term-document matrices
+(as returned by ``CountVectorizer`` or ``TfidfVectorizer``),
+this transformation is known as
+`latent semantic analysis <http://nlp.stanford.edu/IR-book/pdf/18lsi.pdf>`_
+(LSA), because it transforms such matrices
+to a "semantic" space of low dimensionality.
+In particular, LSA is known to combat the effects of synonymy and polysemy
+(both of which roughly mean there are multiple meanings per word),
+which cause term-document matrices to be overly sparse
+and exhibit poor similarity under measures such as cosine similarity.
+
+.. note::
+    LSA is also known as latent semantic indexing, LSI,
+    though strictly that refers to its use in persistent indexes
+    for information retrieval purposes.
+
+Mathematically, truncated SVD applied to training samples :math:`X`
+produces a low-rank approximation :math:`X`:
+
+.. math::
+    X \approx X_k = U_k \Sigma_k V_k^\top
+
+After this operation, :math:`U_k \Sigma_k^\top`
+is the transformed training set with :math:`k` features
+(called ``n_components`` in the API).
+
+To also transform a test set :math:`X`, we multiply it with :math:`V_k`:
+
+.. math::
+    X' = X V_k^\top
+
+.. note::
+    Most treatments of LSA in the natural language processing (NLP)
+    and information retrieval (IR) literature
+    swap the axis of the matrix :math:`X` so that it has shape
+    ``n_features`` × ``n_samples``.
+    We present LSA in a different way that matches the scikit-learn API better,
+    but the singular values found are the same.
+
+:class:`TruncatedSVD` is very similar to :class:`PCA`, but differs
+in that it works on sample matrices :math:`X` directly
+instead of their covariance matrices.
+When the columnwise (per-feature) means of :math:`X`
+are subtracted from the feature values,
+truncated SVD on the resulting matrix is equivalent to PCA.
+In practical terms, this means
+that the :class:`TruncatedSVD` transformer accepts ``scipy.sparse``
+matrices without the need to densify them,
+as densifying may fill up memory even for medium-sized document collections.
+
+While the :class:`TruncatedSVD` transformer
+works with any (sparse) feature matrix,
+using it on tf–idf matrices is recommended over raw frequency counts
+in an LSA/document processing setting.
+In particular, sublinear scaling and inverse document frequency
+should be turned on (``sublinear_tf=True, use_idf=True``)
+to bring the feature values closer to a Gaussian distribution,
+compensating for LSA's erroneous assumptions about textual data.
+
+.. topic:: Examples:
+
+   * :ref:`example_document_clustering.py`
+
+.. topic:: References:
+
+   * Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze (2008),
+     *Introduction to Information Retrieval*, Cambridge University Press,
+     chapter 18: `Matrix decompositions & latent semantic indexing
+     <http://nlp.stanford.edu/IR-book/pdf/18lsi.pdf>`_
+
+
 .. _DictionaryLearning:

 Dictionary Learning
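A minimal usage sketch of the class documented in the new section above (not part of the commit; the toy corpus is made up for illustration):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD

    docs = [
        "the cat sat on the mat",
        "the dog sat on the log",
        "cats and dogs are pets",
        "computers process text documents",
    ]

    # sparse tf-idf term-document matrix; TruncatedSVD consumes it without densifying
    tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True)
    X = tfidf.fit_transform(docs)

    svd = TruncatedSVD(n_components=2)
    X_lsa = svd.fit_transform(X)  # shape: (n_documents, 2)

    print(X_lsa.shape)
    print(svd.explained_variance_ratio_)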

doc/modules/feature_extraction.rst

Lines changed: 19 additions & 0 deletions
@@ -653,6 +653,25 @@ The :class:`HashingVectorizer` also comes with the following limitations:
   model. A :class:`TfidfTransformer` can be appended to it in a pipeline if
   required.

+Performing out-of-core scaling with HashingVectorizer
+------------------------------------------------------
+
+An interesting development of using a :class:`HashingVectorizer` is the ability
+to perform `out-of-core`_ scaling. This means that we can learn from data that
+does not fit into the computer's main memory.
+
+.. _out-of-core: http://en.wikipedia.org/wiki/Out-of-core_algorithm.
+
+A strategy to implement out-of-core scaling is to stream data to the estimator
+in mini-batches. Each mini-batch is vectorized using :class:`HashingVectorizer`
+so as to guarantee that the input space of the estimator has always the same
+dimensionality. The amount of memory used at any time is thus bounded by the
+size of a mini-batch. Although there is no limit to the amount of data that can
+be ingested using such an approach, from a practical point of view the learning
+time is often limited by the CPU time one wants to spend on the task.
+
+For a full-fledged example of out-of-core scaling in a text classification
+task see :ref:`example_applications_plot_out_of_core_classification.py`.

 Customizing the vectorizer classes
 ----------------------------------
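A minimal sketch of the mini-batch strategy described in the added section (not part of the commit; the mini-batch generator is a made-up stand-in, and any estimator providing ``partial_fit`` would do):

    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.linear_model import SGDClassifier

    vectorizer = HashingVectorizer(n_features=2 ** 18)  # fixed, data-independent input space
    clf = SGDClassifier()

    def iter_minibatches():
        # stand-in for a generator that streams (texts, labels) from disk or network
        yield ["spam spam spam", "meeting at noon"], [1, 0]
        yield ["cheap pills", "quarterly report attached"], [1, 0]

    classes = [0, 1]  # all classes must be declared up front for partial_fit
    for texts, labels in iter_minibatches():
        X_batch = vectorizer.transform(texts)  # same dimensionality for every batch
        clf.partial_fit(X_batch, labels, classes=classes)

    print(clf.predict(vectorizer.transform(["free pills now"])))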

doc/modules/metrics.rst

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ The chi squared kernel is given by

 .. math::

-        k(x, y) = exp(-\gamma * \sum_i (x[i] - y[i]) ** 2 / (x[i] + y[i]))
+        k(x, y) = \exp \left (-\gamma \sum_i \frac{(x[i] - y[i]) ^ 2}{x[i] + y[i]} \right )

 The data is assumed to be non-negative, and is often normalized to have an L1-norm of one.
 The normalization is rationalized with the connection to the chi squared distance,
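For reference, the kernel in the corrected formula is exposed as ``sklearn.metrics.pairwise.chi2_kernel``; a minimal sketch (not part of the commit):

    import numpy as np
    from sklearn.metrics.pairwise import chi2_kernel

    # non-negative, L1-normalized rows, as the documentation above assumes
    X = np.array([[0.2, 0.5, 0.3],
                  [0.1, 0.1, 0.8]])

    K = chi2_kernel(X, gamma=1.0)
    print(K)  # symmetric matrix with ones on the diagonal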

doc/sphinxext/gen_rst.py

Lines changed: 4 additions & 5 deletions
@@ -873,15 +873,14 @@ def embed_code_links(app, exception):
                     str_repl[name_html] = link_pattern % (link, name_html)
             # do the replacement in the html file
             if len(str_repl) > 0:
-                with open(full_fname, 'rt') as fid:
+                with open(full_fname, 'rb') as fid:
                     lines_in = fid.readlines()
-                fid.close()
-                with open(full_fname, 'wt') as fid:
+                with open(full_fname, 'wb') as fid:
                     for line in lines_in:
+                        line = line.decode('utf-8')
                         for name, link in str_repl.iteritems():
                             line = line.replace(name, link)
-                    fid.write(line)
-                    fid.close()
+                        fid.write(line.encode('utf-8'))
     except urllib2.HTTPError, e:
         print ("The following HTTP Error has occurred:\n")
         print e.code