FEA Add PolynomialCountSketch to Kernel Approximation module (#13003) · scikit-learn/scikit-learn@daebcac · GitHub

Commit daebcac

lopeLH, Christian Lorentzen, TomDLT, and rth authored
FEA Add PolynomialCountSketch to Kernel Approximation module (#13003)
* Add Tensor Sketch algorithm
* Add user guide entry
* Add example
* Add benchmark

Co-authored-by: Christian Lorentzen <lorentzen.ch@googlemail.com>
Co-authored-by: Tom Dupré la Tour <tom.dupre-la-tour@m4x.org>
Co-authored-by: Roman Yurchak <rth.yurchak@gmail.com>
1 parent 57bd85e commit daebcac

File tree

7 files changed: +613 −2 lines changed
Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
"""
========================================================================
Benchmark for explicit feature map approximation of polynomial kernels
========================================================================

An example illustrating the approximation of the feature map
of a homogeneous polynomial kernel.

.. currentmodule:: sklearn.kernel_approximation

It shows how to use :class:`PolynomialCountSketch` and :class:`Nystroem` to
approximate the feature map of a polynomial kernel for
classification with an SVM on the digits dataset. Results using a linear
SVM in the original space, a linear SVM using the approximate mappings
and a kernelized SVM are compared.

The first plot shows the classification accuracy of Nystroem [2] and
PolynomialCountSketch [1] as the output dimension (n_components) grows.
It also shows the accuracy of a linear SVM and a polynomial kernel SVM
on the same data.

The second plot explores the scalability of PolynomialCountSketch
and Nystroem. For a sufficiently large output dimension,
PolynomialCountSketch should be faster as it is O(n(d + k log k))
while Nystroem is O(n(dk + k^2)). In addition, Nystroem requires
a time-consuming training phase, while training is almost immediate
for PolynomialCountSketch, whose training phase boils down to
initializing some random variables (because it is data-independent).

[1] Pham, N., & Pagh, R. (2013, August). Fast and scalable polynomial
kernels via explicit feature maps. In Proceedings of the 19th ACM SIGKDD
international conference on Knowledge discovery and data mining (pp. 239-247)
(http://chbrown.github.io/kdd-2013-usb/kdd/p239.pdf)

[2] Charikar, M., Chen, K., & Farach-Colton, M. (2002, July). Finding frequent
items in data streams. In International Colloquium on Automata, Languages, and
Programming (pp. 693-703). Springer, Berlin, Heidelberg.
(http://www.vldb.org/pvldb/1/1454225.pdf)

"""
# Author: Daniel Lopez-Sanchez <lope@usal.es>
# License: BSD 3 clause

# Load data manipulation functions
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Some common libraries
import matplotlib.pyplot as plt
import numpy as np

# Will use this for timing results
from time import time

# Import SVM classifiers and feature map approximation algorithms
from sklearn.svm import LinearSVC, SVC
from sklearn.kernel_approximation import Nystroem, PolynomialCountSketch
from sklearn.pipeline import Pipeline

# Split data into train and test sets
X, y = load_digits()["data"], load_digits()["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

# Set the range of n_components for our experiments
out_dims = range(20, 400, 20)

# Evaluate linear SVM
lsvm = LinearSVC().fit(X_train, y_train)
lsvm_score = 100 * lsvm.score(X_test, y_test)

# Evaluate kernelized SVM
ksvm = SVC(kernel="poly", degree=2, gamma=1.).fit(X_train, y_train)
ksvm_score = 100 * ksvm.score(X_test, y_test)

# Evaluate PolynomialCountSketch + LinearSVC
ps_svm_scores = []
n_runs = 5

# To compensate for the stochasticity of the method, we average over n_runs runs
for k in out_dims:
    score_avg = 0
    for _ in range(n_runs):
        ps_svm = Pipeline([("PS", PolynomialCountSketch(degree=2,
                                                        n_components=k)),
                           ("SVM", LinearSVC())])
        score_avg += ps_svm.fit(X_train, y_train).score(X_test, y_test)
    ps_svm_scores.append(100 * score_avg / n_runs)

# Evaluate Nystroem + LinearSVC
ny_svm_scores = []
n_runs = 5

for k in out_dims:
    score_avg = 0
    for _ in range(n_runs):
        ny_svm = Pipeline([("NY", Nystroem(kernel="poly", gamma=1., degree=2,
                                           coef0=0, n_components=k)),
                           ("SVM", LinearSVC())])
        score_avg += ny_svm.fit(X_train, y_train).score(X_test, y_test)
    ny_svm_scores.append(100 * score_avg / n_runs)

# Show results
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("Accuracy results")
ax.plot(out_dims, ps_svm_scores, label="PolynomialCountSketch + linear SVM",
        c="orange")
ax.plot(out_dims, ny_svm_scores, label="Nystroem + linear SVM",
        c="blue")
ax.plot([out_dims[0], out_dims[-1]], [lsvm_score, lsvm_score],
        label="Linear SVM", c="black", dashes=[2, 2])
ax.plot([out_dims[0], out_dims[-1]], [ksvm_score, ksvm_score],
        label="Poly-kernel SVM", c="red", dashes=[2, 2])
ax.legend()
ax.set_xlabel("N_components for PolynomialCountSketch and Nystroem")
ax.set_ylabel("Accuracy (%)")
ax.set_xlim([out_dims[0], out_dims[-1]])
fig.tight_layout()

# Now let's evaluate the scalability of PolynomialCountSketch vs Nystroem.
# First we generate some fake data with a lot of samples

fakeData = np.random.randn(10000, 100)
fakeDataY = np.random.randint(0, high=10, size=(10000))

out_dims = range(500, 6000, 500)

# Evaluate scalability of PolynomialCountSketch as n_components grows
ps_svm_times = []
for k in out_dims:
    ps = PolynomialCountSketch(degree=2, n_components=k)

    start = time()
    ps.fit_transform(fakeData, None)
    ps_svm_times.append(time() - start)

# Evaluate scalability of Nystroem as n_components grows
# This can take a while due to the inefficient training phase
ny_svm_times = []
for k in out_dims:
    ny = Nystroem(kernel="poly", gamma=1., degree=2, coef0=0, n_components=k)

    start = time()
    ny.fit_transform(fakeData, None)
    ny_svm_times.append(time() - start)

# Show results
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title("Scalability results")
ax.plot(out_dims, ps_svm_times, label="PolynomialCountSketch", c="orange")
ax.plot(out_dims, ny_svm_times, label="Nystroem", c="blue")
ax.legend()
ax.set_xlabel("N_components for PolynomialCountSketch and Nystroem")
ax.set_ylabel("fit_transform time \n(s/10,000 samples)")
ax.set_xlim([out_dims[0], out_dims[-1]])
fig.tight_layout()
plt.show()

doc/modules/classes.rst

Lines changed: 1 addition & 0 deletions
@@ -706,6 +706,7 @@ Plotting
   kernel_approximation.AdditiveChi2Sampler
   kernel_approximation.Nystroem
   kernel_approximation.PolynomialCountSketch
   kernel_approximation.RBFSampler
   kernel_approximation.SkewedChi2Sampler

doc/modules/kernel_approximation.rst

Lines changed: 53 additions & 0 deletions
@@ -149,6 +149,51 @@ above for the :class:`RBFSampler`. The only difference is in the free
parameter, that is called :math:`c`.
For a motivation for this mapping and the mathematical details see [LS2010]_.

.. _polynomial_kernel_approx:

Polynomial Kernel Approximation via Tensor Sketch
-------------------------------------------------

The :ref:`polynomial kernel <polynomial_kernel>` is a popular type of kernel
function given by:

.. math::

    k(x, y) = (\gamma x^\top y + c_0)^d

where:

* ``x``, ``y`` are the input vectors
* ``d`` is the kernel degree
* :math:`\gamma` is a scale parameter and :math:`c_0` a constant term
  (``gamma`` and ``coef0`` in the scikit-learn estimators)

Intuitively, the feature space of the polynomial kernel of degree `d`
consists of all possible degree-`d` products among input features, which
enables learning algorithms using this kernel to account for interactions
between features.

The TensorSketch [PP2013]_ method, as implemented in
:class:`PolynomialCountSketch`, is a scalable, input-data-independent method
for polynomial kernel approximation. It is based on the concept of Count
sketch [WIKICS]_ [CCF2002]_, a dimensionality reduction technique similar to
feature hashing, which instead uses several independent hash functions.
TensorSketch obtains a Count Sketch of the outer product of two vectors (or
of a vector with itself), which can be used as an approximation of the
polynomial kernel feature space. In particular, instead of explicitly
computing the outer product, TensorSketch computes the Count Sketch of the
vectors and then uses polynomial multiplication via the Fast Fourier
Transform to compute the Count Sketch of their outer product.
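For intuition, the degree-2 case can be sketched in a few lines of NumPy. This
is a minimal illustration only, not part of this commit; the helper names
``count_sketch`` and ``tensor_sketch`` and the sizes ``d``, ``k`` are
arbitrary choices::

    import numpy as np

    rng = np.random.default_rng(0)
    d, k = 100, 4096  # input dimension, sketch dimension (larger k -> better)

    # Two independent count-sketch (hash, sign) pairs
    h = rng.integers(0, k, size=(2, d))
    s = rng.choice([-1.0, 1.0], size=(2, d))

    def count_sketch(x, h_i, s_i):
        # Accumulate x's coordinates into k buckets chosen by h_i,
        # flipping signs according to s_i
        out = np.zeros(k)
        np.add.at(out, h_i, s_i * x)
        return out

    def tensor_sketch(x):
        # Count sketch of the outer product of x with itself: multiplying
        # the FFTs elementwise circularly convolves the two count sketches
        f1 = np.fft.fft(count_sketch(x, h[0], s[0]))
        f2 = np.fft.fft(count_sketch(x, h[1], s[1]))
        return np.real(np.fft.ifft(f1 * f2))

    x, y = rng.standard_normal(d), rng.standard_normal(d)
    print(np.dot(tensor_sketch(x), tensor_sketch(y)))  # approximates ...
    print(np.dot(x, y) ** 2)                           # ... the exact kernel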
Conveniently, the training phase of TensorSketch simply consists of
initializing some random variables. It is thus independent of the input data,
i.e. it only depends on the number of input features, but not the data values.
In addition, this method can transform samples in
:math:`\mathcal{O}(n_{\text{samples}}(n_{\text{features}} + n_{\text{components}} \log(n_{\text{components}})))`
time, where :math:`n_{\text{components}}` is the desired output dimension,
determined by ``n_components``.
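As a minimal usage sketch (illustrative, not prescribed by this diff), the
transformer follows the usual ``fit``/``transform`` API; the digits data,
``degree=2`` and ``n_components=300`` below are arbitrary choices::

    >>> from sklearn.datasets import load_digits
    >>> from sklearn.kernel_approximation import PolynomialCountSketch
    >>> from sklearn.svm import LinearSVC
    >>> X, y = load_digits(return_X_y=True)
    >>> ps = PolynomialCountSketch(degree=2, n_components=300)
    >>> X_features = ps.fit_transform(X)  # fit only draws random hash functions
    >>> clf = LinearSVC().fit(X_features, y)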
.. topic:: Examples:

    * :ref:`sphx_glr_auto_examples_plot_scalable_poly_kernels.py`

.. _tensor_sketch_kernel_approx:

Mathematical Details
--------------------
@@ -201,3 +246,11 @@ or store training examples.
.. [VVZ2010] `"Generalized RBF feature maps for Efficient Detection"
   <https://www.robots.ox.ac.uk/~vgg/publications/2010/Sreekanth10/sreekanth10.pdf>`_
   Vempati, S. and Vedaldi, A. and Zisserman, A. and Jawahar, CV - 2010
.. [PP2013] `"Fast and scalable polynomial kernels via explicit feature maps"
   <https://doi.org/10.1145/2487575.2487591>`_
   Pham, N., & Pagh, R. - 2013
.. [CCF2002] `"Finding frequent items in data streams"
   <http://www.cs.princeton.edu/courses/archive/spring04/cos598B/bib/CharikarCF.pdf>`_
   Charikar, M., Chen, K., & Farach-Colton, M. - 2002
.. [WIKICS] `"Wikipedia: Count sketch"
   <https://en.wikipedia.org/wiki/Count_sketch>`_

doc/whats_new/v0.24.rst

Lines changed: 10 additions & 0 deletions
@@ -237,6 +237,15 @@ Changelog
- |Enhancement| :class:`isotonic.IsotonicRegression` now accepts 2d arrays
  with 1 feature as input array. :pr:`17379` by :user:`Jiaxiang <fujiaxiang>`.

:mod:`sklearn.kernel_approximation`
...................................

- |Feature| Added class :class:`kernel_approximation.PolynomialCountSketch`
  which implements the Tensor Sketch algorithm for polynomial kernel feature
  map approximation.
  :pr:`13003` by :user:`Daniel López Sánchez <lopeLH>`.

:mod:`sklearn.linear_model`
...........................

@@ -251,6 +260,7 @@ Changelog
  efficient leave-one-out cross-validation scheme ``cv=None``. :pr:`6624` by
  :user:`Marijn van Vliet <wmvanvliet>`.

:mod:`sklearn.manifold`
.......................

0 commit comments
