FEA Online implementation of non-negative matrix factorization (#16948) · jjerphan/scikit-learn@52ca62d

Commit 52ca62d

FEA Online implementation of non-negative matrix factorization (scikit-learn#16948)

cmarmo, TomDLT, jeremiedbb and thomasjpfan authored and committed.

Co-authored-by: Tom Dupré la Tour <tom.dupre-la-tour@m4x.org>
Co-authored-by: jeremie du boisberranger <jeremiedbb@yahoo.fr>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com>

1 parent d955701 · commit 52ca62d

File tree: 9 files changed, +1163 −140 lines

doc/computing/scaling_strategies.rst (+1 line)

@@ -80,6 +80,7 @@ Here is a list of incremental estimators for different tasks:
       + :class:`sklearn.decomposition.MiniBatchDictionaryLearning`
       + :class:`sklearn.decomposition.IncrementalPCA`
       + :class:`sklearn.decomposition.LatentDirichletAllocation`
+      + :class:`sklearn.decomposition.MiniBatchNMF`
   - Preprocessing
       + :class:`sklearn.preprocessing.StandardScaler`
       + :class:`sklearn.preprocessing.MinMaxScaler`

doc/modules/classes.rst (+1 line)

@@ -319,6 +319,7 @@ Samples generator
     decomposition.MiniBatchDictionaryLearning
     decomposition.MiniBatchSparsePCA
     decomposition.NMF
+    decomposition.MiniBatchNMF
     decomposition.PCA
     decomposition.SparsePCA
     decomposition.SparseCoder

doc/modules/decomposition.rst (+26 lines)

@@ -921,6 +921,29 @@ stored components::
 * :ref:`sphx_glr_auto_examples_applications_plot_topics_extraction_with_nmf_lda.py`
 * :ref:`sphx_glr_auto_examples_decomposition_plot_beta_divergence.py`
 
+.. _MiniBatchNMF:
+
+Mini-batch Non-Negative Matrix Factorization
+--------------------------------------------
+
+:class:`MiniBatchNMF` [7]_ implements a faster, but less accurate, version of
+non-negative matrix factorization (i.e. :class:`~sklearn.decomposition.NMF`),
+better suited for large datasets.
+
+By default, :class:`MiniBatchNMF` divides the data into mini-batches and
+optimizes the NMF model in an online manner by cycling over the mini-batches
+for the specified number of iterations. The ``batch_size`` parameter controls
+the size of the batches.
+
+In order to speed up the mini-batch algorithm, it is also possible to scale
+past batches, giving them less importance than newer batches. This is done by
+introducing a so-called forgetting factor, controlled by the ``forget_factor``
+parameter.
+
+The estimator also implements ``partial_fit``, which updates ``H`` by iterating
+only once over a mini-batch. This can be used for online learning when the data
+is not readily available from the start, or when the data does not fit into memory.
+
 .. topic:: References:
 
     .. [1] `"Learning the parts of objects by non-negative matrix factorization"
@@ -945,6 +968,9 @@ stored components::
            the beta-divergence" <1010.1763>`
            C. Fevotte, J. Idier, 2011
 
+    .. [7] :arxiv:`"Online algorithms for nonnegative matrix factorization with the
+           Itakura-Saito divergence" <1106.4198>`
+           A. Lefevre, F. Bach, C. Fevotte, 2011
 
 .. _LatentDirichletAllocation:

doc/whats_new/v1.1.rst (+5 lines)

@@ -288,6 +288,11 @@ Changelog
 :mod:`sklearn.decomposition`
 ............................
 
+- |MajorFeature| Added a new estimator :class:`decomposition.MiniBatchNMF`. It is a
+  faster but less accurate version of non-negative matrix factorization, better suited
+  for large datasets. :pr:`16948` by :user:`Chiara Marmo <cmarmo>`,
+  :user:`Patricio Cerda <pcerda>` and :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 - |Enhancement| :class:`decomposition.PCA` exposes a parameter `n_oversamples` to tune
   :func:`sklearn.decomposition.randomized_svd` and
   get accurate results when the number of features is large.

examples/applications/plot_topics_extraction_with_nmf_lda.py (+72 −3 lines)

@@ -30,13 +30,15 @@
 import matplotlib.pyplot as plt
 
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from sklearn.decomposition import NMF, LatentDirichletAllocation
+from sklearn.decomposition import NMF, MiniBatchNMF, LatentDirichletAllocation
 from sklearn.datasets import fetch_20newsgroups
 
 n_samples = 2000
 n_features = 1000
 n_components = 10
 n_top_words = 20
+batch_size = 128
+init = "nndsvda"
 
 
 def plot_top_words(model, feature_names, n_top_words, title):
@@ -101,7 +103,15 @@ def plot_top_words(model, feature_names, n_top_words, title):
     "n_samples=%d and n_features=%d..." % (n_samples, n_features)
 )
 t0 = time()
-nmf = NMF(n_components=n_components, random_state=1, alpha=0.1, l1_ratio=0.5).fit(tfidf)
+nmf = NMF(
+    n_components=n_components,
+    random_state=1,
+    init=init,
+    beta_loss="frobenius",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=1,
+).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))
 
 
@@ -121,10 +131,12 @@ def plot_top_words(model, feature_names, n_top_words, title):
 nmf = NMF(
     n_components=n_components,
     random_state=1,
+    init=init,
     beta_loss="kullback-leibler",
     solver="mu",
     max_iter=1000,
-    alpha=0.1,
+    alpha_W=0.00005,
+    alpha_H=0.00005,
     l1_ratio=0.5,
 ).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))
@@ -137,6 +149,63 @@ def plot_top_words(model, feature_names, n_top_words, title):
     "Topics in NMF model (generalized Kullback-Leibler divergence)",
 )
 
+# Fit the MiniBatchNMF model
+print(
+    "\n" * 2,
+    "Fitting the MiniBatchNMF model (Frobenius norm) with tf-idf "
+    "features, n_samples=%d and n_features=%d, batch_size=%d..."
+    % (n_samples, n_features, batch_size),
+)
+t0 = time()
+mbnmf = MiniBatchNMF(
+    n_components=n_components,
+    random_state=1,
+    batch_size=batch_size,
+    init=init,
+    beta_loss="frobenius",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=0.5,
+).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+
+tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+plot_top_words(
+    mbnmf,
+    tfidf_feature_names,
+    n_top_words,
+    "Topics in MiniBatchNMF model (Frobenius norm)",
+)
+
+# Fit the MiniBatchNMF model
+print(
+    "\n" * 2,
+    "Fitting the MiniBatchNMF model (generalized Kullback-Leibler "
+    "divergence) with tf-idf features, n_samples=%d and n_features=%d, "
+    "batch_size=%d..." % (n_samples, n_features, batch_size),
+)
+t0 = time()
+mbnmf = MiniBatchNMF(
+    n_components=n_components,
+    random_state=1,
+    batch_size=batch_size,
+    init=init,
+    beta_loss="kullback-leibler",
+    alpha_W=0.00005,
+    alpha_H=0.00005,
+    l1_ratio=0.5,
+).fit(tfidf)
+print("done in %0.3fs." % (time() - t0))
+
+tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+plot_top_words(
+    mbnmf,
+    tfidf_feature_names,
+    n_top_words,
+    "Topics in MiniBatchNMF model (generalized Kullback-Leibler divergence)",
+)
+
 print(
     "\n" * 2,
     "Fitting LDA models with tf features, n_samples=%d and n_features=%d..."

sklearn/decomposition/__init__.py (+6 −1 lines)

@@ -5,7 +5,11 @@
 """
 
 
-from ._nmf import NMF, non_negative_factorization
+from ._nmf import (
+    NMF,
+    MiniBatchNMF,
+    non_negative_factorization,
+)
 from ._pca import PCA
 from ._incremental_pca import IncrementalPCA
 from ._kernel_pca import KernelPCA
@@ -31,6 +35,7 @@
     "IncrementalPCA",
     "KernelPCA",
     "MiniBatchDictionaryLearning",
+    "MiniBatchNMF",
     "MiniBatchSparsePCA",
     "NMF",
     "PCA",

0 commit comments