diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py
index ba15dca1cfb67..1e54b570c9d8a 100644
--- a/benchmarks/bench_plot_nmf.py
+++ b/benchmarks/bench_plot_nmf.py
@@ -1,168 +1,426 @@
 """
 Benchmarks of Non-Negative Matrix Factorization
 """
+# Authors: Tom Dupre la Tour (benchmark)
+#          Chih-Jen Lin (original projected gradient NMF implementation)
+#          Anthony Di Franco (projected gradient, Python and NumPy port)
+# License: BSD 3 clause
 from __future__ import print_function
-
-from collections import defaultdict
-import gc
 from time import time
-
-import six
+import sys
+import warnings
+import numbers
 import numpy as np
-from scipy.linalg import norm
+import matplotlib.pyplot as plt
+import pandas
+
+from sklearn.utils.testing import ignore_warnings
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition.nmf import NMF
+from sklearn.decomposition.nmf import _initialize_nmf
+from sklearn.decomposition.nmf import _beta_divergence
+from sklearn.decomposition.nmf import INTEGER_TYPES, _check_init
+from sklearn.externals.joblib import Memory
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.utils.extmath import fast_dot, safe_sparse_dot, squared_norm
+from sklearn.utils import check_array
+from sklearn.utils.validation import check_is_fitted, check_non_negative
+
+
+mem = Memory(cachedir='.', verbose=0)
 
-from sklearn.decomposition.nmf import NMF, _initialize_nmf
-from sklearn.datasets.samples_generator import make_low_rank_matrix
-from sklearn.externals.six.moves import xrange
+###################
+# Start of _PGNMF #
+###################
+# This class implements a projected gradient solver for the NMF.
+# The projected gradient solver was removed from scikit-learn in version 0.19,
+# and a simplified copy is kept here for comparison purposes only.
+# It is not tested, and it may change or disappear without notice.
 
 
-def alt_nnmf(V, r, max_iter=1000, tol=1e-3, init='random'):
+def _norm(x):
+    """Dot product-based Euclidean norm implementation
+
+    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
     """
-    A, S = nnmf(X, r, tol=1e-3, R=None)
+    return np.sqrt(squared_norm(x))
 
-    Implement Lee & Seung's algorithm
 
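+# The subproblem solved by _nls_subproblem below is, for a fixed matrix W,
+#     min_{H >= 0} 0.5 * ||X - W H||_Fro^2   (+ optional L1/L2 penalties)
+# Each iteration takes a projected gradient step, equivalent to
+#     H = np.maximum(H - gamma * grad, 0)
+# with the step size gamma adjusted by a backtracking line search (Lin, 2007).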
+def _nls_subproblem(X, W, H, tol, max_iter, alpha=0., l1_ratio=0.,
+                    sigma=0.01, beta=0.1):
+    """Non-negative least squares solver
+
+    Solves a non-negative least squares subproblem using the projected
+    gradient descent algorithm.
+
     Parameters
     ----------
-    V : 2-ndarray, [n_samples, n_features]
-        input matrix
-    r : integer
-        number of latent features
-    max_iter : integer, optional
-        maximum number of iterations (default: 1000)
-    tol : double
-        tolerance threshold for early exit (when the update factor is within
-        tol of 1., the function exits)
-    init : string
-        Method used to initialize the procedure.
-
+    X : array-like, shape (n_samples, n_features)
+        Constant matrix.
+    W : array-like, shape (n_samples, n_components)
+        Constant matrix.
+    H : array-like, shape (n_components, n_features)
+        Initial guess for the solution.
+    tol : float
+        Tolerance of the stopping condition.
+    max_iter : int
+        Maximum number of iterations before timing out.
+    alpha : double, default: 0.
+        Constant that multiplies the regularization terms. Set it to zero to
+        have no regularization.
+    l1_ratio : double, default: 0.
+        The regularization mixing parameter, with 0 <= l1_ratio <= 1.
+        For l1_ratio = 0 the penalty is an L2 penalty.
+        For l1_ratio = 1 it is an L1 penalty.
+        For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
+    sigma : float
+        Constant used in the sufficient decrease condition checked by the line
+        search. Smaller values lead to a looser sufficient decrease condition,
+        thus reducing the time taken by the line search, but potentially
+        increasing the number of iterations of the projected gradient
+        procedure. 0.01 is a commonly used value in the optimization
+        literature.
+    beta : float
+        Factor by which the step size is decreased (resp. increased) until
+        (resp. as long as) the sufficient decrease condition is satisfied.
+        Larger values allow finding a better step size but lead to a longer
+        line search. 0.1 is a commonly used value in the optimization
+        literature.
     Returns
     -------
-    A : 2-ndarray, [n_samples, r]
-        Component part of the factorization
-
-    S : 2-ndarray, [r, n_features]
-        Data part of the factorization
-    Reference
-    ---------
-    "Algorithms for Non-negative Matrix Factorization"
-    by Daniel D Lee, Sebastian H Seung
-    (available at http://citeseer.ist.psu.edu/lee01algorithms.html)
+    H : array-like, shape (n_components, n_features)
+        Solution to the non-negative least squares problem.
+    grad : array-like, shape (n_components, n_features)
+        The gradient.
+    n_iter : int
+        The number of iterations done by the algorithm.
+    References
+    ----------
+    C.-J. Lin. Projected gradient methods for non-negative matrix
+    factorization. Neural Computation, 19(2007), 2756-2779.
+    http://www.csie.ntu.edu.tw/~cjlin/nmf/
     """
-    # Nomenclature in the function follows Lee & Seung
-    eps = 1e-5
-    n, m = V.shape
-    W, H = _initialize_nmf(V, r, init, random_state=0)
-
-    for i in xrange(max_iter):
-        updateH = np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps)
-        H *= updateH
-        updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
-        W *= updateW
-        if i % 10 == 0:
-            max_update = max(updateW.max(), updateH.max())
-            if abs(1. - max_update) < tol:
+    WtX = safe_sparse_dot(W.T, X)
+    WtW = fast_dot(W.T, W)
+
+    # values justified in the paper (alpha is renamed gamma)
+    gamma = 1
+    for n_iter in range(1, max_iter + 1):
+        grad = np.dot(WtW, H) - WtX
+        if alpha > 0 and l1_ratio == 1.:
+            grad += alpha
+        elif alpha > 0:
+            grad += alpha * (l1_ratio + (1 - l1_ratio) * H)
+
+        # The following multiplication with a boolean array is more than twice
+        # as fast as indexing into grad.
+        if _norm(grad * np.logical_or(grad < 0, H > 0)) < tol:
+            break
+
+        Hp = H
+
+        for inner_iter in range(20):
+            # Gradient step.
+            Hn = H - gamma * grad
+            # Projection step.
+            Hn *= Hn > 0
+            d = Hn - H
+            gradd = np.dot(grad.ravel(), d.ravel())
+            dQd = np.dot(np.dot(WtW, d).ravel(), d.ravel())
+            suff_decr = (1 - sigma) * gradd + 0.5 * dQd < 0
+            if inner_iter == 0:
+                decr_gamma = not suff_decr
+
+            if decr_gamma:
+                if suff_decr:
+                    H = Hn
+                    break
+                else:
+                    gamma *= beta
+            elif not suff_decr or (Hp == Hn).all():
+                H = Hp
                 break
-    return W, H
-
-
-def report(error, time):
-    print("Frobenius loss: %.5f" % error)
-    print("Took: %.2fs" % time)
-    print()
-
-
-def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
-    timeset = defaultdict(lambda: [])
-    err = defaultdict(lambda: [])
-
-    for n_samples in samples_range:
-        for n_features in features_range:
-            print("%2d samples, %2d features" % (n_samples, n_features))
-            print('=======================')
-            X = np.abs(make_low_rank_matrix(n_samples, n_features,
-                       effective_rank=rank, tail_strength=0.2))
-
-            gc.collect()
-            print("benchmarking nndsvd-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
-            tend = time() - tstart
-            timeset['nndsvd-nmf'].append(tend)
-            err['nndsvd-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking nndsvda-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, init='nndsvda',
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['nndsvda-nmf'].append(tend)
-            err['nndsvda-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking nndsvdar-nmf: ")
-            tstart = time()
-            m = NMF(n_components=30, init='nndsvdar',
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['nndsvdar-nmf'].append(tend)
-            err['nndsvdar-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking random-nmf")
-            tstart = time()
-            m = NMF(n_components=30, init='random', max_iter=1000,
-                    tol=tolerance).fit(X)
-            tend = time() - tstart
-            timeset['random-nmf'].append(tend)
-            err['random-nmf'].append(m.reconstruction_err_)
-            report(m.reconstruction_err_, tend)
-
-            gc.collect()
-            print("benchmarking alt-random-nmf")
-            tstart = time()
-            W, H = alt_nnmf(X, r=30, init='random', tol=tolerance)
-            tend = time() - tstart
-            timeset['alt-random-nmf'].append(tend)
-            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
-            report(norm(X - np.dot(W, H)), tend)
-
-    return timeset, err
+            else:
+                gamma /= beta
+                Hp = Hn
+
+    if n_iter == max_iter:
+        warnings.warn("Iteration limit reached in nls subproblem.",
+                      ConvergenceWarning)
+
+    return H, grad, n_iter
+
+
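+# _fit_projected_gradient alternates between the two convex NLS subproblems:
+# W is updated with H held fixed, then H with W held fixed. The W update
+# reuses _nls_subproblem through the transposed identity
+#     ||X - W H||_Fro == ||X.T - H.T W.T||_Fro
+# hence the calls on (X.T, H.T, W.T) with the result transposed back.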
+def _fit_projected_gradient(X, W, H, tol, max_iter, nls_max_iter, alpha,
+                            l1_ratio):
+    gradW = (np.dot(W, np.dot(H, H.T)) -
+             safe_sparse_dot(X, H.T, dense_output=True))
+    gradH = (np.dot(np.dot(W.T, W), H) -
+             safe_sparse_dot(W.T, X, dense_output=True))
+
+    init_grad = squared_norm(gradW) + squared_norm(gradH.T)
+    # max(0.001, tol) to force alternating minimizations of W and H
+    tolW = max(0.001, tol) * np.sqrt(init_grad)
+    tolH = tolW
+
+    for n_iter in range(1, max_iter + 1):
+        # stopping condition as discussed in the paper
+        proj_grad_W = squared_norm(gradW * np.logical_or(gradW < 0, W > 0))
+        proj_grad_H = squared_norm(gradH * np.logical_or(gradH < 0, H > 0))
+
+        if (proj_grad_W + proj_grad_H) / init_grad < tol ** 2:
+            break
+
+        # update W
+        Wt, gradWt, iterW = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
+                                            alpha=alpha, l1_ratio=l1_ratio)
+        W, gradW = Wt.T, gradWt.T
+
+        if iterW == 1:
+            tolW = 0.1 * tolW
+
+        # update H
+        H, gradH, iterH = _nls_subproblem(X, W, H,
+                                          tolH, nls_max_iter,
+                                          alpha=alpha, l1_ratio=l1_ratio)
+        if iterH == 1:
+            tolH = 0.1 * tolH
+
+        H[H == 0] = 0   # fix up negative zeros
+
+    if n_iter == max_iter:
+        Wt, _, _ = _nls_subproblem(X.T, H.T, W.T, tolW, nls_max_iter,
+                                   alpha=alpha, l1_ratio=l1_ratio)
+        W = Wt.T
+
+    return W, H, n_iter
+
+
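+# _PGNMF exposes the projected gradient solver through the scikit-learn
+# estimator API (fit/transform/fit_transform), so run_bench below can drive
+# it with exactly the same calls as NMF(solver='cd') and NMF(solver='mu').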
+class _PGNMF(NMF):
+    """Non-Negative Matrix Factorization (NMF) with projected gradient solver.
+
+    This class is private and for comparison purposes only.
+    It may change or disappear without notice.
+
+    """
+    def __init__(self, n_components=None, solver='pg', init=None,
+                 tol=1e-4, max_iter=200, random_state=None,
+                 alpha=0., l1_ratio=0., nls_max_iter=10):
+        self.nls_max_iter = nls_max_iter
+        self.n_components = n_components
+        self.init = init
+        self.solver = solver
+        self.tol = tol
+        self.max_iter = max_iter
+        self.random_state = random_state
+        self.alpha = alpha
+        self.l1_ratio = l1_ratio
+
+    def fit(self, X, y=None, **params):
+        self.fit_transform(X, **params)
+        return self
+
+    def transform(self, X):
+        check_is_fitted(self, 'components_')
+        H = self.components_
+        W, _, self.n_iter_ = self._fit_transform(X, H=H, update_H=False)
+        return W
+
+    def inverse_transform(self, W):
+        check_is_fitted(self, 'components_')
+        return np.dot(W, self.components_)
+
+    def fit_transform(self, X, y=None, W=None, H=None):
+        W, H, self.n_iter_ = self._fit_transform(X, W=W, H=H, update_H=True)
+        self.components_ = H
+        return W
+
+    def _fit_transform(self, X, y=None, W=None, H=None, update_H=True):
+        X = check_array(X, accept_sparse=('csr', 'csc'))
+        check_non_negative(X, "NMF (input X)")
+
+        n_samples, n_features = X.shape
+        n_components = self.n_components
+        if n_components is None:
+            n_components = n_features
+
+        if (not isinstance(n_components, INTEGER_TYPES) or
+                n_components <= 0):
+            raise ValueError("Number of components must be a positive"
+                             " integer; got (n_components=%r)" % n_components)
+        if not isinstance(self.max_iter, INTEGER_TYPES) or self.max_iter < 0:
+            raise ValueError("Maximum number of iterations must be a positive"
+                             " integer; got (max_iter=%r)" % self.max_iter)
+        if not isinstance(self.tol, numbers.Number) or self.tol < 0:
+            raise ValueError("Tolerance for stopping criteria must be "
+                             "positive; got (tol=%r)" % self.tol)
+
+        # check W and H, or initialize them
+        if self.init == 'custom' and update_H:
+            _check_init(H, (n_components, n_features), "NMF (input H)")
+            _check_init(W, (n_samples, n_components), "NMF (input W)")
+        elif not update_H:
+            _check_init(H, (n_components, n_features), "NMF (input H)")
+            W = np.zeros((n_samples, n_components))
+        else:
+            W, H = _initialize_nmf(X, n_components, init=self.init,
+                                   random_state=self.random_state)
+
+        if update_H:  # fit_transform
+            W, H, n_iter = _fit_projected_gradient(
+                X, W, H, self.tol, self.max_iter, self.nls_max_iter,
+                self.alpha, self.l1_ratio)
+        else:  # transform
+            Wt, _, n_iter = _nls_subproblem(X.T, H.T, W.T, self.tol,
+                                            self.nls_max_iter,
+                                            alpha=self.alpha,
+                                            l1_ratio=self.l1_ratio)
+            W = Wt.T
+
+        if n_iter == self.max_iter and self.tol > 0:
+            warnings.warn("Maximum number of iterations %d reached. Increase"
+                          " it to improve convergence." % self.max_iter,
+                          ConvergenceWarning)
+
+        return W, H, n_iter
+
+#################
+# End of _PGNMF #
+#################
+
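+# Minimal usage sketch for _PGNMF (not executed by this benchmark; assumes
+# X is a non-negative 2d array):
+#     est = _PGNMF(n_components=10, tol=1e-4, max_iter=200)
+#     W = est.fit_transform(X)
+#     H = est.components_
+#     X_approx = np.dot(W, H)
+
+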
+def plot_results(results_df, plot_name):
+    if results_df is None:
+        return None
+
+    plt.figure(figsize=(16, 6))
+    colors = 'bgr'
+    markers = 'ovs'
+    ax = plt.subplot(1, 3, 1)
+    for i, init in enumerate(np.unique(results_df['init'])):
+        plt.subplot(1, 3, i + 1, sharex=ax, sharey=ax)
+        for j, method in enumerate(np.unique(results_df['method'])):
+            mask = np.logical_and(results_df['init'] == init,
+                                  results_df['method'] == method)
+            selected_items = results_df[mask]
+
+            plt.plot(selected_items['time'], selected_items['loss'],
+                     color=colors[j % len(colors)], ls='-',
+                     marker=markers[j % len(markers)],
+                     label=method)
+
+        plt.legend(loc=0, fontsize='x-small')
+        plt.xlabel("Time (s)")
+        plt.ylabel("loss")
+        plt.title("%s" % init)
+    plt.suptitle(plot_name, fontsize=16)
+
+
+# use joblib to cache the results.
+# X_shape is specified in the arguments to avoid hashing X itself.
+@ignore_warnings(category=ConvergenceWarning)
+@mem.cache(ignore=['X', 'W0', 'H0'])
+def bench_one(name, X, W0, H0, X_shape, clf_type, clf_params, init,
+              n_components, random_state):
+    W = W0.copy()
+    H = H0.copy()
+
+    clf = clf_type(**clf_params)
+    st = time()
+    W = clf.fit_transform(X, W=W, H=H)
+    end = time()
+    H = clf.components_
+
+    this_loss = _beta_divergence(X, W, H, 2.0, True)
+    duration = end - st
+    return this_loss, duration
+
+
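+# run_bench fits every (solver, init, max_iter) combination from the same
+# precomputed (W, H) starting point (init='custom'), so the loss-versus-time
+# curves are directly comparable across solvers. bench_one reports
+# _beta_divergence(X, W, H, 2.0, True), i.e. the beta=2 divergence with
+# square_root=True, which equals the Frobenius norm of the residual
+#     ||X - W H||_Fro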
+def run_bench(X, clfs, plot_name, n_components, tol, alpha, l1_ratio):
+    start = time()
+    results = []
+    for name, clf_type, iter_range, clf_params in clfs:
+        print("Training %s:" % name)
+        for rs, init in enumerate(('nndsvd', 'nndsvdar', 'random')):
+            print("    %s %s: " % (init, " " * (8 - len(init))), end="")
+            W, H = _initialize_nmf(X, n_components, init, 1e-6, rs)
+
+            for max_iter in iter_range:
+                clf_params['alpha'] = alpha
+                clf_params['l1_ratio'] = l1_ratio
+                clf_params['max_iter'] = max_iter
+                clf_params['tol'] = tol
+                clf_params['random_state'] = rs
+                clf_params['init'] = 'custom'
+                clf_params['n_components'] = n_components
+
+                this_loss, duration = bench_one(name, X, W, H, X.shape,
+                                                clf_type, clf_params,
+                                                init, n_components, rs)
+
+                init_name = "init='%s'" % init
+                results.append((name, this_loss, duration, init_name))
+                # print("loss: %.6f, time: %.3f sec" % (this_loss, duration))
+                print(".", end="")
+                sys.stdout.flush()
+            print(" ")
+
+    # Use a pandas dataframe to organize the results
+    results_df = pandas.DataFrame(results,
+                                  columns="method loss time init".split())
+    print("Total time = %0.3f sec\n" % (time() - start))
+
+    # plot the results
+    plot_results(results_df, plot_name)
+    return results_df
+
+
+def load_20news():
+    print("Loading 20 newsgroups dataset")
+    print("-----------------------------")
+    from sklearn.datasets import fetch_20newsgroups
+    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
+                                 remove=('headers', 'footers', 'quotes'))
+    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
+    tfidf = vectorizer.fit_transform(dataset.data)
+    return tfidf
+
+
+def load_faces():
+    print("Loading Olivetti face dataset")
+    print("-----------------------------")
+    from sklearn.datasets import fetch_olivetti_faces
+    faces = fetch_olivetti_faces(shuffle=True)
+    return faces.data
+
+
+def build_clfs(cd_iters, pg_iters, mu_iters):
+    clfs = [("Coordinate Descent", NMF, cd_iters, {'solver': 'cd'}),
+            ("Projected Gradient", _PGNMF, pg_iters, {'solver': 'pg'}),
+            ("Multiplicative Update", NMF, mu_iters, {'solver': 'mu'}),
+            ]
+    return clfs
 
 
 if __name__ == '__main__':
-    from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
-    axes3d
-    import matplotlib.pyplot as plt
-
-    samples_range = np.linspace(50, 500, 3).astype(np.int)
-    features_range = np.linspace(50, 500, 3).astype(np.int)
-    timeset, err = benchmark(samples_range, features_range)
-
-    for i, results in enumerate((timeset, err)):
-        fig = plt.figure('scikit-learn Non-Negative Matrix Factorization'
-                         'benchmark results')
-        ax = fig.gca(projection='3d')
-        for c, (label, timings) in zip('rbgcm', sorted(six.iteritems(results))):
-            X, Y = np.meshgrid(samples_range, features_range)
-            Z = np.asarray(timings).reshape(samples_range.shape[0],
-                                            features_range.shape[0])
-            # plot the actual surface
-            ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3,
-                            color=c)
-            # dummy point plot to stick the legend to since surface plot do not
-            # support legends (yet?)
-            ax.plot([1], [1], [1], color=c, label=label)
-
-        ax.set_xlabel('n_samples')
-        ax.set_ylabel('n_features')
-        zlabel = 'Time (s)' if i == 0 else 'reconstruction error'
-        ax.set_zlabel(zlabel)
-        ax.legend()
-        plt.show()
+    alpha = 0.
+    l1_ratio = 0.5
+    n_components = 10
+    tol = 1e-15
+
+    # first benchmark on the 20 newsgroups dataset: sparse, shape(11314, 39116)
+    plot_name = "20 Newsgroups sparse dataset"
+    cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 6)
+    mu_iters = np.arange(1, 30)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
+    X_20news = load_20news()
+    run_bench(X_20news, clfs, plot_name, n_components, tol, alpha, l1_ratio)
+
+    # second benchmark on the Olivetti faces dataset: dense, shape(400, 4096)
+    plot_name = "Olivetti Faces dense dataset"
+    cd_iters = np.arange(1, 30)
+    pg_iters = np.arange(1, 12)
+    mu_iters = np.arange(1, 30)
+    clfs = build_clfs(cd_iters, pg_iters, mu_iters)
+    X_faces = load_faces()
+    run_bench(X_faces, clfs, plot_name, n_components, tol, alpha, l1_ratio)
+
+    plt.show()
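+
+    # Note: bench_one results are cached on disk by joblib (mem.cache above,
+    # cachedir='.'), so re-running the script with identical parameters
+    # reuses previous fits and only regenerates the plots.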