scikit-learn
diff --git a/sklearn/decomposition/cdnmf_fast.pyx b/sklearn/decomposition/cdnmf_fast.pyx
Lines changed: 56 additions & 1 deletion
diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py
Lines changed: 58 additions & 12 deletions
@@ -6,7 +6,7 @@
 # License: BSD 3 clause
 
 cimport cython
-from libc.math cimport fabs
+from libc.math cimport fabs, log
 
 
 def _update_cdnmf_fast(double[:, ::1] W, double[:, :] HHt, double[:, :] XHt,
@@ -39,3 +39,58 @@ def _update_cdnmf_fast(double[:, ::1] W, double[:, :] HHt, double[:, :] XHt,
                     W[i, t] = max(W[i, t] - grad / hess, 0.)
 
     return violation
+
+
+def _update_cdkl_fast(double[:, :] X, double[:, ::1] W, double[:, ::1] Ht,
+                      double[:, :] WH, Py_ssize_t[::1] permutation,
+                      double reset, double epsilon, int max_iter):
+    """One KL coordinate-descent sweep over W.
+
+    W and WH are updated in place; X, Ht and permutation are only read.
+    At most max_iter Newton steps are taken per coordinate W[i, r]; when
+    the second derivative is zero the coordinate is moved to reset instead.
+    epsilon is accepted but currently unused (early stopping is disabled).
+    Returns the sum of |projected gradient| over the Newton steps taken.
+    """
+    cdef double violation = 0
+    cdef int n_components = W.shape[1]
+    cdef int n_samples = W.shape[0]  # n_features for H update
+    cdef int n_features = Ht.shape[0]  # n_samples for H update
+    cdef double grad, hess, s, temp, div, pg
+    cdef int j, i, rr, r, n_iter
+
+    with nogil:
+        for rr in range(n_components):
+            r = permutation[rr]
+            for i in range(n_samples):
+                for n_iter in range(max_iter):
+                    grad = 0.
+                    hess = 0.
+                    # first and second derivatives w.r.t. W[i, r]
+                    # NOTE(review): WH[i, j] == 0 is not guarded against
+                    for j in range(n_features):
+                        div = Ht[j, r] / WH[i, j]
+                        temp = X[i, j] * div
+                        grad += Ht[j, r] - temp
+                        hess += temp * div
+
+                    if grad == 0:
+                        break
+
+                    if hess == 0:
+                        # no curvature: move the coordinate to reset
+                        s = reset - W[i, r]
+                    else:
+                        # Newton step, clipped so W[i, r] stays >= 0
+                        s = max(-grad / hess, -W[i, r])
+
+                        # projected gradient
+                        pg = min(0., grad) if W[i, r] == 0 else grad
+                        violation += fabs(pg)
+
+                    # maintain WH, then apply the step to W
+                    for j in range(n_features):
+                        WH[i, j] += s * Ht[j, r]
+                    W[i, r] += s
+
+    return violation
@@ -28,7 +28,7 @@
 from ..utils.validation import check_is_fitted, check_non_negative
 from ..utils import deprecated
 from ..exceptions import ConvergenceWarning
-from .cdnmf_fast import _update_cdnmf_fast
+from .cdnmf_fast import _update_cdnmf_fast, _update_cdkl_fast
 
 EPSILSON = 1e-9
 
@@ -609,9 +609,28 @@ def _update_coordinate_descent(X, W, Ht, l1_reg, l2_reg, shuffle,
     return _update_cdnmf_fast(W, HHt, XHt, permutation)
 
 
-def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0,
-                            l1_reg_H=0, l2_reg_W=0, l2_reg_H=0, update_H=True,
-                            verbose=0, shuffle=False, random_state=None):
+def _update_coordinate_descent_kl(X, W, Ht, l1_reg, l2_reg, shuffle,
+                                  random_state, WH, reset):
+    """Run one KL coordinate-descent sweep over W via _update_cdkl_fast.
+
+    TODO add regularization: l1_reg and l2_reg are accepted but unused.
+    """
+    n_components = Ht.shape[1]
+    if shuffle:
+        order = random_state.permutation(n_components)
+    else:
+        order = np.arange(n_components)
+    # The intp cast seems to be required on 64-bit Windows w/ Python 3.5.
+    order = np.asarray(order, dtype=np.intp)
+
+    # trailing positional arguments: epsilon=0., max_iter=1
+    return _update_cdkl_fast(X, W, Ht, WH, order, reset, 0., 1)
+
+
+def _fit_coordinate_descent(X, W, H, beta_loss='frobenius', tol=1e-4,
+                            max_iter=200, l1_reg_W=0, l1_reg_H=0, l2_reg_W=0,
+                            l2_reg_H=0, update_H=True, verbose=0,
+                            shuffle=False, random_state=None):
     """Compute Non-negative Matrix Factorization (NMF) with Coordinate Descent
 
     The objective function is minimized with an alternating minimization of W
@@ -680,21 +699,46 @@ def _fit_coordinate_descent(X, W, H, tol=1e-4, max_iter=200, l1_reg_W=0,
     """
     # so W and Ht are both in C order in memory
     Ht = check_array(H.T, order='C')
-    X = check_array(X, accept_sparse='csr')
+
+    beta_loss = _beta_loss_to_float(beta_loss)
+    if beta_loss == 2:
+        X = check_array(X, accept_sparse='csr')
+    elif beta_loss == 1:
+        X = check_array(X, dtype=np.float64, accept_sparse=None)
+        WH = fast_dot(W, Ht.T)
+        reset = X.mean() * 0.01
+    else:
+        raise ValueError("cd solver only accepts beta_loss in ('frobenius', "
+                         "'kullback-leibler', 2, 1). Got %s" % str(beta_loss))
 
     rng = check_random_state(random_state)
 
     for n_iter in range(max_iter):
         violation = 0.
 
         # Update W
-        violation += _update_coordinate_descent(X, W, Ht, l1_reg_W,
-                                                l2_reg_W, shuffle, rng)
+        if beta_loss == 2:
+            violation += _update_coordinate_descent(X, W, Ht, l1_reg_W,
+                                                    l2_reg_W,
+                                                    shuffle, rng)
+        else:
+            violation += _update_coordinate_descent_kl(X, W, Ht, l1_reg_W,
+                                                       l2_reg_W,
+                                                       shuffle, rng, WH,
+                                                       reset)
         # Update H
         if update_H:
-            violation += _update_coordinate_descent(X.T, Ht, W, l1_reg_H,
-                                                    l2_reg_H, shuffle, rng)
-
+            if beta_loss == 2:
+                violation += _update_coordinate_descent(X.T, Ht, W,
+                                                        l1_reg_H,
+                                                        l2_reg_H,
+                                                        shuffle, rng)
+            else:
+                violation += _update_coordinate_descent_kl(X.T, Ht, W,
+                                                           l1_reg_H,
+                                                           l2_reg_H,
+                                                           shuffle, rng,
+                                                           WH.T, reset)
         if n_iter == 0:
             violation_init = violation
 
@@ -1185,7 +1229,8 @@ def non_negative_factorization(X, W=None, H=None, n_components=None,
                                                         sparseness, beta,
                                                         eta)
     elif solver == 'cd':
-        W, H, n_iter = _fit_coordinate_descent(X, W, H, tol, max_iter,
+        W, H, n_iter = _fit_coordinate_descent(X, W, H, beta_loss,
+                                               tol, max_iter,
                                                l1_reg_W, l1_reg_H,
                                                l2_reg_W, l2_reg_H,
                                                update_H=update_H,
@@ -1306,7 +1351,8 @@ class NMF(BaseEstimator, TransformerMixin):
         For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
 
         .. versionadded:: 0.17
-           Regularization parameter *l1_ratio* used in the Coordinate Descent solver.
+           Regularization parameter *l1_ratio* used in the
+           Coordinate Descent solver.
 
     shuffle : boolean, default: False
         If true, randomize the order of coordinates in the CD solver.