
Commit d44bf89

garyForeman authored and Sundrique committed
[MRG + 1] Fix perplexity method by adding _unnormalized_transform method, Issue scikit-learn#7954 (scikit-learn#7992)
Also deprecate doc_topic_distr argument in perplexity method
1 parent a7e6bc0 commit d44bf89
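For orientation, here is a minimal sketch of the call pattern this commit fixes. It is not part of the commit; the toy data, the n_topics value, and the print call are illustrative only, and parameter names follow the scikit-learn 0.19-era API used in this diff:

    import numpy as np
    from sklearn.decomposition import LatentDirichletAllocation

    # Toy document-word matrix; values are word counts per document.
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))

    lda = LatentDirichletAllocation(n_topics=3, learning_method='batch',
                                    random_state=0).fit(X)

    # After this commit, perplexity() recomputes the *unnormalized* document
    # topic distribution internally (via _unnormalized_transform), so no
    # distribution is passed in. Before the fix, it called transform(), whose
    # output has been row-normalized since 0.18, which skewed the bound.
    print(lda.perplexity(X))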

File tree

3 files changed: +128 −28 lines changed

doc/whats_new.rst

Lines changed: 11 additions & 0 deletions

@@ -122,6 +122,11 @@ Bug fixes
      when a numpy array is passed in for weights. :issue:`7983` by
      :user:`Vincent Pham <vincentpham1991>`.
 
+   - Fix a bug in :class:`sklearn.decomposition.LatentDirichletAllocation`
+     where the ``perplexity`` method was returning incorrect results because
+     the ``transform`` method returns normalized document topic distributions
+     as of version 0.18. :issue:`7954` by :user:`Gary Foreman <garyForeman>`.
+
    - Fix a bug where :class:`sklearn.ensemble.GradientBoostingClassifier` and
      :class:`sklearn.ensemble.GradientBoostingRegressor` ignored the
      ``min_impurity_split`` parameter.

@@ -135,6 +140,12 @@ API changes summary
      ensemble estimators (deriving from :class:`ensemble.BaseEnsemble`)
      now only have ``self.estimators_`` available after ``fit``.
      :issue:`7464` by `Lars Buitinck`_ and `Loic Esteve`_.
+
+   - Deprecate the ``doc_topic_distr`` argument of the ``perplexity`` method
+     in :class:`sklearn.decomposition.LatentDirichletAllocation` because the
+     user no longer has access to the unnormalized document topic distribution
+     needed for the perplexity calculation. :issue:`7954` by
+     :user:`Gary Foreman <garyForeman>`.
 
 .. _changes_0_18_1:
sklearn/decomposition/online_lda.py

Lines changed: 71 additions & 12 deletions

@@ -505,7 +505,7 @@ def fit(self, X, y=None):
             warnings.warn("The default value for 'learning_method' will be "
                           "changed from 'online' to 'batch' in the release 0.20. "
                           "This warning was introduced in 0.18.",
-                         DeprecationWarning)
+                          DeprecationWarning)
             learning_method = 'online'
 
         batch_size = self.batch_size

@@ -531,8 +531,8 @@ def fit(self, X, y=None):
                     doc_topics_distr, _ = self._e_step(X, cal_sstats=False,
                                                        random_init=False,
                                                        parallel=parallel)
-                    bound = self.perplexity(X, doc_topics_distr,
-                                            sub_sampling=False)
+                    bound = self._perplexity_precomp_distr(X, doc_topics_distr,
+                                                           sub_sampling=False)
                     if self.verbose:
                         print('iteration: %d, perplexity: %.4f'
                               % (i + 1, bound))

@@ -541,10 +541,18 @@ def fit(self, X, y=None):
                         break
                     last_bound = bound
                 self.n_iter_ += 1
+
+            # calculate final perplexity value on train set
+            doc_topics_distr, _ = self._e_step(X, cal_sstats=False,
+                                               random_init=False,
+                                               parallel=parallel)
+            self.bound_ = self._perplexity_precomp_distr(X, doc_topics_distr,
+                                                         sub_sampling=False)
+
         return self
 
-    def transform(self, X):
-        """Transform data X according to the fitted model.
+    def _unnormalized_transform(self, X):
+        """Transform data X according to fitted model.
 
         Parameters
         ----------

@@ -556,7 +564,6 @@ def transform(self, X):
         doc_topic_distr : shape=(n_samples, n_topics)
             Document topic distribution for X.
         """
-
         if not hasattr(self, 'components_'):
             raise NotFittedError("no 'components_' attribute in model."
                                  " Please fit model first.")

@@ -572,7 +579,26 @@ def transform(self, X):
 
         doc_topic_distr, _ = self._e_step(X, cal_sstats=False,
                                           random_init=False)
-        # normalize doc_topic_distr
+
+        return doc_topic_distr
+
+    def transform(self, X):
+        """Transform data X according to the fitted model.
+
+        .. versionchanged:: 0.18
+           *doc_topic_distr* is now normalized
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, shape=(n_samples, n_features)
+            Document word matrix.
+
+        Returns
+        -------
+        doc_topic_distr : shape=(n_samples, n_topics)
+            Document topic distribution for X.
+        """
+        doc_topic_distr = self._unnormalized_transform(X)
         doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]
         return doc_topic_distr
 

@@ -665,15 +691,16 @@ def score(self, X, y=None):
         score : float
             Use approximate bound as score.
         """
-
         X = self._check_non_neg_array(X, "LatentDirichletAllocation.score")
 
-        doc_topic_distr = self.transform(X)
+        doc_topic_distr = self._unnormalized_transform(X)
         score = self._approx_bound(X, doc_topic_distr, sub_sampling=False)
         return score
 
-    def perplexity(self, X, doc_topic_distr=None, sub_sampling=False):
-        """Calculate approximate perplexity for data X.
+    def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
+                                  sub_sampling=False):
+        """Calculate approximate perplexity for data X with ability to accept
+        precomputed doc_topic_distr
 
         Perplexity is defined as exp(-1. * log-likelihood per word)
 

@@ -699,7 +726,7 @@ def perplexity(self, X, doc_topic_distr=None, sub_sampling=False):
                                          "LatentDirichletAllocation.perplexity")
 
         if doc_topic_distr is None:
-            doc_topic_distr = self.transform(X)
+            doc_topic_distr = self._unnormalized_transform(X)
         else:
             n_samples, n_topics = doc_topic_distr.shape
             if n_samples != X.shape[0]:

@@ -719,3 +746,35 @@ def perplexity(self, X, doc_topic_distr=None, sub_sampling=False):
         perword_bound = bound / word_cnt
 
         return np.exp(-1.0 * perword_bound)
+
+    def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False):
+        """Calculate approximate perplexity for data X.
+
+        Perplexity is defined as exp(-1. * log-likelihood per word)
+
+        .. versionchanged:: 0.19
+           *doc_topic_distr* argument has been deprecated because the user
+           no longer has access to the unnormalized distribution
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix, [n_samples, n_features]
+            Document word matrix.
+
+        doc_topic_distr : None or array, shape=(n_samples, n_topics)
+            Document topic distribution.
+            If it is None, it will be generated by applying transform on X.
+
+            .. deprecated:: 0.19
+
+        Returns
+        -------
+        score : float
+            Perplexity score.
+        """
+        if doc_topic_distr != 'deprecated':
+            warnings.warn("Argument 'doc_topic_distr' is deprecated and will "
+                          "be ignored as of 0.19. Support for this argument "
+                          "will be removed in 0.21.", DeprecationWarning)
+
+        return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling)
sklearn/decomposition/tests/test_online_lda.py

Lines changed: 46 additions & 16 deletions

@@ -14,6 +14,7 @@
 from sklearn.utils.testing import assert_greater_equal
 from sklearn.utils.testing import assert_raises_regexp
 from sklearn.utils.testing import if_safe_multiprocessing_with_blas
+from sklearn.utils.testing import assert_warns
 
 from sklearn.exceptions import NotFittedError
 from sklearn.externals.six.moves import xrange

@@ -238,12 +239,12 @@ def test_lda_preplexity_mismatch():
     lda.fit(X)
     # invalid samples
     invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics))
-    assert_raises_regexp(ValueError, r'Number of samples', lda.perplexity, X,
-                         invalid_n_samples)
+    assert_raises_regexp(ValueError, r'Number of samples',
+                         lda._perplexity_precomp_distr, X, invalid_n_samples)
     # invalid topic number
     invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1))
-    assert_raises_regexp(ValueError, r'Number of topics', lda.perplexity, X,
-                         invalid_n_topics)
+    assert_raises_regexp(ValueError, r'Number of topics',
+                         lda._perplexity_precomp_distr, X, invalid_n_topics)
 
 
 def test_lda_perplexity():

@@ -257,15 +258,15 @@ def test_lda_perplexity():
     lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                       learning_method=method,
                                       total_samples=100, random_state=0)
-    distr_1 = lda_1.fit_transform(X)
-    perp_1 = lda_1.perplexity(X, distr_1, sub_sampling=False)
+    lda_1.fit(X)
+    perp_1 = lda_1.perplexity(X, sub_sampling=False)
 
-    distr_2 = lda_2.fit_transform(X)
-    perp_2 = lda_2.perplexity(X, distr_2, sub_sampling=False)
+    lda_2.fit(X)
+    perp_2 = lda_2.perplexity(X, sub_sampling=False)
     assert_greater_equal(perp_1, perp_2)
 
-    perp_1_subsampling = lda_1.perplexity(X, distr_1, sub_sampling=True)
-    perp_2_subsampling = lda_2.perplexity(X, distr_2, sub_sampling=True)
+    perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
+    perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
     assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
 

@@ -295,27 +296,56 @@ def test_perplexity_input_format():
     lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
                                     learning_method='batch',
                                     total_samples=100, random_state=0)
-    distr = lda.fit_transform(X)
+    lda.fit(X)
     perp_1 = lda.perplexity(X)
-    perp_2 = lda.perplexity(X, distr)
-    perp_3 = lda.perplexity(X.toarray(), distr)
+    perp_2 = lda.perplexity(X.toarray())
     assert_almost_equal(perp_1, perp_2)
-    assert_almost_equal(perp_1, perp_3)
 
 
 def test_lda_score_perplexity():
     # Test the relationship between LDA score and perplexity
     n_topics, X = _build_sparse_mtx()
     lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                     random_state=0)
-    distr = lda.fit_transform(X)
-    perplexity_1 = lda.perplexity(X, distr, sub_sampling=False)
+    lda.fit(X)
+    perplexity_1 = lda.perplexity(X, sub_sampling=False)
 
     score = lda.score(X)
     perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
     assert_almost_equal(perplexity_1, perplexity_2)
 
 
+def test_lda_fit_perplexity():
+    # Test that the perplexity computed during fit is consistent with what is
+    # returned by the perplexity method
+    n_topics, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+                                    learning_method='batch', random_state=0,
+                                    evaluate_every=1)
+    lda.fit(X)
+
+    # Perplexity computed at end of fit method
+    perplexity1 = lda.bound_
+
+    # Result of perplexity method on the train set
+    perplexity2 = lda.perplexity(X)
+
+    assert_almost_equal(perplexity1, perplexity2)
+
+
+def test_doc_topic_distr_deprecation():
+    # Test that the appropriate warning message is displayed when a user
+    # attempts to pass the doc_topic_distr argument to the perplexity method
+    n_topics, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+                                    learning_method='batch',
+                                    total_samples=100, random_state=0)
+    distr1 = lda.fit_transform(X)
+    distr2 = None
+    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
+    assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
+
+
 def test_lda_empty_docs():
     """Test LDA on empty document (all-zero rows)."""
     Z = np.zeros((5, 4))