From f6cf3d028b91e021515dbb8c630002fe5a771474 Mon Sep 17 00:00:00 2001
From: Arthur Ozga
Date: Fri, 15 Jun 2018 13:47:49 -0400
Subject: [PATCH 1/3] skip dataset downloading doctest

---
 doc/conftest.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/conftest.py b/doc/conftest.py
index 158fff5830acf..a4898f37145e4 100644
--- a/doc/conftest.py
+++ b/doc/conftest.py
@@ -47,12 +47,14 @@ def setup_rcv1():
 
 def setup_twenty_newsgroups():
     data_home = get_data_home()
-    if not exists(join(data_home, '20news_home')):
+    if not exists(join(data_home, "20news-bydate_py3.pkz")):
         raise SkipTest("Skipping dataset loading doctests")
 
 
 def setup_working_with_text_data():
     check_skip_network()
+    if not exists(join(get_data_home(), "20news-bydate_py3.pkz")):
+        raise SkipTest("Skipping dataset loading doctests")
 
 
 def setup_compose():

From a925bf3cb4b8cca2f09a903fe9c6d481b742ca83 Mon Sep 17 00:00:00 2001
From: Arthur Ozga
Date: Fri, 15 Jun 2018 14:58:38 -0400
Subject: [PATCH 2/3] Use pickle file for python version

---
 doc/conftest.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/conftest.py b/doc/conftest.py
index a4898f37145e4..0c9c79a2dd7ed 100644
--- a/doc/conftest.py
+++ b/doc/conftest.py
@@ -6,6 +6,8 @@
 from sklearn.utils.testing import SkipTest
 from sklearn.utils.testing import check_skip_network
 from sklearn.datasets import get_data_home
+from sklearn.datasets.base import _pkl_filepath
+from sklearn.datasets.twenty_newsgroups import CACHE_NAME
 from sklearn.utils.testing import install_mldata_mock
 from sklearn.utils.testing import uninstall_mldata_mock
 
@@ -47,13 +49,15 @@ def setup_rcv1():
 
 def setup_twenty_newsgroups():
     data_home = get_data_home()
-    if not exists(join(data_home, "20news-bydate_py3.pkz")):
+    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
+    if not exists(cache_path):
         raise SkipTest("Skipping dataset loading doctests")
 
 
 def setup_working_with_text_data():
     check_skip_network()
-    if not exists(join(get_data_home(), "20news-bydate_py3.pkz")):
+    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
+    if not exists(cache_path):
         raise SkipTest("Skipping dataset loading doctests")
 
 

From 915f5b704584ded35477bb2be189c5370b8036e2 Mon Sep 17 00:00:00 2001
From: Arthur Ozga
Date: Fri, 15 Jun 2018 14:59:00 -0400
Subject: [PATCH 3/3] update twenty_newsgroups baseline

---
 doc/datasets/twenty_newsgroups.rst | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst
index 23c11b2998ddd..5d6906c3cfa14 100644
--- a/doc/datasets/twenty_newsgroups.rst
+++ b/doc/datasets/twenty_newsgroups.rst
@@ -62,7 +62,7 @@ attribute is the integer index of the category::
   >>> newsgroups_train.target.shape
   (11314,)
   >>> newsgroups_train.target[:10]
-  array([12,  6,  9,  8,  6,  7,  9,  2, 13, 19])
+  array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])
 
 It is possible to load only a sub-selection of the categories by passing the
 list of the categories to load to the
@@ -78,7 +78,7 @@ list of the categories to load to the
   >>> newsgroups_train.target.shape
   (1073,)
   >>> newsgroups_train.target[:10]
-  array([1, 1, 1, 0, 1, 0, 0, 1, 1, 1])
+  array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0])
 
 Converting text to vectors
 --------------------------
@@ -105,7 +105,7 @@ components by sample in a more than 30000-dimensional space
 (less than .5% non-zero features)::
 
   >>> vectors.nnz / float(vectors.shape[0])
-  159.01327433628319
+  159.01327...
 
 :func:`sklearn.datasets.fetch_20newsgroups_vectorized` is a function which
 returns ready-to-use tfidf features instead of file names.
@@ -131,9 +131,11 @@ which is fast to train and achieves a decent F-score::
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> clf = MultinomialNB(alpha=.01)
   >>> clf.fit(vectors, newsgroups_train.target)
+  MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
+
   >>> pred = clf.predict(vectors_test)
   >>> metrics.f1_score(newsgroups_test.target, pred, average='macro')
-  0.88213592402729568
+  0.88213...
 
 (The example :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` shuffles
 the training and test data, instead of segmenting by time, and in that case
@@ -150,10 +152,10 @@ Let's take a look at what the most informative features are:
   ...     print("%s: %s" % (category, " ".join(feature_names[top10])))
   ...
   >>> show_top10(clf, vectorizer, newsgroups_train.target_names)
-  alt.atheism: sgi livesey atheists writes people caltech com god keith edu
-  comp.graphics: organization thanks files subject com image lines university edu graphics
-  sci.space: toronto moon gov com alaska access henry nasa edu space
-  talk.religion.misc: article writes kent people christian jesus sandvik edu com god
+  alt.atheism: edu it and in you that is of to the
+  comp.graphics: edu in graphics it is for and of to the
+  sci.space: edu it that is in and space to of the
+  talk.religion.misc: not it you in is that and to of the
 
 You can now see many things that these features have overfit to:
 
@@ -183,7 +185,7 @@ blocks, and quotation blocks respectively.
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> pred = clf.predict(vectors_test)
   >>> metrics.f1_score(pred, newsgroups_test.target, average='macro')
-  0.77310350681274775
+  0.77310...
 
 This classifier lost over a lot of its F-score, just because we removed
 metadata that has little to do with topic classification.
@@ -195,10 +197,12 @@ It loses even more if we also strip this metadata from the training data:
   >>> vectors = vectorizer.fit_transform(newsgroups_train.data)
   >>> clf = MultinomialNB(alpha=.01)
   >>> clf.fit(vectors, newsgroups_train.target)
+  MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
+
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> pred = clf.predict(vectors_test)
   >>> metrics.f1_score(newsgroups_test.target, pred, average='macro')
-  0.76995175184521725
+  0.76995...
 
 Some other classifiers cope better with this harder version of the task. Try running
 :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py` with and without
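For context on why the hardcoded filename in PATCH 1/3 and the computed path in
PATCH 2/3 agree, here is a minimal standalone sketch, not part of the series.
It assumes the private helpers behave as they did in scikit-learn at the time:
`CACHE_NAME` is "20news-bydate.pkz" and `_pkl_filepath` inserts a
Python-version suffix into the basename, so on Python 3 the computed path
matches the literal "20news-bydate_py3.pkz"::

    from os.path import exists

    from sklearn.datasets import get_data_home
    # Private helpers, imported exactly as in PATCH 2/3; not a public API.
    from sklearn.datasets.base import _pkl_filepath
    from sklearn.datasets.twenty_newsgroups import CACHE_NAME

    # Assumption: _pkl_filepath(dir, "20news-bydate.pkz") appends "_py3" to
    # the basename on Python 3, yielding <data_home>/20news-bydate_py3.pkz.
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)

    # Same guard the patched setup functions use: the dataset doctests run
    # only when the 20 newsgroups cache has already been downloaded.
    if not exists(cache_path):
        print("cache missing at %s; dataset doctests would be skipped" % cache_path)

Computing the path through `_pkl_filepath` rather than hardcoding the literal
keeps the check correct for whichever pickle filename the running Python
version actually uses.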