From f6cf3d028b91e021515dbb8c630002fe5a771474 Mon Sep 17 00:00:00 2001
From: Arthur Ozga
Date: Fri, 15 Jun 2018 13:47:49 -0400
Subject: [PATCH 1/3] skip dataset downloading doctest

---
 doc/conftest.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/conftest.py b/doc/conftest.py
index 158fff5830acf..a4898f37145e4 100644
--- a/doc/conftest.py
+++ b/doc/conftest.py
@@ -47,12 +47,14 @@ def setup_rcv1():
 
 def setup_twenty_newsgroups():
     data_home = get_data_home()
-    if not exists(join(data_home, '20news_home')):
+    if not exists(join(data_home, "20news-bydate_py3.pkz")):
         raise SkipTest("Skipping dataset loading doctests")
 
 
 def setup_working_with_text_data():
     check_skip_network()
+    if not exists(join(get_data_home(), "20news-bydate_py3.pkz")):
+        raise SkipTest("Skipping dataset loading doctests")
 
 
 def setup_compose():

From a925bf3cb4b8cca2f09a903fe9c6d481b742ca83 Mon Sep 17 00:00:00 2001
From: Arthur Ozga
Date: Fri, 15 Jun 2018 14:58:38 -0400
Subject: [PATCH 2/3] Use pickle file for python version

---
 doc/conftest.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/conftest.py b/doc/conftest.py
index a4898f37145e4..0c9c79a2dd7ed 100644
--- a/doc/conftest.py
+++ b/doc/conftest.py
@@ -6,6 +6,8 @@
 from sklearn.utils.testing import SkipTest
 from sklearn.utils.testing import check_skip_network
 from sklearn.datasets import get_data_home
+from sklearn.datasets.base import _pkl_filepath
+from sklearn.datasets.twenty_newsgroups import CACHE_NAME
 from sklearn.utils.testing import install_mldata_mock
 from sklearn.utils.testing import uninstall_mldata_mock
 
@@ -47,13 +49,15 @@ def setup_rcv1():
 
 def setup_twenty_newsgroups():
     data_home = get_data_home()
-    if not exists(join(data_home, "20news-bydate_py3.pkz")):
+    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
+    if not exists(cache_path):
         raise SkipTest("Skipping dataset loading doctests")
 
 
 def setup_working_with_text_data():
     check_skip_network()
-    if not exists(join(get_data_home(), "20news-bydate_py3.pkz")):
+    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
+    if not exists(cache_path):
         raise SkipTest("Skipping dataset loading doctests")
 
 

From 915f5b704584ded35477bb2be189c5370b8036e2 Mon Sep 17 00:00:00 2001
From: Arthur Ozga
Date: Fri, 15 Jun 2018 14:59:00 -0400
Subject: [PATCH 3/3] update twenty_newsgroups baseline

---
 doc/datasets/twenty_newsgroups.rst | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/doc/datasets/twenty_newsgroups.rst b/doc/datasets/twenty_newsgroups.rst
index 23c11b2998ddd..5d6906c3cfa14 100644
--- a/doc/datasets/twenty_newsgroups.rst
+++ b/doc/datasets/twenty_newsgroups.rst
@@ -62,7 +62,7 @@ attribute is the integer index of the category::
   >>> newsgroups_train.target.shape
   (11314,)
   >>> newsgroups_train.target[:10]
-  array([12,  6,  9,  8,  6,  7,  9,  2, 13, 19])
+  array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])
 
 It is possible to load only a sub-selection of the categories by passing the
 list of the categories to load to the
@@ -78,7 +78,7 @@ list of the categories to load to the
   >>> newsgroups_train.target.shape
   (1073,)
   >>> newsgroups_train.target[:10]
-  array([1, 1, 1, 0, 1, 0, 0, 1, 1, 1])
+  array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0])
 
 Converting text to vectors
 --------------------------
@@ -105,7 +105,7 @@ components by sample in a more than 30000-dimensional space
 (less than .5% non-zero features)::
 
   >>> vectors.nnz / float(vectors.shape[0])
-  159.01327433628319
+  159.01327...
 
 :func:`sklearn.datasets.fetch_20newsgroups_vectorized` is a function which
 returns ready-to-use tfidf features instead of file names.
@@ -131,9 +131,11 @@ which is fast to train and achieves a decent F-score::
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> clf = MultinomialNB(alpha=.01)
   >>> clf.fit(vectors, newsgroups_train.target)
+  MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
+
   >>> pred = clf.predict(vectors_test)
   >>> metrics.f1_score(newsgroups_test.target, pred, average='macro')
-  0.88213592402729568
+  0.88213...
 
 (The example :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py` shuffles
 the training and test data, instead of segmenting by time, and in that case
@@ -150,10 +152,10 @@ Let's take a look at what the most informative features are:
   ...     print("%s: %s" % (category, " ".join(feature_names[top10])))
   ...
   >>> show_top10(clf, vectorizer, newsgroups_train.target_names)
-  alt.atheism: sgi livesey atheists writes people caltech com god keith edu
-  comp.graphics: organization thanks files subject com image lines university edu graphics
-  sci.space: toronto moon gov com alaska access henry nasa edu space
-  talk.religion.misc: article writes kent people christian jesus sandvik edu com god
+  alt.atheism: edu it and in you that is of to the
+  comp.graphics: edu in graphics it is for and of to the
+  sci.space: edu it that is in and space to of the
+  talk.religion.misc: not it you in is that and to of the
 
 You can now see many things that these features have overfit to:
 
@@ -183,7 +185,7 @@ blocks, and quotation blocks respectively.
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> pred = clf.predict(vectors_test)
   >>> metrics.f1_score(pred, newsgroups_test.target, average='macro')
-  0.77310350681274775
+  0.77310...
 
 This classifier lost over a lot of its F-score, just because we removed
 metadata that has little to do with topic classification.
@@ -195,10 +197,12 @@ It loses even more if we also strip this metadata from the training data:
   >>> vectors = vectorizer.fit_transform(newsgroups_train.data)
   >>> clf = MultinomialNB(alpha=.01)
   >>> clf.fit(vectors, newsgroups_train.target)
+  MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
+
   >>> vectors_test = vectorizer.transform(newsgroups_test.data)
   >>> pred = clf.predict(vectors_test)
   >>> metrics.f1_score(newsgroups_test.target, pred, average='macro')
-  0.76995175184521725
+  0.76995...
 
 Some other classifiers cope better with this harder version of the task. Try running
 :ref:`sphx_glr_auto_examples_model_selection_grid_search_text_feature_extraction.py` with and without
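For context on why the hardcoded filename in PATCH 1/3 and the computed path in
PATCH 2/3 agree, here is a minimal standalone sketch, not part of the series.
It assumes the private helpers behave as they did in scikit-learn at the time:
`CACHE_NAME` is "20news-bydate.pkz" and `_pkl_filepath` inserts a
Python-version suffix into the basename, so on Python 3 the computed path
matches the literal "20news-bydate_py3.pkz"::

    from os.path import exists

    from sklearn.datasets import get_data_home
    # Private helpers, imported exactly as in PATCH 2/3; not a public API.
    from sklearn.datasets.base import _pkl_filepath
    from sklearn.datasets.twenty_newsgroups import CACHE_NAME

    # Assumption: _pkl_filepath(dir, "20news-bydate.pkz") appends "_py3" to
    # the basename on Python 3, yielding <data_home>/20news-bydate_py3.pkz.
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)

    # Same guard the patched setup functions use: the dataset doctests run
    # only when the 20 newsgroups cache has already been downloaded.
    if not exists(cache_path):
        print("cache missing at %s; dataset doctests would be skipped" % cache_path)

Computing the path through `_pkl_filepath` rather than hardcoding the literal
keeps the check correct for whichever pickle filename the running Python
version actually uses.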