From 2be4cd089546c173457d1dc9fe95152cedeeb36a Mon Sep 17 00:00:00 2001 From: vufg Date: Fri, 17 Aug 2018 00:40:50 +0800 Subject: [PATCH 01/11] update examples using fetch_openml --- examples/gaussian_process/plot_gpr_co2.py | 4 ++-- examples/linear_model/plot_sgd_early_stopping.py | 4 ++-- .../linear_model/plot_sparse_logistic_regression_mnist.py | 4 ++-- examples/multioutput/plot_classifier_chain_yeast.py | 4 ++-- examples/neural_networks/plot_mnist_filters.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index b0b271a3644a2..86a940f28c4c6 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -70,9 +70,9 @@ from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels \ import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared -from sklearn.datasets import fetch_mldata +from sklearn.datasets import fetch_openml -data = fetch_mldata('mauna-loa-atmospheric-co2').data +data = fetch_openml('mauna-loa-atmospheric-co2').data X = data[:, [1]] y = data[:, 0] diff --git a/examples/linear_model/plot_sgd_early_stopping.py b/examples/linear_model/plot_sgd_early_stopping.py index 31ce61f39d22c..60696aea1aab3 100644 --- a/examples/linear_model/plot_sgd_early_stopping.py +++ b/examples/linear_model/plot_sgd_early_stopping.py @@ -47,7 +47,7 @@ import matplotlib.pyplot as plt from sklearn import linear_model -from sklearn.datasets import fetch_mldata +from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split from sklearn.utils.testing import ignore_warnings from sklearn.exceptions import ConvergenceWarning @@ -58,7 +58,7 @@ def load_mnist(n_samples=None, class_0=0, class_1=8): """Load MNIST, select two classes, shuffle and return only n_samples.""" - mnist = fetch_mldata('MNIST original') + mnist = fetch_openml('MNIST original') # take only two 
classes for binary classification mask = np.logical_or(mnist.target == class_0, mnist.target == class_1) diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py index 5610f471b5d05..175c2ab9403cf 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py +++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py @@ -20,7 +20,7 @@ import matplotlib.pyplot as plt import numpy as np -from sklearn.datasets import fetch_mldata +from sklearn.datasets import fetch_openml from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -35,7 +35,7 @@ t0 = time.time() train_samples = 5000 -mnist = fetch_mldata('MNIST original') +mnist = fetch_openml('MNIST original') X = mnist.data.astype('float64') y = mnist.target random_state = check_random_state(0) diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py index 6a90e14dfc379..2c4245f8ef3ad 100644 --- a/examples/multioutput/plot_classifier_chain_yeast.py +++ b/examples/multioutput/plot_classifier_chain_yeast.py @@ -44,10 +44,10 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.metrics import jaccard_similarity_score from sklearn.linear_model import LogisticRegression -from sklearn.datasets import fetch_mldata +from sklearn.datasets import fetch_openml # Load a multi-label dataset -yeast = fetch_mldata('yeast') +yeast = fetch_openml('yeast') X = yeast['data'] Y = yeast['target'].transpose().toarray() X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py index 6c3b8b2284ea2..144bc3a514b60 100644 --- a/examples/neural_networks/plot_mnist_filters.py +++ b/examples/neural_networks/plot_mnist_filters.py @@ -23,10 +23,10 @@ 
print(__doc__) import matplotlib.pyplot as plt -from sklearn.datasets import fetch_mldata +from sklearn.datasets import fetch_openml from sklearn.neural_network import MLPClassifier -mnist = fetch_mldata("MNIST original") +mnist = fetch_openml("MNIST original") # rescale the data, use the traditional train/test split X, y = mnist.data / 255., mnist.target X_train, X_test = X[:60000], X[60000:] From 65bb1a738a64df6f23e81402c7fb97da60025a9e Mon Sep 17 00:00:00 2001 From: vufg Date: Fri, 17 Aug 2018 01:19:27 +0800 Subject: [PATCH 02/11] add return_X_y for fetch_openml --- sklearn/datasets/openml.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 6864fa865abc0..ce6832abfd1a3 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -347,7 +347,7 @@ def _verify_target_data_type(features_dict, target_columns): def fetch_openml(name=None, version='active', data_id=None, data_home=None, - target_column='default-target', cache=True): + target_column='default-target', cache=True, return_X_y=False): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a @@ -394,6 +394,10 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, cache : boolean, default=True Whether to cache downloaded datasets using joblib. + + return_X_y : boolean, default=False. + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` object Returns ------- @@ -415,6 +419,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, that the value encoded as i is ith in the list. details : dict More metadata from OpenML + + (data, target) : tuple if ``return_X_y`` is True .. 
note:: EXPERIMENTAL @@ -562,5 +568,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, DESCR=description, details=data_description, categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) + + if return_X_y: + return X, y return bunch From 4bf5d7b33d49581a2bed78410f679d4a05e9a481 Mon Sep 17 00:00:00 2001 From: vufg Date: Fri, 17 Aug 2018 02:25:01 +0800 Subject: [PATCH 03/11] modify mnist name --- examples/gaussian_process/plot_gpr_co2.py | 4 ++-- examples/linear_model/plot_sgd_early_stopping.py | 2 +- .../linear_model/plot_sparse_logistic_regression_mnist.py | 2 +- examples/multioutput/plot_classifier_chain_yeast.py | 4 ++-- examples/neural_networks/plot_mnist_filters.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 86a940f28c4c6..b0b271a3644a2 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -70,9 +70,9 @@ from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels \ import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared -from sklearn.datasets import fetch_openml +from sklearn.datasets import fetch_mldata -data = fetch_openml('mauna-loa-atmospheric-co2').data +data = fetch_mldata('mauna-loa-atmospheric-co2').data X = data[:, [1]] y = data[:, 0] diff --git a/examples/linear_model/plot_sgd_early_stopping.py b/examples/linear_model/plot_sgd_early_stopping.py index 60696aea1aab3..7b8b5f50bc93f 100644 --- a/examples/linear_model/plot_sgd_early_stopping.py +++ b/examples/linear_model/plot_sgd_early_stopping.py @@ -58,7 +58,7 @@ def load_mnist(n_samples=None, class_0=0, class_1=8): """Load MNIST, select two classes, shuffle and return only n_samples.""" - mnist = fetch_openml('MNIST original') + mnist = fetch_openml('mnist_784') # take only two classes for binary classification mask = 
np.logical_or(mnist.target == class_0, mnist.target == class_1) diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py index 175c2ab9403cf..eced6dffd787f 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py +++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py @@ -35,7 +35,7 @@ t0 = time.time() train_samples = 5000 -mnist = fetch_openml('MNIST original') +mnist = fetch_openml('mnist_784') X = mnist.data.astype('float64') y = mnist.target random_state = check_random_state(0) diff --git a/examples/multioutput/plot_classifier_chain_yeast.py b/examples/multioutput/plot_classifier_chain_yeast.py index 2c4245f8ef3ad..6a90e14dfc379 100644 --- a/examples/multioutput/plot_classifier_chain_yeast.py +++ b/examples/multioutput/plot_classifier_chain_yeast.py @@ -44,10 +44,10 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.metrics import jaccard_similarity_score from sklearn.linear_model import LogisticRegression -from sklearn.datasets import fetch_openml +from sklearn.datasets import fetch_mldata # Load a multi-label dataset -yeast = fetch_openml('yeast') +yeast = fetch_mldata('yeast') X = yeast['data'] Y = yeast['target'].transpose().toarray() X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py index 144bc3a514b60..8c7beedda48fb 100644 --- a/examples/neural_networks/plot_mnist_filters.py +++ b/examples/neural_networks/plot_mnist_filters.py @@ -26,7 +26,7 @@ from sklearn.datasets import fetch_openml from sklearn.neural_network import MLPClassifier -mnist = fetch_openml("MNIST original") +mnist = fetch_openml("mnist_784") # rescale the data, use the traditional train/test split X, y = mnist.data / 255., mnist.target X_train, X_test = X[:60000], X[60000:] From 2a5b66c9699c10807f717fb18b80c3b21b455fcf Mon Sep 17 
00:00:00 2001 From: vufg Date: Fri, 17 Aug 2018 02:53:44 +0800 Subject: [PATCH 04/11] roll back examples --- examples/linear_model/plot_sgd_early_stopping.py | 4 ++-- .../linear_model/plot_sparse_logistic_regression_mnist.py | 4 ++-- examples/neural_networks/plot_mnist_filters.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/linear_model/plot_sgd_early_stopping.py b/examples/linear_model/plot_sgd_early_stopping.py index 7b8b5f50bc93f..31ce61f39d22c 100644 --- a/examples/linear_model/plot_sgd_early_stopping.py +++ b/examples/linear_model/plot_sgd_early_stopping.py @@ -47,7 +47,7 @@ import matplotlib.pyplot as plt from sklearn import linear_model -from sklearn.datasets import fetch_openml +from sklearn.datasets import fetch_mldata from sklearn.model_selection import train_test_split from sklearn.utils.testing import ignore_warnings from sklearn.exceptions import ConvergenceWarning @@ -58,7 +58,7 @@ def load_mnist(n_samples=None, class_0=0, class_1=8): """Load MNIST, select two classes, shuffle and return only n_samples.""" - mnist = fetch_openml('mnist_784') + mnist = fetch_mldata('MNIST original') # take only two classes for binary classification mask = np.logical_or(mnist.target == class_0, mnist.target == class_1) diff --git a/examples/linear_model/plot_sparse_logistic_regression_mnist.py b/examples/linear_model/plot_sparse_logistic_regression_mnist.py index eced6dffd787f..5610f471b5d05 100644 --- a/examples/linear_model/plot_sparse_logistic_regression_mnist.py +++ b/examples/linear_model/plot_sparse_logistic_regression_mnist.py @@ -20,7 +20,7 @@ import matplotlib.pyplot as plt import numpy as np -from sklearn.datasets import fetch_openml +from sklearn.datasets import fetch_mldata from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler @@ -35,7 +35,7 @@ t0 = time.time() train_samples = 5000 -mnist = fetch_openml('mnist_784') +mnist =
fetch_mldata('MNIST original') X = mnist.data.astype('float64') y = mnist.target random_state = check_random_state(0) diff --git a/examples/neural_networks/plot_mnist_filters.py b/examples/neural_networks/plot_mnist_filters.py index 8c7beedda48fb..6c3b8b2284ea2 100644 --- a/examples/neural_networks/plot_mnist_filters.py +++ b/examples/neural_networks/plot_mnist_filters.py @@ -23,10 +23,10 @@ print(__doc__) import matplotlib.pyplot as plt -from sklearn.datasets import fetch_openml +from sklearn.datasets import fetch_mldata from sklearn.neural_network import MLPClassifier -mnist = fetch_openml("mnist_784") +mnist = fetch_mldata("MNIST original") # rescale the data, use the traditional train/test split X, y = mnist.data / 255., mnist.target X_train, X_test = X[:60000], X[60000:] From 68981bf78cb22e32649eaec7d6b99e9ddb41cf83 Mon Sep 17 00:00:00 2001 From: vufg Date: Fri, 17 Aug 2018 03:22:38 +0800 Subject: [PATCH 05/11] delete whitespace --- sklearn/datasets/openml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index ce6832abfd1a3..32d6c2adf597f 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -568,7 +568,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, DESCR=description, details=data_description, categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) - + if return_X_y: return X, y From 7ac4dfb27cb4937400590b316ce966f0bcdb67bc Mon Sep 17 00:00:00 2001 From: vufg Date: Fri, 17 Aug 2018 14:29:51 +0800 Subject: [PATCH 06/11] fix pep8 error --- sklearn/datasets/openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 32d6c2adf597f..f87a0b301d354 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -394,7 +394,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, cache : boolean, 
default=True Whether to cache downloaded datasets using joblib. - + return_X_y : boolean, default=False. If True, returns ``(data, target)`` instead of a Bunch object. See below for more information about the `data` and `target` object @@ -419,7 +419,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, that the value encoded as i is ith in the list. details : dict More metadata from OpenML - + (data, target) : tuple if ``return_X_y`` is True .. note:: EXPERIMENTAL From e6a702dc80b54696f631f2a4700ba93bca789399 Mon Sep 17 00:00:00 2001 From: vufg Date: Fri, 17 Aug 2018 15:20:55 +0800 Subject: [PATCH 07/11] add check_return_X_y for test_openml --- sklearn/datasets/tests/test_openml.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index da2b913eda7ed..346aa841d7016 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -15,7 +15,8 @@ assert_raise_message) from sklearn.externals.six import string_types from sklearn.externals.six.moves.urllib.error import HTTPError - +from sklearn.datasets.tests.test_common import check_return_X_y +from functools import partial currdir = os.path.dirname(os.path.abspath(__file__)) # if True, urlopen will be monkey patched to only use local files @@ -124,6 +125,11 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version, # np.isnan doesn't work on CSR matrix assert (np.count_nonzero(np.isnan(data_by_id.data)) == expected_missing) + + # test return_X_y option + fetch_func = partial(fetch_openml, data_id=data_id, cache=False, + target_column=target_column) + check_return_X_y(data_by_id, fetch_func) return data_by_id From 18cbe0fad64d58e2fb1f705aed29d626af11e0bc Mon Sep 17 00:00:00 2001 From: vufg Date: Fri, 17 Aug 2018 15:24:22 +0800 Subject: [PATCH 08/11] add check_return_X_y for test_openml --- sklearn/datasets/tests/test_openml.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 346aa841d7016..3a24213062fc2 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -18,6 +18,7 @@ from sklearn.datasets.tests.test_common import check_return_X_y from functools import partial + currdir = os.path.dirname(os.path.abspath(__file__)) # if True, urlopen will be monkey patched to only use local files test_offline = True From e50c91b20cbbb578dde53ba2f5fbbfe854f0e168 Mon Sep 17 00:00:00 2001 From: vufg Date: Sat, 18 Aug 2018 04:07:29 +0800 Subject: [PATCH 09/11] fix format --- sklearn/datasets/openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index f87a0b301d354..150e6b5b1a5b0 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -396,8 +396,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, Whether to cache downloaded datasets using joblib. return_X_y : boolean, default=False. - If True, returns ``(data, target)`` instead of a Bunch object. - See below for more information about the `data` and `target` object + If True, returns ``(data, target)`` instead of a Bunch object. + See below for more information about the `data` and `target` objects. Returns ------- From a360473f7319347991f4bfdaecd68fc47aba51eb Mon Sep 17 00:00:00 2001 From: vufg Date: Sat, 18 Aug 2018 04:23:44 +0800 Subject: [PATCH 10/11] fix format --- sklearn/datasets/openml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index 150e6b5b1a5b0..f7b8ea785ee0c 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -396,8 +396,8 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, Whether to cache downloaded datasets using joblib. return_X_y : boolean, default=False. 
- If True, returns ``(data, target)`` instead of a Bunch object. - See below for more information about the `data` and `target` objects. + If True, returns ``(data, target)`` instead of a Bunch object. See + below for more information about the `data` and `target` objects. Returns ------- From 85f88b3c156849c7aab96a1df3a2fbacf9009a4a Mon Sep 17 00:00:00 2001 From: vufg Date: Sat, 18 Aug 2018 23:32:14 +0800 Subject: [PATCH 11/11] move return_X_y --- sklearn/datasets/openml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/datasets/openml.py b/sklearn/datasets/openml.py index f7b8ea785ee0c..a35fa51307993 100644 --- a/sklearn/datasets/openml.py +++ b/sklearn/datasets/openml.py @@ -563,13 +563,13 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None, elif y.shape[1] == 0: y = None + if return_X_y: + return X, y + bunch = Bunch( data=X, target=y, feature_names=data_columns, DESCR=description, details=data_description, categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) - if return_X_y: - return X, y - return bunch