From 37043359160af656739a45c50a0f8ce5db52f3fd Mon Sep 17 00:00:00 2001 From: Dutta-SD Date: Wed, 27 Dec 2023 21:52:46 +0530 Subject: [PATCH 1/6] Feat: DocString Examples for sklearn/datasets --- sklearn/datasets/_base.py | 14 ++++++++++++++ sklearn/datasets/_covtype.py | 22 ++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index e062bf381b393..d303325a8a4ea 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -85,6 +85,20 @@ def clear_data_home(data_home=None): data_home : str or path-like, default=None The path to scikit-learn data directory. If `None`, the default path is `~/scikit_learn_data`. + + Examples + ---------- + >>> from sklearn.datasets import clear_data_home + >>> import os + >>> import tempfile + >>> + >>> # Make Dummy Directory + >>> temp_dir = tempfile.TemporaryDirectory() + >>> os.path.isdir(temp_dir.name) + True + >>> clear_data_home(temp_dir.name) + >>> os.path.isdir(temp_dir.name) + False """ data_home = get_data_home(data_home) shutil.rmtree(data_home) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 7620e08c5ec92..f615885e70634 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -156,6 +156,28 @@ def fetch_covtype( ndarray of shape (n_samples,) containing the target samples. .. versionadded:: 0.20 + + Examples + ------- + >>> from sklearn.datasets import fetch_covtype + >>> + >>> # Default return sklearn.utils.Bunch + >>> cov_bunch = fetch_covtype() + >>> cov_bunch.data.shape + (581012, 54) + >>> cov_bunch.target.shape + (581012,) + >>> + >>> # Return Numpy Array + >>> X, y = fetch_covtype( + ... random_state=42, + ... shuffle=True, + ... return_X_y=True, + ... 
) + >>> type(X), X.shape + (<class 'numpy.ndarray'>, (581012, 54)) + >>> type(y), y.shape + (<class 'numpy.ndarray'>, (581012,)) """ data_home = get_data_home(data_home=data_home) covtype_dir = join(data_home, "covertype") From 8a2cb9270a6a26f1d7aa976ae7fe09a981a949b8 Mon Sep 17 00:00:00 2001 From: Dutta-SD Date: Wed, 27 Dec 2023 22:58:50 +0530 Subject: [PATCH 2/6] DOC Fixed Lint Error --- sklearn/datasets/_covtype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index f615885e70634..5daa7a5d010bd 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -158,7 +158,7 @@ def fetch_covtype( .. versionadded:: 0.20 Examples - ------- + -------- >>> from sklearn.datasets import fetch_covtype >>> >>> # Default return sklearn.utils.Bunch From bcb3aa0db3d25144f168ce524ee9176433b321d1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Jan 2024 14:01:17 +0100 Subject: [PATCH 3/6] add mechanism for skipping docstring that fetch data --- sklearn/conftest.py | 17 +++++++++++++---- sklearn/datasets/_covtype.py | 22 ++++++---------------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/sklearn/conftest.py b/sklearn/conftest.py index d2f44f6912b62..79a9751c7c9f1 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -134,10 +134,18 @@ def pytest_collection_modifyitems(config, items): datasets_to_download = set() for item in items: - if not hasattr(item, "fixturenames"): - continue - item_fixtures = set(item.fixturenames) - dataset_to_fetch = item_fixtures & dataset_features_set + if isinstance(item, DoctestItem): + # The dataset fetchers require network access but we cannot use pytest + # fixture. Since our fixture use the name of the fetcher with the suffix + # "_fxt", we can try to match this name to trigger the download. 
+ name = set([item.name.rsplit(".", 1)[-1] + "_fxt"]) + dataset_to_fetch = name & dataset_features_set + else: + if not hasattr(item, "fixturenames"): + continue + item_fixtures = set(item.fixturenames) + dataset_to_fetch = item_fixtures & dataset_features_set + if not dataset_to_fetch: continue @@ -209,6 +217,7 @@ def pytest_collection_modifyitems(config, items): # details. if item.name != "sklearn._config.config_context": item.add_marker(skip_marker) + try: import PIL # noqa diff --git a/sklearn/datasets/_covtype.py b/sklearn/datasets/_covtype.py index 5daa7a5d010bd..4e1b1d7961f2e 100644 --- a/sklearn/datasets/_covtype.py +++ b/sklearn/datasets/_covtype.py @@ -160,24 +160,14 @@ def fetch_covtype( Examples -------- >>> from sklearn.datasets import fetch_covtype - >>> - >>> # Default return sklearn.utils.Bunch - >>> cov_bunch = fetch_covtype() - >>> cov_bunch.data.shape + >>> cov_type = fetch_covtype() + >>> cov_type.data.shape (581012, 54) - >>> cov_bunch.target.shape + >>> cov_type.target.shape (581012,) - >>> - >>> # Return Numpy Array - >>> X, y = fetch_covtype( - ... random_state=42, - ... shuffle=True, - ... return_X_y=True, - ... ) - >>> type(X), X.shape - (<class 'numpy.ndarray'>, (581012, 54)) - >>> type(y), y.shape - (<class 'numpy.ndarray'>, (581012,)) + >>> # Let's check the 4 first feature names + >>> cov_type.feature_names[:4] + ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology'] """ data_home = get_data_home(data_home=data_home) covtype_dir = join(data_home, "covertype") From c9b08474f47d0ab8ae52c654e97964bb75fc73bf Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Jan 2024 14:02:28 +0100 Subject: [PATCH 4/6] less diff --- sklearn/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/conftest.py b/sklearn/conftest.py index 79a9751c7c9f1..94e9fd83b1f76 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -217,7 +217,6 @@ def pytest_collection_modifyitems(config, items): # details. 
if item.name != "sklearn._config.config_context": item.add_marker(skip_marker) - try: import PIL # noqa From 957dc48d83447258f0cb7486e9ec5cc76d05dbbc Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 12 Jan 2024 14:03:31 +0100 Subject: [PATCH 5/6] skip clear_data_home doctest --- sklearn/datasets/_base.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/sklearn/datasets/_base.py b/sklearn/datasets/_base.py index d303325a8a4ea..ab2b8bd3f5110 100644 --- a/sklearn/datasets/_base.py +++ b/sklearn/datasets/_base.py @@ -89,16 +89,7 @@ def clear_data_home(data_home=None): Examples ---------- >>> from sklearn.datasets import clear_data_home - >>> import os - >>> import tempfile - >>> - >>> # Make Dummy Directory - >>> temp_dir = tempfile.TemporaryDirectory() - >>> os.path.isdir(temp_dir.name) - True - >>> clear_data_home(temp_dir.name) - >>> os.path.isdir(temp_dir.name) - False + >>> clear_data_home() # doctest: +SKIP """ data_home = get_data_home(data_home) shutil.rmtree(data_home) From 18ee95037c56b6f4289440e72a0729a187510531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 12 Jan 2024 15:24:49 +0100 Subject: [PATCH 6/6] Tweak the code slightly --- sklearn/conftest.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/conftest.py b/sklearn/conftest.py index 94e9fd83b1f76..d14afddc3773d 100644 --- a/sklearn/conftest.py +++ b/sklearn/conftest.py @@ -134,15 +134,13 @@ def pytest_collection_modifyitems(config, items): datasets_to_download = set() for item in items: - if isinstance(item, DoctestItem): - # The dataset fetchers require network access but we cannot use pytest - # fixture. Since our fixture use the name of the fetcher with the suffix - # "_fxt", we can try to match this name to trigger the download. 
- name = set([item.name.rsplit(".", 1)[-1] + "_fxt"]) - dataset_to_fetch = name & dataset_features_set + if isinstance(item, DoctestItem) and "fetch_" in item.name: + fetcher_function_name = item.name.split(".")[-1] + dataset_fetchers_key = f"{fetcher_function_name}_fxt" + dataset_to_fetch = set([dataset_fetchers_key]) & dataset_features_set + elif not hasattr(item, "fixturenames"): + continue else: - if not hasattr(item, "fixturenames"): - continue item_fixtures = set(item.fixturenames) dataset_to_fetch = item_fixtures & dataset_features_set