-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
ENH adding as_frame functionality for CA housing dataset loader #15950
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
78a196f
d604f4b
0a01ecf
e00f4f4
3bda9b5
0dba66a
df9f086
5019c9e
587be05
f8eeba9
ee19ac7
19ed2f3
45940b2
a6fcb50
e3db481
4c946e0
20daa13
815b419
c41c9a4
7d311d1
5062770
8e118a0
a963f0d
2083f6b
167dd9e
59d0e54
3f65f08
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
import joblib | ||
|
||
from . import get_data_home | ||
from ._base import _convert_data_dataframe | ||
from ._base import _fetch_remote | ||
from ._base import _pkl_filepath | ||
from ._base import RemoteFileMetadata | ||
|
@@ -49,7 +50,7 @@ | |
|
||
|
||
def fetch_california_housing(data_home=None, download_if_missing=True, | ||
return_X_y=False): | ||
return_X_y=False, as_frame=False): | ||
"""Load the California housing dataset (regression). | ||
|
||
============== ============== | ||
|
@@ -78,15 +79,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True, | |
|
||
.. versionadded:: 0.20 | ||
|
||
as_frame : boolean, default=False | ||
If True, the data is a pandas DataFrame including columns with | ||
appropriate dtypes (numeric, string or categorical). The target is | ||
a pandas DataFrame or Series depending on the number of target_columns. | ||
|
||
.. versionadded:: 0.23 | ||
|
||
Returns | ||
------- | ||
dataset : dict-like object with the following attributes: | ||
|
||
dataset.data : ndarray, shape [20640, 8] | ||
Each row corresponding to the 8 feature values in order. | ||
If ``as_frame`` is True, ``data`` is a pandas object. | ||
|
||
dataset.target : numpy array of shape (20640,) | ||
Each value corresponds to the average house value in units of 100,000. | ||
If ``as_frame`` is True, ``target`` is a pandas object. | ||
|
||
dataset.feature_names : array of length 8 | ||
Array of ordered feature names used in the dataset. | ||
|
@@ -98,6 +108,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True, | |
|
||
.. versionadded:: 0.20 | ||
|
||
frame : pandas DataFrame | ||
Only present when `as_frame=True`. DataFrame with ``data`` and | ||
``target``. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add the directive:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this the right way to do it? def _pandas_is_missing():
try:
import pandas
except ImportError:
raise SkipTest('fetch_california_housing with as_frame=True'
' requires pandas')
@pytest.mark.skipif(
_pandas_is_missing(),
reason='Import pandas library to run this test'
) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In #15950 (comment) I suggested adding a new test to check that the warning is raised without skipping the test when pandas is missing. Something like the following (untested): @pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_pandas_dependency_message()
try:
import pandas
pytest.skip("This test requires pandas to be not installed")
except ImportError:
# Check that pandas is imported lazily and that an informative error message is
# raised when pandas is missing:
expected_msg = "fetch_california_housing with as_frame=True requires pandas"
with pytest.raises(ImportError, match=expected_msg):
fetch_california_housing(as_frame=True) |
||
.. versionadded:: 0.23 | ||
|
||
Notes | ||
----- | ||
|
||
|
@@ -155,10 +171,24 @@ def fetch_california_housing(data_home=None, download_if_missing=True, | |
with open(join(module_path, 'descr', 'california_housing.rst')) as dfile: | ||
descr = dfile.read() | ||
|
||
X = data | ||
y = target | ||
|
||
frame = None | ||
target_names = ["MedHouseVal", ] | ||
if as_frame: | ||
frame, X, y = _convert_data_dataframe("fetch_california_housing", | ||
data, | ||
target, | ||
feature_names, | ||
target_names) | ||
|
||
if return_X_y: | ||
return data, target | ||
return X, y | ||
|
||
return Bunch(data=data, | ||
target=target, | ||
return Bunch(data=X, | ||
target=y, | ||
frame=frame, | ||
target_names=target_names, | ||
feature_names=feature_names, | ||
DESCR=descr) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,9 @@ | |
Skipped if california_housing is not already downloaded to data_home. | ||
""" | ||
|
||
import pytest | ||
|
||
from sklearn.datasets import fetch_california_housing | ||
from sklearn.utils._testing import SkipTest | ||
from sklearn.datasets.tests.test_common import check_return_X_y | ||
from functools import partial | ||
|
||
|
@@ -13,14 +14,54 @@ def fetch(*args, **kwargs): | |
return fetch_california_housing(*args, download_if_missing=False, **kwargs) | ||
|
||
|
||
def test_fetch(): | ||
def _is_california_housing_dataset_not_available(): | ||
try: | ||
data = fetch() | ||
fetch_california_housing(download_if_missing=False) | ||
return False | ||
except IOError: | ||
raise SkipTest("California housing dataset can not be loaded.") | ||
return True | ||
|
||
|
||
@pytest.mark.skipif( | ||
_is_california_housing_dataset_not_available(), | ||
reason='Download California Housing dataset to run this test' | ||
) | ||
def test_fetch(): | ||
data = fetch() | ||
assert((20640, 8) == data.data.shape) | ||
assert((20640, ) == data.target.shape) | ||
|
||
# test return_X_y option | ||
fetch_func = partial(fetch) | ||
check_return_X_y(data, fetch_func) | ||
|
||
|
||
@pytest.mark.skipif( | ||
_is_california_housing_dataset_not_available(), | ||
reason='Download California Housing dataset to run this test' | ||
) | ||
def test_fetch_asframe(): | ||
pd = pytest.importorskip('pandas') | ||
bunch = fetch(as_frame=True) | ||
frame = bunch.frame | ||
assert hasattr(bunch, 'frame') is True | ||
assert frame.shape == (20640, 9) | ||
assert isinstance(bunch.data, pd.DataFrame) | ||
assert isinstance(bunch.target, pd.DataFrame) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you please add a test that checks that |
||
|
||
|
||
@pytest.mark.skipif( | ||
_is_california_housing_dataset_not_available(), | ||
reason='Download California Housing dataset to run this test' | ||
) | ||
def test_pandas_dependency_message(): | ||
try: | ||
import pandas # noqa | ||
pytest.skip("This test requires pandas to be not installed") | ||
except ImportError: | ||
# Check that pandas is imported lazily and that an informative error | ||
# message is raised when pandas is missing: | ||
expected_msg = ('fetch_california_housing with as_frame=True' | ||
' requires pandas') | ||
with pytest.raises(ImportError, match=expected_msg): | ||
fetch_california_housing(as_frame=True) |
Uh oh!
There was an error while loading. Please reload this page.