BUG: the `download_if_missing` keyword was ignored by several dataset fetchers. (scikit-learn#7944)

rgommers · maskani-moh · commit 72085ff58068 · 2017-11-15T12:11:36.000-05:00
diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
@@ -87,8 +87,12 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
     data_home = get_data_home(data_home=data_home)
     if not exists(data_home):
         makedirs(data_home)
+
     filepath = _pkl_filepath(data_home, TARGET_FILENAME)
     if not exists(filepath):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
         print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
         archive_fileobj = BytesIO(urlopen(DATA_URL).read())
         fileobj = tarfile.open(
diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
@@ -99,6 +99,9 @@ def fetch_covtype(data_home=None, download_if_missing=True,
 
         joblib.dump(X, samples_path, compress=9)
         joblib.dump(y, targets_path, compress=9)
+    elif not available:
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
 
     try:
         X, y
diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
@@ -345,6 +345,9 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
 
         joblib.dump(X, samples_path, compress=0)
         joblib.dump(y, targets_path, compress=0)
+    elif not available:
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
 
     try:
         X, y
diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py
@@ -111,6 +111,9 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         makedirs(data_home)
     filepath = _pkl_filepath(data_home, TARGET_FILENAME)
     if not exists(filepath):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
         print('downloading Olivetti faces from %s to %s'
               % (DATA_URL, data_home))
         fhandle = urlopen(DATA_URL)
@@ -121,6 +124,7 @@ def fetch_olivetti_faces(data_home=None, shuffle=False, random_state=0,
         del mfile
     else:
         faces = joblib.load(filepath)
+
     # We want floating point data, but float32 is enough (there is only
     # one byte of precision in the original uint8s anyway)
     faces = np.float32(faces)
diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
@@ -222,6 +222,9 @@ def fetch_species_distributions(data_home=None,
     archive_path = _pkl_filepath(data_home, DATA_ARCHIVE_NAME)
 
     if not exists(archive_path):
+        if not download_if_missing:
+            raise IOError("Data not found and `download_if_missing` is False")
+
         print('Downloading species data from %s to %s' % (SAMPLES_URL,
                                                           data_home))
         X = np.load(BytesIO(urlopen(SAMPLES_URL).read()))
diff --git a/sklearn/datasets/tests/test_covtype.py b/sklearn/datasets/tests/test_covtype.py
@@ -3,7 +3,6 @@
 Skipped if covtype is not already downloaded to data_home.
 """
 
-import errno
 from sklearn.datasets import fetch_covtype
 from sklearn.utils.testing import assert_equal, SkipTest
 
@@ -15,9 +14,8 @@ def fetch(*args, **kwargs):
 def test_fetch():
     try:
         data1 = fetch(shuffle=True, random_state=42)
-    except IOError as e:
-        if e.errno == errno.ENOENT:
-            raise SkipTest("Covertype dataset can not be loaded.")
+    except IOError:
+        raise SkipTest("Covertype dataset can not be loaded.")
 
     data2 = fetch(shuffle=True, random_state=37)
 
diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py
@@ -5,17 +5,15 @@
 scikit-learn data folder.
 """
 
-import errno
 from sklearn.datasets import fetch_kddcup99
 from sklearn.utils.testing import assert_equal, SkipTest
 
 
 def test_percent10():
     try:
         data = fetch_kddcup99(download_if_missing=False)
-    except IOError as e:
-        if e.errno == errno.ENOENT:
-            raise SkipTest("kddcup99 dataset can not be loaded.")
+    except IOError:
+        raise SkipTest("kddcup99 dataset can not be loaded.")
 
     assert_equal(data.data.shape, (494021, 41))
     assert_equal(data.target.shape, (494021,))