FIX cache of OpenML fetcher (#12246) · lithuak/scikit-learn@afa0694 · GitHub

Commit afa0694

janvanrijn authored and jnothman committed
FIX cache of OpenML fetcher (scikit-learn#12246)
1 parent 0bbb7d0 commit afa0694

File tree

3 files changed: +62 -17 lines changed

doc/whats_new/v0.20.rst
sklearn/datasets/openml.py
sklearn/datasets/tests/test_openml.py

doc/whats_new/v0.20.rst

Lines changed: 3 additions & 0 deletions
@@ -37,6 +37,9 @@ Changelog
 :mod:`sklearn.datasets`
 .......................

+- |Fix| :func:`datasets.fetch_openml` to correctly use the local cache.
+  :issue:`12246` by :user:`Jan N. van Rijn <janvanrijn>`.
+
 - |Fix| Fixed integer overflow in :func:`datasets.make_classification`
   for values of ``n_informative`` parameter larger than 64.
   :issue:`10811` by :user:`Roman Feldbauer <VarIr>`.

sklearn/datasets/openml.py

Lines changed: 23 additions & 11 deletions
@@ -31,6 +31,10 @@
 _DATA_FILE = "data/v1/download/{}"


+def _get_local_path(openml_path, data_home):
+    return os.path.join(data_home, 'openml.org', openml_path + ".gz")
+
+
 def _open_openml_url(openml_path, data_home):
     """
     Returns a resource from OpenML.org. Caches it to data_home if required.
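The `_get_local_path` helper introduced in the hunk above centralises where a cached OpenML resource lives on disk: under an `openml.org` subdirectory of `data_home`, with a `.gz` suffix. A small illustration of what it computes (the input values below are hypothetical examples, not taken from the commit):

import os

def _get_local_path(openml_path, data_home):
    # same logic as the helper added above:
    # <data_home>/openml.org/<openml_path>.gz
    return os.path.join(data_home, 'openml.org', openml_path + ".gz")

# hypothetical example values; output shown for a POSIX filesystem
print(_get_local_path("data/v1/download/61", "/tmp/scikit_learn_data"))
# /tmp/scikit_learn_data/openml.org/data/v1/download/61.gz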
@@ -50,37 +54,45 @@ def _open_openml_url(openml_path, data_home):
     result : stream
         A stream to the OpenML resource
     """
+    def is_gzip(_fsrc):
+        return _fsrc.info().get('Content-Encoding', '') == 'gzip'
+
     req = Request(_OPENML_PREFIX + openml_path)
     req.add_header('Accept-encoding', 'gzip')
-    fsrc = urlopen(req)
-    is_gzip = fsrc.info().get('Content-Encoding', '') == 'gzip'

     if data_home is None:
-        if is_gzip:
+        fsrc = urlopen(req)
+        if is_gzip(fsrc):
             if PY2:
                 fsrc = BytesIO(fsrc.read())
             return gzip.GzipFile(fileobj=fsrc, mode='rb')
         return fsrc

-    local_path = os.path.join(data_home, 'openml.org', openml_path + ".gz")
+    local_path = _get_local_path(openml_path, data_home)
     if not os.path.exists(local_path):
+        fsrc = urlopen(req)
         try:
             os.makedirs(os.path.dirname(local_path))
         except OSError:
             # potentially, the directory has been created already
             pass

         try:
-            with open(local_path, 'wb') as fdst:
-                shutil.copyfileobj(fsrc, fdst)
-            fsrc.close()
+            if is_gzip(fsrc):
+                with open(local_path, 'wb') as fdst:
+                    shutil.copyfileobj(fsrc, fdst)
+                fsrc.close()
+            else:
+                with gzip.GzipFile(local_path, 'wb') as fdst:
+                    shutil.copyfileobj(fsrc, fdst)
+                fsrc.close()
         except Exception:
             os.unlink(local_path)
             raise
-    # XXX: unnecessary decompression on first access
-    if is_gzip:
-        return gzip.GzipFile(local_path, 'rb')
-    return fsrc
+
+    # XXX: First time, decompression will not be necessary (by using fsrc), but
+    # it will happen nonetheless
+    return gzip.GzipFile(local_path, 'rb')


 def _get_json_content_from_openml_api(url, error_message, raise_if_error,
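Taken together, the changes above mean `urlopen` is only reached when the local file is missing, and the cached copy is always stored gzip-compressed: a response that arrives gzip-encoded is written verbatim, while a plain response is compressed on write. Every read, including the first, then goes through `gzip.GzipFile` on the cached file. Below is a minimal sketch of that pattern with hypothetical names (`cache_open`, a generic URL layout); it is not the scikit-learn implementation and it targets Python 3 only, whereas the commit also handles PY2:

import gzip
import os
import shutil
from urllib.request import Request, urlopen


def cache_open(url, cache_dir):
    """Return a read stream for `url`, downloading to a gzipped cache file once."""
    local_path = os.path.join(cache_dir, 'cache', url.split('/')[-1] + '.gz')
    if not os.path.exists(local_path):
        req = Request(url)
        req.add_header('Accept-encoding', 'gzip')
        fsrc = urlopen(req)  # network access happens only on a cache miss
        try:
            os.makedirs(os.path.dirname(local_path))
        except OSError:
            pass  # directory may already exist
        try:
            if fsrc.info().get('Content-Encoding', '') == 'gzip':
                # already gzip-compressed on the wire: store the bytes verbatim
                with open(local_path, 'wb') as fdst:
                    shutil.copyfileobj(fsrc, fdst)
            else:
                # plain response: compress while writing, so the cache layout
                # is always gzip and later reads never mis-detect the format
                with gzip.GzipFile(local_path, 'wb') as fdst:
                    shutil.copyfileobj(fsrc, fdst)
            fsrc.close()
        except Exception:
            os.unlink(local_path)  # do not leave a partial cache file behind
            raise
    # every read, including the first, goes through the gzipped cache file
    return gzip.GzipFile(local_path, 'rb')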

sklearn/datasets/tests/test_openml.py

Lines changed: 36 additions & 6 deletions
@@ -12,7 +12,8 @@
 from sklearn.datasets import fetch_openml
 from sklearn.datasets.openml import (_open_openml_url,
                                      _get_data_description_by_id,
-                                     _download_data_arff)
+                                     _download_data_arff,
+                                     _get_local_path)
 from sklearn.utils.testing import (assert_warns_message,
                                    assert_raise_message)
 from sklearn.externals.six import string_types
@@ -77,6 +78,8 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
                                    cache=False)
     assert int(data_by_name_id.details['id']) == data_id

+    # Please note that cache=False is crucial, as the monkey patched files are
+    # not consistent with reality
     fetch_openml(name=data_name, cache=False)
     # without specifying the version, there is no guarantee that the data id
     # will be the same
@@ -138,6 +141,9 @@ def _fetch_dataset_from_openml(data_id, data_name, data_version,
 def _monkey_patch_webbased_functions(context,
                                      data_id,
                                      gzip_response):
+    # monkey patches the urlopen function. Important note: Do NOT use this
+    # in combination with a regular cache directory, as the files that are
+    # stored as cache should not be mixed up with real openml datasets
     url_prefix_data_description = "https://openml.org/api/v1/json/data/"
     url_prefix_data_features = "https://openml.org/api/v1/json/data/features/"
     url_prefix_download_data = "https://openml.org/data/v1/"
@@ -453,23 +459,47 @@ def test_decode_emotions(monkeypatch):


 @pytest.mark.parametrize('gzip_response', [True, False])
-def test_open_openml_url_cache(monkeypatch, gzip_response):
+def test_open_openml_url_cache(monkeypatch, gzip_response, tmpdir):
     data_id = 61

     _monkey_patch_webbased_functions(
         monkeypatch, data_id, gzip_response)
     openml_path = sklearn.datasets.openml._DATA_FILE.format(data_id)
-    test_directory = os.path.join(os.path.expanduser('~'), 'scikit_learn_data')
+    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
     # first fill the cache
-    response1 = _open_openml_url(openml_path, test_directory)
+    response1 = _open_openml_url(openml_path, cache_directory)
     # assert file exists
-    location = os.path.join(test_directory, 'openml.org', openml_path + '.gz')
+    location = _get_local_path(openml_path, cache_directory)
     assert os.path.isfile(location)
     # redownload, to utilize cache
-    response2 = _open_openml_url(openml_path, test_directory)
+    response2 = _open_openml_url(openml_path, cache_directory)
     assert response1.read() == response2.read()


+@pytest.mark.parametrize('gzip_response', [True, False])
+def test_fetch_openml_cache(monkeypatch, gzip_response, tmpdir):
+    def _mock_urlopen_raise(request):
+        raise ValueError('This mechanism intends to test correct cache'
+                         'handling. As such, urlopen should never be '
+                         'accessed. URL: %s' % request.get_full_url())
+    data_id = 2
+    cache_directory = str(tmpdir.mkdir('scikit_learn_data'))
+    _monkey_patch_webbased_functions(
+        monkeypatch, data_id, gzip_response)
+    X_fetched, y_fetched = fetch_openml(data_id=data_id, cache=True,
+                                        data_home=cache_directory,
+                                        return_X_y=True)
+
+    monkeypatch.setattr(sklearn.datasets.openml, 'urlopen',
+                        _mock_urlopen_raise)
+
+    X_cached, y_cached = fetch_openml(data_id=data_id, cache=True,
+                                      data_home=cache_directory,
+                                      return_X_y=True)
+    np.testing.assert_array_equal(X_fetched, X_cached)
+    np.testing.assert_array_equal(y_fetched, y_cached)
+
+
 @pytest.mark.parametrize('gzip_response', [True, False])
 def test_fetch_openml_notarget(monkeypatch, gzip_response):
     data_id = 61
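The new `test_fetch_openml_cache` test pins the fix down by fetching once to prime the cache and then monkeypatching `urlopen` to raise, so any further network access fails the test immediately. The same strategy works for any fetcher with an on-disk cache; here is a self-contained pytest sketch with a hypothetical `TinyFetcher` standing in for `fetch_openml` (it is not scikit-learn code):

import json
import os


class TinyFetcher(object):
    """Caches a 'downloaded' payload on disk; downloads at most once per resource."""

    def _download(self, resource):
        # stands in for urlopen(); the test below forbids a second call
        return json.dumps({'resource': resource, 'value': 42})

    def fetch(self, resource, cache_dir):
        path = os.path.join(cache_dir, resource + '.json')
        if not os.path.exists(path):          # cache miss: download and store
            with open(path, 'w') as f:
                f.write(self._download(resource))
        with open(path) as f:                 # always read back from the cache
            return json.load(f)


def test_fetch_uses_cache(monkeypatch, tmpdir):
    cache_dir = str(tmpdir.mkdir('cache'))
    fetcher = TinyFetcher()

    first = fetcher.fetch('iris', cache_dir)   # primes the cache

    def _raise_on_download(resource):
        raise AssertionError('cache miss: no download may happen after '
                             'the first fetch')

    # Mirror of the _mock_urlopen_raise trick above: any further download
    # attempt fails the test immediately.
    monkeypatch.setattr(fetcher, '_download', _raise_on_download)

    second = fetcher.fetch('iris', cache_dir)  # must be served from disk
    assert first == second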
