deepatdotnet
diff --git a/‎benchmarks/bench_covertype.py
Lines changed: 23 additions & 46 deletions b/‎benchmarks/bench_covertype.py
Lines changed: 23 additions & 46 deletions
diff --git a/‎doc/datasets/covtype.rst
Lines changed: 19 additions & 0 deletions b/‎doc/datasets/covtype.rst
Lines changed: 19 additions & 0 deletions
diff --git a/‎doc/datasets/index.rst
Lines changed: 1 addition & 0 deletions b/‎doc/datasets/index.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎sklearn/datasets/__init__.py
Lines changed: 2 additions & 0 deletions b/‎sklearn/datasets/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎sklearn/datasets/covtype.py
Lines changed: 101 additions & 0 deletions b/‎sklearn/datasets/covtype.py
Lines changed: 101 additions & 0 deletions
diff --git a/‎sklearn/datasets/tests/test_covtype.py
Lines changed: 32 additions & 0 deletions b/‎sklearn/datasets/tests/test_covtype.py
Lines changed: 32 additions & 0 deletions
@@ -44,25 +44,29 @@
 
 print __doc__
 
-# Author: Peter Prettenhoer <peter.prettenhofer@gmail.com>
+# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
 # License: BSD Style.
 
-# $Id$
-
-from time import time
+import logging
 import os
 import sys
-import numpy as np
+from time import time
 from optparse import OptionParser
 
+import numpy as np
+
+from sklearn.datasets import fetch_covtype
 from sklearn.svm import LinearSVC
 from sklearn.linear_model import SGDClassifier
 from sklearn.naive_bayes import GaussianNB
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
 from sklearn import metrics
 from sklearn.externals.joblib import Memory
-from sklearn.utils import check_random_state
+
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s %(levelname)s %(message)s')
+logger = logging.getLogger(__name__)
 
 op = OptionParser()
 op.add_option("--classifiers",
@@ -80,8 +84,7 @@
 # estimators.
 op.add_option("--random-seed",
               dest="random_seed", default=13, type=int,
-              help="Common seed used by random number generator."
-              )
+              help="Common seed used by random number generator.")
 
 op.print_help()
 
@@ -97,57 +100,31 @@
 joblib_cache_folder = os.path.join(bench_folder, 'bench_covertype_data')
 m = Memory(joblib_cache_folder, mmap_mode='r')
 
-# Set seed for rng
-rng = check_random_state(opts.random_seed)
-
 
 # Load the data, then cache and memmap the train/test split
 @m.cache
 def load_data(dtype=np.float32, order='F'):
-    ######################################################################
-    ## Download the data, if not already on disk
-    if not os.path.exists(original_archive):
-        # Download the data
-        import urllib
-        print "Downloading data, Please Wait (11MB)..."
-        opener = urllib.urlopen(
-            'http://archive.ics.uci.edu/ml/'
-            'machine-learning-databases/covtype/covtype.data.gz')
-        open(original_archive, 'wb').write(opener.read())
-
     ######################################################################
     ## Load dataset
     print("Loading dataset...")
-    import gzip
-    f = gzip.open(original_archive)
-    X = np.fromstring(f.read().replace(",", " "), dtype=dtype, sep=" ",
-                      count=-1)
-    X = X.reshape((581012, 55))
+    data = fetch_covtype(download_if_missing=True, shuffle=True,
+                         random_state=opts.random_seed)
+    X, y = data.data, data.target
     if order.lower() == 'f':
         X = np.asfortranarray(X)
-    f.close()
 
     # class 1 vs. all others.
-    y = np.ones(X.shape[0]) * -1
-    y[np.where(X[:, -1] == 1)] = 1
-    X = X[:, :-1]
+    y[np.where(y != 1)] = -1
 
     ######################################################################
     ## Create train-test split (as [Joachims, 2006])
-    print("Creating train-test split...")
-    idx = np.arange(X.shape[0])
-    rng.shuffle(idx)
-    train_idx = idx[:522911]
-    test_idx = idx[522911:]
+    logger.info("Creating train-test split...")
+    n_train = 522911
 
-    X_train = X[train_idx]
-    y_train = y[train_idx]
-    X_test = X[test_idx]
-    y_test = y[test_idx]
-
-    # free memory
-    del X
-    del y
+    X_train = X[:n_train]
+    y_train = y[:n_train]
+    X_test = X[n_train:]
+    y_test = y[n_train:]
 
     ######################################################################
     ## Standardize first 10 features (the numerical ones)
@@ -204,7 +181,7 @@ def benchmark(clf):
     'dual': False,
     'tol': 1e-3,
     "random_state": opts.random_seed,
-    }
+}
 classifiers['liblinear'] = LinearSVC(**liblinear_parameters)
 
 ######################################################################
@@ -218,7 +195,7 @@ def benchmark(clf):
     'n_iter': 2,
     'n_jobs': opts.n_jobs,
     "random_state": opts.random_seed,
-    }
+}
 classifiers['SGD'] = SGDClassifier(**sgd_parameters)
 
 ######################################################################
 
@@ -0,0 +1,19 @@
+
+.. _covtype:
+
+Forest covertypes
+=================
+
+The samples in this dataset correspond to 30×30m patches of forest in the US,
+collected for the task of predicting each patch's cover type,
+i.e. the dominant species of tree.
+There are seven covertypes, making this a multiclass classification problem.
+Each sample has 54 features, described on the
+`dataset's homepage <http://archive.ics.uci.edu/ml/datasets/Covertype>`_.
+Some of the features are boolean indicators,
+while others are discrete or continuous measurements.
+
+``sklearn.datasets.fetch_covtype`` will load the covertype dataset;
+it returns a ``Bunch`` object with the feature matrix in the ``data`` member
+and the target values in ``target``.
+The dataset will be downloaded from the web if necessary.
@@ -180,3 +180,4 @@ features::
 
 .. include:: labeled_faces.rst
 
+.. include:: covtype.rst
@@ -14,6 +14,7 @@
 from .base import clear_data_home
 from .base import load_sample_images
 from .base import load_sample_image
+from .covtype import fetch_covtype
 from .mlcomp import load_mlcomp
 from .lfw import load_lfw_pairs
 from .lfw import load_lfw_people
@@ -57,6 +58,7 @@
            'fetch_olivetti_faces',
            'fetch_species_distributions',
            'fetch_california_housing',
+           'fetch_covtype',
            'get_data_home',
            'load_20newsgroups',
            'load_boston',
 
@@ -0,0 +1,101 @@
+"""Forest covertype dataset.
+
+A classic dataset for classification benchmarks, featuring categorical and
+real-valued features.
+"""
+
+# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
+#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
+# License: 3-clause BSD.
+
+import errno
+from gzip import GzipFile
+from io import BytesIO
+import logging
+import os
+from os.path import exists, join
+from urllib2 import urlopen
+
+import numpy as np
+
+from .base import get_data_home
+from .base import Bunch
+from ..externals import joblib
+from ..utils import check_random_state
+
+
+URL = ('http://archive.ics.uci.edu/ml/'
+       'machine-learning-databases/covtype/covtype.data.gz')
+
+
+logger = logging.getLogger()
+
+
+def fetch_covtype(data_home=None, download_if_missing=True,
+                  random_state=None, shuffle=False):
+    """Load the covertype dataset, downloading it if necessary.
+
+    Parameters
+    ----------
+    data_home : string, optional
+        Specify another download and cache folder for the datasets. By default
+        all scikit learn data is stored in '~/scikit_learn_data' subfolders.
+
+    download_if_missing : boolean, default=True
+        If False, raise a IOError if the data is not locally available
+        instead of trying to download the data from the source site.
+
+    random_state : int, RandomState instance or None, optional (default=None)
+        Random state for shuffling the dataset.
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by `np.random`.
+
+    shuffle : bool, default=False
+        Whether to shuffle dataset.
+    """
+
+    data_home = get_data_home(data_home=data_home)
+    covtype_dir = join(data_home, "covertype")
+    samples_path = join(covtype_dir, "samples")
+    targets_path = join(covtype_dir, "targets")
+    available = exists(samples_path)
+
+    if download_if_missing and not available:
+        _mkdirp(covtype_dir)
+        logger.warn("Downloading %s" % URL)
+        f = BytesIO(urlopen(URL).read())
+        Xy = np.genfromtxt(GzipFile(fileobj=f), delimiter=',')
+
+        X = Xy[:, :-1]
+        y = Xy[:, -1].astype(np.int32)
+
+        joblib.dump(X, samples_path, compress=9)
+        joblib.dump(y, targets_path, compress=9)
+
+    try:
+        X, y
+    except NameError:
+        X = joblib.load(samples_path)
+        y = joblib.load(targets_path)
+
+    if shuffle:
+        ind = np.arange(X.shape[0])
+        rng = check_random_state(random_state)
+        rng.shuffle(ind)
+        X = X[ind]
+        y = y[ind]
+
+    return Bunch(data=X, target=y, DESCR=__doc__)
+
+
+def _mkdirp(d):
+    """Ensure directory d exists (like mkdir -p on Unix)
+    No guarantee that the directory is writable.
+    """
+    try:
+        os.makedirs(d)
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
@@ -0,0 +1,32 @@
+"""Test the covtype loader.
+
+Skipped if covtype is not already downloaded to data_home.
+"""
+
+import errno
+from sklearn.datasets import fetch_covtype
+from sklearn.utils.testing import assert_equal, SkipTest
+
+
+def fetch(*args, **kwargs):
+    return fetch_covtype(*args, download_if_missing=False, **kwargs)
+
+
+def test_fetch():
+    try:
+        data1 = fetch(shuffle=True, random_state=42)
+    except IOError as e:
+        if e.errno == errno.ENOENT:
+            raise SkipTest()
+
+    data2 = fetch(shuffle=True, random_state=37)
+
+    X1, X2 = data1.data, data2.data
+    assert_equal((581012, 54), X1.shape)
+    assert_equal(X1.shape, X2.shape)
+
+    assert_equal(X1.sum(), X2.sum())
+
+    y1, y2 = data1.target, data2.target
+    assert_equal((X1.shape[0],), y1.shape)
+    assert_equal((X1.shape[0],), y2.shape)
Original file line number	Diff line number	Diff line change
`@@ -180,3 +180,4 @@ features::`
`180`	`180`
`181`	`181`	`.. include:: labeled_faces.rst`
`182`	`182`
	`183`	`+.. include:: covtype.rst`