scikit-learn · ogrisel · Oct 18, 2010 · Oct 19, 2010 · Oct 19, 2010 · Oct 23, 2010
diff --git a/examples/sgd/mlcomp_sparse_document_classification_sgd.py b/examples/sgd/mlcomp_sparse_document_classification_sgd.py
@@ -0,0 +1,113 @@
+"""
+======================================================
+Classification of text documents using sparse features
+======================================================
+
+This is an example showing how scikit-learn can be used to classify
+documents by topics using a bag-of-words approach. This example uses
+a scipy.sparse matrix to store the features instead of standard numpy arrays.
+
+The dataset used in this example is the 20 newsgroups dataset and should be
+downloaded from http://mlcomp.org (free registration required):
+
+  http://mlcomp.org/datasets/379
+
+Once downloaded, unzip the archive somewhere on your filesystem. For instance::
+
+  % mkdir -p ~/data/mlcomp
+  % cd  ~/data/mlcomp
+  % unzip /path/to/dataset-379-20news-18828_XXXXX.zip
+
+You should get a folder ``~/data/mlcomp/379`` with a file named ``metadata`` and
+subfolders ``raw``, ``train`` and ``test`` holding the text documents organized by
+newsgroups.
+
+Then set the ``MLCOMP_DATASETS_HOME`` environment variable pointing to
+the root folder holding the uncompressed archive::
+
+  % export MLCOMP_DATASETS_HOME="~/data/mlcomp"
+
+Then you are ready to run this example using your favorite python shell::
+
+  % ipython examples/sgd/mlcomp_sparse_document_classification_sgd.py
+
+"""
+# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+# Author: Olivier Grisel <olivier.grisel@ensta.org>
+# License: Simplified BSD
+
+from time import time
+import sys
+import os
+import numpy as np
+# import pylab as pl
+
+from scikits.learn.datasets import load_mlcomp
+from scikits.learn.metrics import confusion_matrix
+
+# from scikits.learn.svm.sparse import LinearSVC
+from scikits.learn.sgd.sparse import SGD
+
+
+if 'MLCOMP_DATASETS_HOME' not in os.environ:
+    print "Please follow those instructions to get started:"
+    print __doc__
+    sys.exit(0)
+
+# Load the training set
+print "Loading 20 newsgroups training set... "
+t0 = time()
+news_train = load_mlcomp('20news-18828', 'train', sparse=True)
+print "done in %fs" % (time() - t0)
+
+print "Creating binary classification task\n"\
+      "alt.atheism vs. comp.graphics"
+target = news_train.target
+pos = 0 # alt.atheism
+neg = 1 # comp.graphics
+pos_idx = np.where(target == pos)[0]
+neg_idx = np.where(target == neg)[0]
+idx = np.concatenate((pos_idx, neg_idx))
+np.random.seed(13)
+np.random.shuffle(idx)
+data = news_train.data[idx]
+target = news_train.target[idx]
+
+print "num train docs: ", data.shape[0]
+print ""
+print "Training a linear SVM (hinge loss and L2 regularizer) using SGD:"
+
+clf = SGD(n_iter=50, alpha=0.00001, fit_intercept=True)
+print clf
+t0 = time()  # restart the clock so only the fit is timed, not data loading
+clf.fit(data, target)
+print "done in %fs" % (time() - t0)
+print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)
+
+print "Loading 20 newsgroups test set... "
+t0 = time()
+news_test = load_mlcomp('20news-18828', 'test', sparse=True)
+print "done in %fs" % (time() - t0)
+
+target = news_test.target
+pos_idx = np.where(target == pos)[0]
+neg_idx = np.where(target == neg)[0]
+idx = np.concatenate((pos_idx, neg_idx))
+data = news_test.data[idx]
+target = news_test.target[idx]
+
+print "Predicting the labels of the test set..."
+t0 = time()
+pred = clf.predict(data)
+print "done in %fs" % (time() - t0)
+print "Classification accuracy: %f" % (np.mean(pred == target) * 100)
+
+cm = confusion_matrix(target, pred)
+print "Confusion matrix:"
+print cm
+
+## # Show confusion matrix
+## pl.matshow(cm)
+## pl.title('Confusion matrix')
+## pl.colorbar()
+## pl.show()
diff --git a/scikits/learn/__init__.py b/scikits/learn/__init__.py
@@ -14,37 +14,54 @@
 """
 
 from .base import clone
-from . import cross_val
 from . import ball_tree
 from . import cluster
 from . import covariance
+from . import cross_val
 from . import datasets
-from . import gmm
+from . import fastica
+from . import feature_selection
 from . import glm
+from . import gmm
 from . import lda
 from . import metrics
+from . import pca
+from . import sgd
 from . import svm
-from . import feature_selection
 
 try:
     from numpy.testing import nosetester
     class NoseTester(nosetester.NoseTester):
         """ Subclass numpy's NoseTester to add doctests by default
         """
-        def test(self, label='fast', verbose=1, extra_argv=['--exe'], 
+        def test(self, label='fast', verbose=1, extra_argv=['--exe'],
                         doctests=True, coverage=False):
             return super(NoseTester, self).test(label=label, verbose=verbose,
                                     extra_argv=extra_argv,
                                     doctests=doctests, coverage=coverage)
-        
+
     test = NoseTester().test
     del nosetester
 except:
     pass
 
-__all__ = ['cross_val', 'ball_tree', 'cluster', 'covariance',
-           'datasets', 'gmm', 'glm', 'lda', 'metrics', 'svm',
-           'features', 'clone', 'test']
+__all__ = [
+    'ball_tree',
+    'clone',
+    'cluster',
+    'covariance',
+    'cross_val',
+    'datasets',
+    'features',
+    'glm',
+    'gmm',
+    'lda',
+    'logistic',
+    'metrics',
+    'sgd',
+    'svm',
+    'test',
+]
 
 __version__ = '0.6.git'
 
diff --git a/scikits/learn/setup.py b/scikits/learn/setup.py
@@ -10,6 +10,7 @@ def configuration(parent_package='', top_path=None):
     config = Configuration('learn', parent_package, top_path)
 
     config.add_subpackage('svm')
+    config.add_subpackage('sgd')
     config.add_subpackage('datasets')
     config.add_subpackage('feature_extraction')
     config.add_subpackage('feature_extraction/tests')

diff --git a/scikits/learn/sgd/__init__.py b/scikits/learn/sgd/__init__.py
@@ -0,0 +1,8 @@
+"""
+Module that implements Stochastic Gradient Descent related algorithms.
+
+See http://scikit-learn.sourceforge.net/modules/sgd.html for complete
+documentation.
+"""
+
+from . import sparse
diff --git a/scikits/learn/sgd/base.py b/scikits/learn/sgd/base.py
@@ -0,0 +1,97 @@
+# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
+#
+# License: BSD Style.
+"""Stochastic Gradient Descent (SGD) with sparse data. """
+
+from ..base import BaseEstimator
+
+
+class LinearModel(BaseEstimator):
+    """Linear Model trained by regularized Stochastic Gradient Descent
+
+    SGD works by iteratively minimizing (sample by sample) the sum of a
+    running estimate of a loss function (e.g. hinge loss or quadratic
+    loss) and a regularizer (e.g. the squared Euclidean (L2) norm of the
+    coefs) that encodes a priori knowledge of the distribution of the coefs
+    (e.g. a centered Gaussian distribution for the L2 regularizer).
+
+    Parameters
+    ----------
+    loss : str, ('hinge'|'log'|'modifiedhuber')
+        The loss function to be used.
+    penalty : str, ('l2'|'l1'|'elasticnet')
+        The penalty (aka regularization term) to be used.
+    alpha : float
+        Constant that multiplies the regularization term. Defaults to 0.0001
+    rho : float
+        The Elastic Net mixing parameter, with 0 < rho <= 1.
+    coef_ : ndarray of shape n_features
+        The initial coefficients to warm-start the optimization
+    intercept_ : float
+        The initial intercept to warm-start the optimization
+    fit_intercept : bool
+        Whether the intercept should be estimated or not. If False, the
+        data is assumed to be already centered.
+    n_iter : int
+        The number of passes over the training data (aka epochs).
+    shuffle : bool
+        Whether or not the training data should be shuffled after each epoch.
+        Defaults to False.
+
+    Attributes
+    ----------
+    `coef_` : array, shape = [n_features]
+        Weights assigned to the features.
+
+    `intercept_` : float
+        Constants in decision function.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
+    >>> Y = np.array([1, 1, 2, 2])
+    >>> from scikits.learn.sgd.sparse import SGD
+    >>> clf = SGD()
+    >>> clf.fit(X, Y)
+    SGD(loss='hinge', shuffle=False, fit_intercept=True, n_iter=5, penalty='l2',
+      coef_=array([-9.80373, -9.80373]), rho=1.0, alpha=0.0001, intercept_=0.1)
+    >>> print clf.predict([[-0.8, -1]])
+    [ 1.]
+
+    See also
+    --------
+    LinearSVC
+
+    """
+
+    def __init__(self, loss="hinge", penalty='l2', alpha=0.0001,
+                 rho=0.85, coef_=None, intercept_=0.0,
+                 fit_intercept=True, n_iter=5, shuffle=False):
+        self.loss = loss
+        self.penalty = penalty
+        self.alpha = alpha
+        self.rho = rho
+        self.coef_ = coef_
+        self.intercept_ = intercept_
+        self.fit_intercept = fit_intercept
+        self.n_iter = int(n_iter)
+        if self.n_iter <= 0:
+            raise ValueError("n_iter must be greater than zero.")
+        if not isinstance(shuffle, bool):
+            raise ValueError("shuffle must be either True or False")
+        self.shuffle = shuffle
+        self._get_loss_function()
+        self._get_penalty_type()
+
+    def _get_penalty_type(self):
+        """Resolve ``penalty`` to ``penalty_type``; l2/l1 also pin ``rho``."""
+        codes = {"l2": 2, "l1": 1, "elasticnet": 3}
+        if self.penalty not in codes:
+            raise ValueError("Penalty %s is not supported. " % self.penalty)
+        self.penalty_type = codes[self.penalty]
+        if self.penalty_type == 2:
+            self.rho = 1.0
+        elif self.penalty_type == 1:
+            self.rho = 0.0
+
diff --git a/scikits/learn/sgd/setup.py b/scikits/learn/sgd/setup.py
@@ -0,0 +1,19 @@
+from ConfigParser import ConfigParser
+
+def configuration(parent_package='', top_path=None):
+    from numpy.distutils.misc_util import Configuration
+    from numpy.distutils.system_info import get_standard_file
+    config = Configuration('sgd', parent_package, top_path)
+
+    site_cfg  = ConfigParser()
+    site_cfg.read(get_standard_file('site.cfg'))
+
+    # add other directories
+    config.add_subpackage('tests')
+    config.add_subpackage('sparse')
+
+    return config
+
+if __name__ == '__main__':
+    from numpy.distutils.core import setup
+    setup(**configuration(top_path='').todict())
diff --git a/scikits/learn/sgd/sparse/__init__.py b/scikits/learn/sgd/sparse/__init__.py
@@ -0,0 +1,10 @@
+"""
+Stochastic gradient descent with sparse data
+============================================
+
+scikits.learn.sgd.sparse is the sparse counterpart
+of scikits.learn.sgd
+
+"""
+
+from .sgd import SGD
diff --git a/scikits/learn/sgd/sparse/setup.py b/scikits/learn/sgd/sparse/setup.py
@@ -0,0 +1,26 @@
+from os.path import join
+import numpy
+from ConfigParser import ConfigParser
+
+def configuration(parent_package='', top_path=None):
+    from numpy.distutils.misc_util import Configuration
+    from numpy.distutils.system_info import get_standard_file
+
+    config = Configuration('sparse', parent_package, top_path)
+    site_cfg  = ConfigParser()
+    site_cfg.read(get_standard_file('site.cfg'))
+
+    config.add_extension('sgd_fast_sparse',
+                         sources=[join('src', 'sgd_fast_sparse.c')],
+                         include_dirs=[numpy.get_include()]
+                         )
+
+    # add other directories
+    # config.add_subpackage('tests')
+    return config
+
+if __name__ == '__main__':
+    from numpy.distutils.core import setup
+    setup(**configuration(top_path='').todict())
+
+