broken doctests by ogrisel · Pull Request #10 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

broken doctests #10

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions examples/sgd/mlcomp_sparse_document_classification_sgd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""
======================================================
Classification of text documents using sparse features
======================================================

This is an example showing how scikit-learn can be used to classify
documents by topics using a bag-of-words approach. This example uses
a scipy.sparse matrix to store the features instead of standard numpy arrays.

The dataset used in this example is the 20 newsgroups dataset and should be
downloaded from the http://mlcomp.org (free registration required):

http://mlcomp.org/datasets/379

Once downloaded unzip the archive somewhere on your filesystem. For instance in::

% mkdir -p ~/data/mlcomp
% cd ~/data/mlcomp
% unzip /path/to/dataset-379-20news-18828_XXXXX.zip

You should get a folder ``~/data/mlcomp/379`` with a file named ``metadata`` and
subfolders ``raw``, ``train`` and ``test`` holding the text documents organized by
newsgroups.

Then set the ``MLCOMP_DATASETS_HOME`` environment variable pointing to
the root folder holding the uncompressed archive::

% export MLCOMP_DATASETS_HOME="~/data/mlcomp"

Then you are ready to run this example using your favorite python shell::

% ipython examples/sgd/mlcomp_sparse_document_classification_sgd.py

"""
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

from time import time
import sys
import os
import numpy as np
# import pylab as pl

from scikits.learn.datasets import load_mlcomp
from scikits.learn.metrics import confusion_matrix

# from scikits.learn.svm.sparse import LinearSVC
from scikits.learn.sgd.sparse import SGD


if 'MLCOMP_DATASETS_HOME' not in os.environ:
print "Please follow those instructions to get started:"
print __doc__
sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
t0 = time()
news_train = load_mlcomp('20news-18828', 'train', sparse=True)
print "done in %fs" % (time() - t0)

print "Creating binary classification task\n"\
"alt.atheism vs. comp.graphics"
target = news_train.target
pos = 0 # alt.atheism
neg = 1 # comp.graphics
pos_idx = np.where(target == pos)[0]
neg_idx = np.where(target == neg)[0]
idx = np.concatenate((pos_idx, neg_idx))
np.random.seed(13)
np.random.shuffle(idx)
data = news_train.data[idx]
target = news_train.target[idx]

print "num train docs: ", data.shape[0]
print ""
print "Training a linear SVM (hinge loss and L2 regularizer) using SGD:"

clf = SGD(n_iter=50, alpha=0.00001, fit_intercept=True)
print clf

clf.fit(data, target)
print "done in %fs" % (time() - t0)
print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)

print "Loading 20 newsgroups test set... "
t0 = time()
news_test = load_mlcomp('20news-18828', 'test', sparse=True)
print "done in %fs" % (time() - t0)

target = news_test.target
pos_idx = np.where(target == pos)[0]
neg_idx = np.where(target == neg)[0]
idx = np.concatenate((pos_idx, neg_idx))
data = news_test.data[idx]
target = news_test.target[idx]

print "Predicting the labels of the test set..."
t0 = time()
pred = clf.predict(data)
print "done in %fs" % (time() - t0)
print "Classification accuracy: %f" % (np.mean(pred == target) * 100)

cm = confusion_matrix(target, pred)
print "Confusion matrix:"
print cm

## # Show confusion matrix
## pl.matshow(cm)
## pl.title('Confusion matrix')
## pl.colorbar()
## pl.show()
33 changes: 25 additions & 8 deletions scikits/learn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,37 +14,54 @@
"""

from .base import clone
from . import cross_val
from . import ball_tree
from . import cluster
from . import covariance
from . import cross_val
from . import datasets
from . import gmm
from . import fastica
from . import feature_selection
from . import glm
from . import gmm
from . import lda
from . import metrics
from . import pca
from . import sgd
from . import svm
from . import feature_selection

# numpy's nose-based test runner is optional: when it cannot be imported the
# package simply does not expose a ``test`` callable.
try:
    from numpy.testing import nosetester
except ImportError:
    # Only a missing runner is tolerated; any other error is a real bug and
    # is allowed to propagate instead of being silently swallowed.
    pass
else:
    class NoseTester(nosetester.NoseTester):
        """ Subclass numpy's NoseTester to add doctests by default
        """

        def test(self, label='fast', verbose=1, extra_argv=['--exe'],
                 doctests=True, coverage=False):
            """Run the test suite, forwarding all arguments to numpy's runner.

            Same signature as the parent method; only the defaults differ
            (``extra_argv=['--exe']`` and ``doctests=True``).
            """
            return super(NoseTester, self).test(label=label, verbose=verbose,
                                                extra_argv=extra_argv,
                                                doctests=doctests,
                                                coverage=coverage)

    test = NoseTester().test
    # Do not leak the helper module into the package namespace.
    del nosetester

__all__ = ['cross_val', 'ball_tree', 'cluster', 'covariance',
'datasets', 'gmm', 'glm', 'lda', 'metrics', 'svm',
'features', 'clone', 'test']
# Names exported by ``from scikits.learn import *``, kept in alphabetical
# order.
# NOTE(review): 'features' and 'logistic' are listed here but no matching
# ``from . import`` is visible in this module, while the imported 'fastica',
# 'feature_selection' and 'pca' are not listed -- confirm this is intended.
__all__ = [
'ball_tree',
'clone',
'cluster',
'covariance',
'cross_val',
'datasets',
'features',
'glm',
'gmm',
'lda',
'logistic',
'metrics',
'sgd',
'svm',
'test',
]

__version__ = '0.6.git'

1 change: 1 addition & 0 deletions scikits/learn/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def configuration(parent_package='', top_path=None):
config = Configuration('learn', parent_package, top_path)

config.add_subpackage('svm')
config.add_subpackage('sgd')
config.add_subpackage('datasets')
config.add_subpackage('feature_extraction')
config.add_subpackage('feature_extraction/tests')
Expand Down
8 changes: 8 additions & 0 deletions scikits/learn/sgd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""
Module that implements Stochastic Gradient Descent related algorithms.

See http://scikit-learn.sourceforge.net/modules/sgd.html for complete
documentation.
"""

from . import sparse
97 changes: 97 additions & 0 deletions scikits/learn/sgd/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#
# License: BSD Style.
"""Stochastic Gradient Descent (SGD) with sparse data. """

from ..base import BaseEstimator


class LinearModel(BaseEstimator):
    """Linear Model trained by regularized Stochastic Gradient Descent

    SGD works by iteratively minimizing (sample by sample) the sum of a
    running estimate of a loss function (e.g. hinge loss or quadratic
    loss) and a regularizer (e.g. the squared euclidean (L2) norm of the
    coefs) that encodes a priori knowledge of the distribution of the coefs
    (e.g. centered gaussian distribution for the L2 regularizer).

    This class only stores and validates the hyper-parameters; resolving
    the loss function is delegated to the ``_get_loss_function`` hook that
    concrete estimators must provide.

    Parameters
    ----------
    loss : str, ('hinge'|'log'|'modifiedhuber')
        The loss function to be used. Defaults to 'hinge'.
    penalty : str, ('l2'|'l1'|'elasticnet')
        The penalty (aka regularization term) to be used. Defaults to 'l2'.
    alpha : float
        Constant that multiplies the regularization term. Defaults to 0.0001
    rho : float
        The Elastic Net mixing parameter, with 0 <= rho <= 1. Defaults to
        0.85. The given value is only kept for penalty='elasticnet': it is
        overridden to 1.0 for penalty='l2' and to 0.0 for penalty='l1'.
    coef_ : ndarray of shape n_features
        The initial coefficients to warm-start the optimization
    intercept_ : float
        The initial intercept to warm-start the optimization
    fit_intercept: bool
        Whether the intercept should be estimated or not. If False, the
        data is assumed to be already centered.
    n_iter: int
        The number of passes over the training data (aka epochs).
        Must be greater than zero.
    shuffle: bool
        Whether or not the training data should be shuffled after each epoch.
        Defaults to False.

    Attributes
    ----------
    `coef_` : array, shape = [n_features]
        Weights assigned to the features.

    `intercept_` : float
        Constants in decision function.

    Raises
    ------
    ValueError
        If ``n_iter`` is not greater than zero, if ``shuffle`` is not a
        bool or if ``penalty`` is not one of the supported values.

    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
    >>> Y = np.array([1, 1, 2, 2])
    >>> from scikits.learn.sgd.sparse import SGD
    >>> clf = SGD()
    >>> clf.fit(X, Y)
    SGD(loss='hinge', shuffle=False, fit_intercept=True, n_iter=5, penalty='l2',
    coef_=array([-9.80373, -9.80373]), rho=1.0, alpha=0.0001, intercept_=0.1)
    >>> print clf.predict([[-0.8, -1]])
    [ 1.]

    See also
    --------
    LinearSVC

    """

    def __init__(self, loss="hinge", penalty='l2', alpha=0.0001,
                 rho=0.85, coef_=None, intercept_=0.0,
                 fit_intercept=True, n_iter=5, shuffle=False):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.rho = rho
        self.coef_ = coef_
        self.intercept_ = intercept_
        self.fit_intercept = fit_intercept
        self.n_iter = int(n_iter)
        if self.n_iter <= 0:
            raise ValueError("n_iter must be greater than zero.")
        if not isinstance(shuffle, bool):
            raise ValueError("shuffle must be either True or False")
        self.shuffle = shuffle
        # Resolve the string options once, at construction time, so that
        # invalid values are reported immediately.
        self._get_loss_function()
        self._get_penalty_type()

    def _get_loss_function(self):
        """Resolve ``self.loss`` into a concrete loss function.

        Hook called at the end of ``__init__``. This base class does not
        know any loss implementation: concrete estimators must override it.

        Raises
        ------
        NotImplementedError
            Always, unless overridden by a subclass.
        """
        raise NotImplementedError("%s must implement _get_loss_function."
                                  % self.__class__.__name__)

    def _get_penalty_type(self):
        """Translate ``self.penalty`` into its integer code ``penalty_type``.

        Sets ``self.penalty_type`` (2 for 'l2', 1 for 'l1', 3 for
        'elasticnet') and forces ``self.rho`` to 1.0 for 'l2' and 0.0 for
        'l1'; ``rho`` is left untouched for 'elasticnet'.

        Raises
        ------
        ValueError
            If ``self.penalty`` is not a supported penalty name.
        """
        penalty_types = {"l2": 2, "l1": 1, "elasticnet": 3}
        # Keep the try body minimal: only the lookup can raise KeyError.
        try:
            self.penalty_type = penalty_types[self.penalty]
        except KeyError:
            raise ValueError("Penalty %s is not supported. " % self.penalty)
        if self.penalty_type == 2:
            self.rho = 1.0
        elif self.penalty_type == 1:
            self.rho = 0.0

19 changes: 19 additions & 0 deletions scikits/learn/sgd/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from ConfigParser import ConfigParser

def configuration(parent_package='', top_path=None):
    """Build the numpy.distutils configuration of the ``sgd`` package.

    Parameters
    ----------
    parent_package : str
        Name of the parent package, forwarded to ``Configuration``.
    top_path : str or None
        Top-level path, forwarded to ``Configuration``.

    Returns
    -------
    config : numpy.distutils.misc_util.Configuration
        Configuration declaring the ``tests`` and ``sparse`` subpackages.
    """
    # Imported lazily so that importing this module does not require
    # numpy.distutils.
    from numpy.distutils.misc_util import Configuration

    # No ``site.cfg`` lookup here: this package declares no extension whose
    # build options could depend on it.
    config = Configuration('sgd', parent_package, top_path)

    # add other directories
    config.add_subpackage('tests')
    config.add_subpackage('sparse')

    return config


if __name__ == '__main__':
    from numpy.distutils.core import setup
    setup(**configuration(top_path='').todict())
10 changes: 10 additions & 0 deletions scikits/learn/sgd/sparse/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""
Stochastic gradient descent with sparse data
============================================

scikits.learn.sgd.sparse is the sparse counterpart
of scikits.learn.sgd

"""

from .sgd import SGD
26 changes: 26 additions & 0 deletions scikits/learn/sgd/sparse/setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from os.path import join
import numpy
from ConfigParser import ConfigParser

def configuration(parent_package='', top_path=None):
    """Build the numpy.distutils configuration of the ``sgd.sparse`` package.

    Parameters
    ----------
    parent_package : str
        Name of the parent package, forwarded to ``Configuration``.
    top_path : str or None
        Top-level path, forwarded to ``Configuration``.

    Returns
    -------
    config : numpy.distutils.misc_util.Configuration
        Configuration declaring the ``sgd_fast_sparse`` C extension.
    """
    # Imported lazily so that importing this module does not require
    # numpy.distutils.
    from numpy.distutils.misc_util import Configuration

    # No ``site.cfg`` lookup here: the extension below only needs the numpy
    # headers, none of its build options are read from that file.
    config = Configuration('sparse', parent_package, top_path)

    config.add_extension('sgd_fast_sparse',
                         sources=[join('src', 'sgd_fast_sparse.c')],
                         include_dirs=[numpy.get_include()]
                         )

    # add other directories
    # config.add_subpackage('tests')
    return config


if __name__ == '__main__':
    from numpy.distutils.core import setup
    setup(**configuration(top_path='').todict())


Loading
0