WIP: Classes support for all classifiers by erg · Pull Request #1304 · scikit-learn/scikit-learn · GitHub

WIP: Classes support for all classifiers #1304


Closed
erg wants to merge 11 commits into from
109 changes: 109 additions & 0 deletions sklearn/base.py
@@ -5,9 +5,11 @@
import copy
import inspect
import numpy as np
from collections import Sequence
from scipy import sparse

from .metrics import r2_score
from .utils.fixes import unique


###############################################################################
@@ -257,10 +259,117 @@ def __str__(self):
)


###############################################################################
def is_label_indicator_matrix(y):
    return hasattr(y, "shape") and len(y.shape) == 2


def is_multilabel(y):
    # the explicit check for ndarray is for forward compatibility; future
Member
Sorry, I don't understand this. Can you elaborate?

Contributor Author
Just moved it out of preprocessing.py into base.py.

ebb42e78 sklearn/preprocessing/preprocessing.py (Mathieu Blondel 2012-01-21 05:29:42 +0900 593) def _is_label_indicator_matrix(y):
ebb42e78 sklearn/preprocessing/preprocessing.py (Mathieu Blondel 2012-01-21 05:29:42 +0900 594)     return hasattr(y, "shape") and len(y.shape) == 2
ebb42e78 sklearn/preprocessing/preprocessing.py (Mathieu Blondel 2012-01-21 05:29:42 +0900 595) 
ebb42e78 sklearn/preprocessing/preprocessing.py (Mathieu Blondel 2012-01-21 05:29:42 +0900 596) 
ebb42e78 sklearn/preprocessing/preprocessing.py (Mathieu Blondel 2012-01-21 05:29:42 +0900 597) def _is_multilabel(y):
7b060267 sklearn/preprocessing.py               (Lars Buitinck   2012-05-18 09:25:21 +0200 598)     # the explicit check for ndarray is for forward compatibility; future
7b060267 sklearn/preprocessing.py               (Lars Buitinck   2012-05-18 09:25:21 +0200 599)     # versions of Numpy might want to register ndarray as a Sequence
6d041623 sklearn/preprocessing.py               (Lars Buitinck   2012-05-18 12:28:30 +0200 600)     return not isinstance(y[0], np.ndarray) and isinstance(y[0], Sequence) \
855257f6 sklearn/preprocessing.py               (Lars Buitinck   2012-05-24 16:08:07 +0200 601)        and not isinstance(y[0], basestring) \
7b060267 sklearn/preprocessing.py               (Lars Buitinck   2012-05-18 09:25:21 +0200 602)         or _is_label_indicator_matrix(y)

    # versions of Numpy might want to register ndarray as a Sequence
    return not isinstance(y[0], np.ndarray) and isinstance(y[0], Sequence) \
        and not isinstance(y[0], basestring) \
        or is_label_indicator_matrix(y)
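
A minimal sketch of what these two predicates accept; the helpers are copied from the diff above so the snippet runs standalone (Python 2, matching the branch's use of basestring):

import numpy as np
from collections import Sequence  # Python 2-era import, as in the diff

def is_label_indicator_matrix(y):
    return hasattr(y, "shape") and len(y.shape) == 2

def is_multilabel(y):
    return (not isinstance(y[0], np.ndarray) and isinstance(y[0], Sequence)
            and not isinstance(y[0], basestring)) or is_label_indicator_matrix(y)

print(is_multilabel(np.array([0, 1, 2, 1])))      # False: flat label vector
print(is_multilabel([[0, 1], [2], [0, 2]]))       # True: sequence of sequences
print(is_multilabel(np.array([[1, 0], [0, 1]])))  # True: 2-d indicator matrix
print(is_multilabel(["ham", "spam"]))             # False: strings are Sequences,
                                                  # hence the basestring guard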


def is_iterable(y):
    """Check if an item implements the __iter__ protocol."""
    try:
        iter(y)
        return True
    except TypeError:
        return False


###############################################################################
class ClassifierMixin(object):
    """Mixin class for all classifiers in scikit-learn."""

    def _check_classes(self, classes):
        """Common error checking for the prepare functions below."""
        if len(classes) == 0:
            raise ValueError("no output classes")
        if is_multilabel(classes):
            for c in classes:
                if len(c) != len(set(c)):
                    raise ValueError("duplicate class label")
        else:
            if len(classes) != len(np.unique(classes)):
                raise ValueError("duplicate class label")

    def _prepare_single_label_classes(self, y):
        """Set self.classes_, self.y_inverse_ and self.n_classes_."""
        if self.classes is None:
            self.classes_, self.y_inverse_ = unique(y, return_inverse=True)
            self.n_classes_ = len(self.classes_)
        else:
            self._check_classes(self.classes)
            self.classes_ = self.classes
            classidx = dict((v, k) for k, v in enumerate(self.classes))
            # check that all labels in y belong to the expected classes
            y_inverse = [classidx.get(label, None) for label in y]
            if any(idx is None for idx in y_inverse):
                bad_classes = sorted(set(label for label, idx
                                         in zip(y, y_inverse) if idx is None))
                raise ValueError("unknown classes in y vector: %s"
                                 % bad_classes)
            self.y_inverse_ = np.array(y_inverse)
            self.n_classes_ = len(self.classes_)
        return self.y_inverse_
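
A sketch of the intended single-label behaviour under this branch. The Toy class below is hypothetical, defined only to exercise the mixin: with classes declared, labels are mapped to indices into the fixed classes_ array, a declared-but-unseen class keeps its slot, and an undeclared label raises.

import numpy as np
from sklearn.base import ClassifierMixin

class Toy(ClassifierMixin):
    """Hypothetical stub, just enough state to call the mixin methods."""
    def __init__(self, classes=None):
        self.classes = np.asarray(classes) if classes is not None else None

clf = Toy(classes=[0, 1, 2])
print(clf._prepare_single_label_classes(np.array([0, 2, 2])))  # [0 2 2]
print(clf.n_classes_)          # 3: class 1 keeps its slot although unseen

clf = Toy()                    # no declared classes: discover them from y
clf._prepare_single_label_classes(np.array(['b', 'a', 'b']))
print(clf.classes_)            # ['a' 'b']
print(clf.y_inverse_)          # [1 0 1]

Toy(classes=[0, 1])._prepare_single_label_classes(np.array([0, 3]))
# ValueError: unknown classes in y vector: [3]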

    def _prepare_multilabel_classes(self, y):
        """Ensure that none of the output classes are different from
        the ones that we expect if the constructor had a classes parameter.
        """
        y_classes = []
        y_n_classes = []
        self.n_outputs_ = y.shape[1]

        for k in xrange(self.n_outputs_):
            # do not shadow the `unique` imported from utils.fixes above
            unique_k = np.unique(y[:, k])
            y_classes.append(unique_k)
            y_n_classes.append(unique_k.shape[0])

        # discover classes
        if self.classes is None:
            self.classes_ = np.asarray(y_classes)
            self.n_classes_ = np.asarray(y_n_classes)
            self.n_outputs_ = len(y_classes)

        # check known classes
        else:
            if is_multilabel(self.classes):
                self.classes_ = self.classes
            else:
                self.classes_ = np.asarray([self.classes])

            for x in self.classes_:
                if not is_iterable(x):
                    raise ValueError(
                        "classes in a multilabel class list must be sequences")

            self.classes_ = [np.unique(x) for x in self.classes_]
            self.n_classes_ = np.asarray([len(x) for x in self.classes_])
            self.n_outputs_ = len(self.classes_)
            for i in xrange(self.n_outputs_):
                diff = set(y_classes[i]) - set(self.classes_[i])
                if diff:
                    raise ValueError("classes passed to the constructor and "
                                     "found in y don't match: %s" % diff)

        for k in xrange(self.n_outputs_):
            y[:, k] = np.searchsorted(self.classes_[k], y[:, k])
        return y
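
The multilabel counterpart, again with a hypothetical Toy stub: classes is one class list per output column, and each column of y is rewritten to indices into its column's sorted class list.

import numpy as np
from sklearn.base import ClassifierMixin

class Toy(ClassifierMixin):
    """Hypothetical stub, as in the previous sketch."""
    def __init__(self, classes=None):
        self.classes = classes     # here: one class list per output column

clf = Toy(classes=[[10, 20], [5, 6, 7]])
y = np.array([[10, 7],
              [20, 5]])
print(clf._prepare_classes(y))   # dispatches to the multilabel path (2-d y)
# [[0 2]
#  [1 0]]  -- each label replaced by its index in its column's class list
print(clf.n_classes_)            # [2 3]
print(clf.n_outputs_)            # 2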

    def _prepare_classes(self, y):
        """Encode y, dispatching on whether it is multilabel."""
        if is_multilabel(y):
            return self._prepare_multilabel_classes(y)
        else:
            return self._prepare_single_label_classes(y)

    def _check_found_classes(self, found_classes):
        if self.classes is not None:
            diff = set(found_classes) - set(self.classes)
            if diff:
                raise ValueError("unknown classes in y vector: %s" % diff)

    def score(self, X, y):
        """Returns the mean accuracy on the given test data and labels.

19 changes: 11 additions & 8 deletions sklearn/ensemble/forest.py
@@ -74,6 +74,7 @@ def _parallel_build_trees(n_trees, forest, X, y,
    tree = forest._make_estimator(append=False)
    tree.set_params(compute_importances=forest.compute_importances)
    tree.set_params(random_state=check_random_state(seed))
    tree.classes = forest.classes

    if forest.bootstrap:
        n_samples = X.shape[0]
@@ -190,6 +191,7 @@ def __init__(self, base_estimator,
                 compute_importances=False,
                 oob_score=False,
                 n_jobs=1,
                 classes=None,
                 random_state=None,
                 verbose=0):
        super(BaseForest, self).__init__(
@@ -202,11 +204,10 @@ def __init__(self, base_estimator,
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.classes = np.asarray(classes) if classes is not None else None

        self.n_features_ = None
        self.n_outputs_ = None
        self.classes_ = None
        self.n_classes_ = None
        self.feature_importances_ = None

        self.verbose = verbose
@@ -285,12 +286,7 @@ def fit(self, X, y):

        if isinstance(self.base_estimator, ClassifierMixin):
            y = np.copy(y)

            for k in xrange(self.n_outputs_):
                unique = np.unique(y[:, k])
                self.classes_.append(unique)
                self.n_classes_.append(unique.shape[0])
                y[:, k] = np.searchsorted(unique, y[:, k])
            y = self._prepare_classes(y)

        if getattr(y, "dtype", None) != DTYPE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)
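
A usage sketch for the keyword this hunk threads through (the classes= parameter exists only on this branch; it is not released scikit-learn API): declaring the label universe up front is meant to keep every tree, and a forest trained on a label-poor batch, on one shared encoding.

import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.array([[0.], [1.], [4.], [5.]])
y = np.array([0, 0, 2, 2])                 # class 1 absent from this batch
clf = RandomForestClassifier(n_estimators=5, classes=[0, 1, 2])
clf.fit(X, y)
print(clf.classes_)     # keeps a slot for the declared-but-unseen class 1
print(clf.n_classes_)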
@@ -411,6 +407,7 @@ def __init__(self, base_estimator,
                 compute_importances=False,
                 oob_score=False,
                 n_jobs=1,
                 classes=None,
                 random_state=None,
                 verbose=0):

@@ -422,6 +419,7 @@
            compute_importances=compute_importances,
            oob_score=oob_score,
            n_jobs=n_jobs,
            classes=classes,
            random_state=random_state,
            verbose=verbose)

@@ -565,6 +563,7 @@ def __init__(self, base_estimator,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            classes=None,
            verbose=verbose)

    def predict(self, X):
@@ -718,6 +717,7 @@ def __init__(self, n_estimators=10,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 classes=None,
                 verbose=0):
        super(RandomForestClassifier, self).__init__(
            base_estimator=DecisionTreeClassifier(),
@@ -730,6 +730,7 @@ def __init__(self, n_estimators=10,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            classes=classes,
            verbose=verbose)

        self.criterion = criterion
@@ -997,6 +998,7 @@ def __init__(self, n_estimators=10,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 classes=None,
                 verbose=0):
        super(ExtraTreesClassifier, self).__init__(
            base_estimator=ExtraTreeClassifier(),
@@ -1009,6 +1011,7 @@
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            classes=classes,
            verbose=verbose)

        self.criterion = criterion
17 changes: 6 additions & 11 deletions sklearn/lda.py
@@ -70,9 +70,10 @@ class LDA(BaseEstimator, ClassifierMixin, TransformerMixin):

"""

def __init__(self, n_components=None, priors=None):
def __init__(self, n_components=None, priors=None, classes=None):
self.n_components = n_components
self.priors = np.asarray(priors) if priors is not None else None
self.classes = np.asarray(classes) if classes is not None else None

        if self.priors is not None:
            if (self.priors < 0).any():
@@ -97,13 +98,14 @@ def fit(self, X, y, store_covariance=False, tol=1.0e-4):
        and stored in `self.covariance_` attribute.
        """
        X, y = check_arrays(X, y, sparse_format='dense')
        self.classes_, y = unique(y, return_inverse=True)
        self._prepare_classes(y)

        n_samples, n_features = X.shape
        n_classes = len(self.classes_)
        if n_classes < 2:
            raise ValueError('y has less than 2 classes')
        if self.priors is None:
            self.priors_ = np.bincount(y) / float(n_samples)
            self.priors_ = np.bincount(self.y_inverse_) / float(n_samples)
        else:
            self.priors_ = self.priors

@@ -114,7 +116,7 @@ def fit(self, X, y, store_covariance=False, tol=1.0e-4):
        if store_covariance:
            cov = np.zeros((n_features, n_features))
        for ind in xrange(n_classes):
            Xg = X[y == ind, :]
            Xg = X[self.y_inverse_ == ind, :]
            meang = Xg.mean(0)
            means.append(meang)
            # centered group data
@@ -169,13 +171,6 @@ def fit(self, X, y, store_covariance=False, tol=1.0e-4):
            np.log(self.priors_)
        return self
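
A sketch of the resulting LDA behaviour, assuming this branch: y is encoded through _prepare_classes, so the empirical priors are computed from y_inverse_ and stay aligned with classes_.

import numpy as np
from sklearn.lda import LDA

X = np.array([[0., 0.], [0., 1.], [2., 0.], [2., 1.]])
y = np.array(['neg', 'neg', 'pos', 'pos'])
clf = LDA(classes=['neg', 'pos']).fit(X, y)
print(clf.classes_)    # ['neg' 'pos']
print(clf.priors_)     # [ 0.5  0.5], from bincount over y_inverse_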

    @property
    def classes(self):
        warnings.warn("LDA.classes is deprecated and will be removed in 0.14. "
                      "Use LDA.classes_ instead.", DeprecationWarning,
                      stacklevel=2)
        return self.classes_

    def _decision_function(self, X):
        X = array2d(X)
        # center and scale data
38 changes: 25 additions & 13 deletions sklearn/naive_bayes.py
@@ -34,6 +34,9 @@ class BaseNB(BaseEstimator, ClassifierMixin):

    __metaclass__ = ABCMeta

    def __init__(self, classes=None):
        self.classes = np.asarray(classes) if classes is not None else None

    @abstractmethod
    def _joint_log_likelihood(self, X):
        """Compute the unnormalized posterior log probability of X
@@ -134,6 +137,9 @@ class GaussianNB(BaseNB):
    [1]
    """

    def __init__(self, classes=None):
        super(GaussianNB, self).__init__(classes)

    def fit(self, X, y):
        """Fit Gaussian Naive Bayes according to X, y

@@ -159,14 +165,13 @@ def fit(self, X, y):
        if n_samples != y.shape[0]:
            raise ValueError("X and y have incompatible shapes")

        self.classes_ = unique_y = np.unique(y)
        n_classes = unique_y.shape[0]
        self._prepare_classes(y)

        self.theta_ = np.zeros((n_classes, n_features))
        self.sigma_ = np.zeros((n_classes, n_features))
        self.class_prior_ = np.zeros(n_classes)
        self.theta_ = np.zeros((self.n_classes_, n_features))
        self.sigma_ = np.zeros((self.n_classes_, n_features))
        self.class_prior_ = np.zeros(self.n_classes_)
        epsilon = 1e-9
        for i, y_i in enumerate(unique_y):
        for i, y_i in enumerate(self.classes_):
            self.theta_[i, :] = np.mean(X[y == y_i, :], axis=0)
            self.sigma_[i, :] = np.var(X[y == y_i, :], axis=0) + epsilon
            self.class_prior_[i] = np.float(np.sum(y == y_i)) / n_samples
@@ -183,6 +188,7 @@ def _joint_log_likelihood(self, X):
            joint_log_likelihood.append(jointi + n_ij)

        joint_log_likelihood = np.array(joint_log_likelihood).T
        joint_log_likelihood[np.isnan(joint_log_likelihood)] = -np.inf
        return joint_log_likelihood
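
Why the NaN guard above matters, in a runnable sketch (assuming this branch's GaussianNB): a declared-but-unseen class gets an empty slice, so its theta_/sigma_ rows come out NaN and its prior is 0; the guard turns the resulting NaN joint log-likelihoods into -inf, so the class simply never wins at predict time. numpy will warn about the empty-slice mean/var.

import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-2.], [-1.], [1.], [2.]])
y = np.array([0, 0, 2, 2])                  # class 1 declared but unseen
clf = GaussianNB(classes=[0, 1, 2]).fit(X, y)
print(clf.classes_)        # [0 1 2]
print(clf.class_prior_)    # [ 0.5  0.   0.5]
print(clf.predict(X))      # never predicts 1: its log-likelihood is -inf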


@@ -194,6 +200,9 @@ class BaseDiscreteNB(BaseNB):
        __init__
        _joint_log_likelihood(X) as per BaseNB
    """
    def __init__(self, classes=None):
        super(BaseDiscreteNB, self).__init__(classes)

    def fit(self, X, y, sample_weight=None, class_prior=None):
        """Fit Naive Bayes classifier according to X, y
@@ -221,10 +230,11 @@ def fit(self, X, y, sample_weight=None, class_prior=None):
"""
X = atleast2d_or_csr(X)

labelbin = LabelBinarizer()
labelbin = LabelBinarizer(classes=self.classes)
Y = labelbin.fit_transform(y)
self._check_found_classes(labelbin.classes_)
self.classes_ = labelbin.classes_
n_classes = len(self.classes_)
self.n_classes_ = len(self.classes_)
if Y.shape[1] == 1:
Y = np.concatenate((1 - Y, Y), axis=1)

@@ -238,8 +248,8 @@ def fit(self, X, y, sample_weight=None, class_prior=None):
        if sample_weight is not None:
            Y *= array2d(sample_weight).T

        if class_prior:
            if len(class_prior) != n_classes:
        if class_prior is not None:
            if len(class_prior) != self.n_classes_:
                raise ValueError(
                    "Number of priors must match number of classes")
            self.class_log_prior_ = np.log(class_prior)
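
The is not None change is not cosmetic; a short illustration of why the old truthiness test breaks once callers pass arrays:

import numpy as np

class_prior = np.array([0.3, 0.7])
try:
    if class_prior:               # the old test
        pass
except ValueError as e:
    print(e)   # "The truth value of an array with more than one element
               #  is ambiguous..."

if class_prior is not None:       # the replacement
    print(np.log(class_prior))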
@@ -248,7 +258,7 @@ def fit(self, X, y, sample_weight=None, class_prior=None):
            y_freq = Y.sum(axis=0)
            self.class_log_prior_ = np.log(y_freq) - np.log(y_freq.sum())
        else:
            self.class_log_prior_ = np.zeros(n_classes) - np.log(n_classes)
            self.class_log_prior_ = np.zeros(self.n_classes_) - np.log(self.n_classes_)

        N_c, N_c_i = self._count(X, Y)

@@ -337,7 +347,8 @@ class MultinomialNB(BaseDiscreteNB):
    Tackling the poor assumptions of naive Bayes text classifiers, ICML.
    """

    def __init__(self, alpha=1.0, fit_prior=True):
    def __init__(self, alpha=1.0, fit_prior=True, classes=None):
        super(MultinomialNB, self).__init__(classes)
        self.alpha = alpha
        self.fit_prior = fit_prior

@@ -401,7 +412,8 @@ class BernoulliNB(BaseDiscreteNB):
    naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS).
    """

    def __init__(self, alpha=1.0, binarize=.0, fit_prior=True):
    def __init__(self, alpha=1.0, binarize=.0, fit_prior=True, classes=None):
        super(BernoulliNB, self).__init__(classes)
        self.alpha = alpha
        self.binarize = binarize
        self.fit_prior = fit_prior
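
Finally, a hypothetical end-to-end use of these constructors under this branch (it also relies on this PR's LabelBinarizer(classes=...), whose diff sits elsewhere in the PR): a declared label universe keeps predict_proba columns aligned across batches, even when a label never occurs in training.

import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[2, 0], [0, 3], [1, 1]])
y = np.array(['ham', 'spam', 'ham'])           # 'unseen' never occurs here
clf = MultinomialNB(classes=['ham', 'spam', 'unseen'])
clf.fit(X, y)
print(clf.classes_)                  # all three declared labels
print(clf.predict_proba(X).shape)    # (3, 3): one column per declared class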