diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
index 43639e0b7bd5d..3029cfaf3ce8c 100644
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -25,7 +25,7 @@
 from .preprocessing import binarize, LabelBinarizer
 from .utils import array2d, atleast2d_or_csr
 from .utils.extmath import safe_sparse_dot, logsumexp
-from .utils import check_arrays
+from .utils import check_arrays, compute_class_weight

 __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB']

@@ -211,10 +211,6 @@ def fit(self, X, y, sample_weight=None, class_prior=None):
         sample_weight : array-like, shape = [n_samples], optional
             Weights applied to individual samples (1. for unweighted).

-        class_prior : array, shape [n_classes]
-            Custom prior probability per class.
-            Overrides the fit_prior parameter.
-
         Returns
         -------
         self : object
@@ -225,7 +221,6 @@ def fit(self, X, y, sample_weight=None, class_prior=None):
         labelbin = LabelBinarizer()
         Y = labelbin.fit_transform(y)
         self.classes_ = labelbin.classes_
-        n_classes = len(self.classes_)
         if Y.shape[1] == 1:
             Y = np.concatenate((1 - Y, Y), axis=1)

@@ -243,20 +238,22 @@ def fit(self, X, y, sample_weight=None, class_prior=None):
             warnings.warn('class_prior is deprecated in fit function and will '
                           'be removed in version 0.15. Use the `__init__` '
                           'parameter class_weight instead.')
+            class_weight = class_prior
         else:
-            class_prior = self.class_weight
-
-        if class_prior:
-            if len(class_prior) != n_classes:
-                raise ValueError("Number of priors must match number of"
-                                 " classes.")
-            self.class_log_prior_ = np.log(class_prior)
-        elif self.fit_prior:
-            # empirical prior, with sample_weight taken into account
-            y_freq = Y.sum(axis=0)
-            self.class_log_prior_ = np.log(y_freq) - np.log(y_freq.sum())
-        else:
-            self.class_log_prior_ = np.zeros(n_classes) - np.log(n_classes)
+            class_weight = self.class_weight
+
+        if self.fit_prior is not None:
+            warnings.warn('fit_prior is deprecated and will be removed in '
+                          'version 0.15. Use the `__init__` parameter '
+                          'class_weight instead.')
+            if self.fit_prior is False:
+                class_weight = None
+            else:
+                class_weight = 'auto'
+
+        class_weight = compute_class_weight(class_weight, self.classes_, y)
+        class_weight /= class_weight.sum()
+        self.class_log_prior_ = np.log(class_weight)

         # N_c_i is the count of feature i in all samples of class c.
         # N_c is the denominator.
@@ -295,13 +292,12 @@ class MultinomialNB(BaseDiscreteNB):
         Additive (Laplace/Lidstone) smoothing parameter
         (0 for no smoothing).

-    fit_prior : boolean
-        Whether to learn class prior probabilities or not.
-        If false, a uniform prior will be used.
-
-    class_weight : array-like, size=[n_classes,]
-        Prior probabilities of the classes. If specified the priors are not
-        adjusted according to the data.
+    class_weight : 'auto', None or array-like, default='auto'
+        Prior probabilities of the classes. 'auto' means class priors
+        are estimated from the data, None means uniform priors
+        over all classes.
+        If an array is given, it must have shape [n_classes,] and
+        explicitly specifies the class priors.

     Attributes
     ----------
@@ -324,7 +320,7 @@ class MultinomialNB(BaseDiscreteNB):
     >>> from sklearn.naive_bayes import MultinomialNB
     >>> clf = MultinomialNB()
     >>> clf.fit(X, Y)
-    MultinomialNB(alpha=1.0, class_weight=None, fit_prior=True)
+    MultinomialNB(alpha=1.0, class_weight='auto', fit_prior=None)
    >>> print(clf.predict(X[2]))
    [3]

@@ -335,7 +331,7 @@ class MultinomialNB(BaseDiscreteNB):
     Tackling the poor assumptions of naive Bayes text classifiers, ICML.
""" - def __init__(self, alpha=1.0, fit_prior=True, class_weight=None): + def __init__(self, alpha=1.0, fit_prior=None, class_weight='auto'): self.alpha = alpha self.fit_prior = fit_prior self.class_weight = class_weight @@ -373,13 +369,12 @@ class BernoulliNB(BaseDiscreteNB): Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors. - fit_prior : boolean - Whether to learn class prior probabilities or not. - If false, a uniform prior will be used. - - class_weight : array-like, size=[n_classes,] - Prior probabilities of the classes. If specified the priors are not - adjusted according to the data. + class_weight : 'auto', None or array-like. default='auto' + Prior probabilities of the classes. 'auto' means class priors + are estimated from the data, None means uniform priors + over all classes. + If an array is given, it must have shape [n_classes,] and + explicitly specifies the class priors. Attributes ---------- @@ -397,7 +392,7 @@ class BernoulliNB(BaseDiscreteNB): >>> from sklearn.naive_bayes import BernoulliNB >>> clf = BernoulliNB() >>> clf.fit(X, Y) - BernoulliNB(alpha=1.0, binarize=0.0, class_weight=None, fit_prior=True) + BernoulliNB(alpha=1.0, binarize=0.0, class_weight='auto', fit_prior=None) >>> print(clf.predict(X[2])) [3] @@ -415,8 +410,8 @@ class BernoulliNB(BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ - def __init__(self, alpha=1.0, binarize=.0, fit_prior=True, - class_weight=None): + def __init__(self, alpha=1.0, binarize=.0, fit_prior=None, + class_weight='auto'): self.alpha = alpha self.binarize = binarize self.fit_prior = fit_prior diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index eb8f620e44996..30cd05dac7215 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -721,8 +721,7 @@ def test_class_weight_auto_classifies(): continue if name.endswith("NB"): - # NaiveBayes classifiers have a somewhat differnt interface. - # FIXME SOON! 
+            # naive Bayes classifiers don't work on this kind of data :(
             continue

         with warnings.catch_warnings(record=True):
diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py
index b999ea8f4decc..67569c759e0f7 100644
--- a/sklearn/tests/test_naive_bayes.py
+++ b/sklearn/tests/test_naive_bayes.py
@@ -138,11 +138,10 @@ def test_discretenb_predict_proba():

 def test_discretenb_uniform_prior():
     """Test whether discrete NB classes fit a uniform prior
-    when fit_prior=False and class_prior=None"""
+    when class_weight=None"""

     for cls in [BernoulliNB, MultinomialNB]:
-        clf = cls()
-        clf.set_params(fit_prior=False)
+        clf = cls(class_weight=None)
         clf.fit([[0], [0], [1]], [0, 0, 1])
         prior = np.exp(clf.class_log_prior_)
         assert_array_equal(prior, np.array([.5, .5]))
@@ -152,8 +151,8 @@ def test_discretenb_provide_prior():
     """Test whether discrete NB classes use provided prior"""

     for cls in [BernoulliNB, MultinomialNB]:
-        clf = cls()
-        clf.fit([[0], [0], [1]], [0, 0, 1], class_prior=[0.5, 0.5])
+        clf = cls(class_weight=[.5, .5])
+        clf.fit([[0], [0], [1]], [0, 0, 1])
         prior = np.exp(clf.class_log_prior_)
         assert_array_equal(prior, np.array([.5, .5]))

diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py
index 6eaaadf530cd0..95038c6a4550b 100644
--- a/sklearn/utils/class_weight.py
+++ b/sklearn/utils/class_weight.py
@@ -37,17 +37,20 @@ def compute_class_weight(class_weight, classes, y):
         weight = np.array([1.0 / np.sum(y == i) for i in classes],
                           dtype=np.float64, order='C')
         weight *= classes.shape[0] / np.sum(weight)
-    else:
+    elif isinstance(class_weight, dict):
         # user-defined dictionary
         weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
-        if not isinstance(class_weight, dict):
-            raise ValueError("class_weight must be dict, 'auto', or None,"
-                             " got: %r" % class_weight)
         for c in class_weight:
             i = np.searchsorted(classes, c)
             if classes[i] != c:
                 raise ValueError("Class label %d not present." % c)
             else:
                 weight[i] = class_weight[c]
-
+    else:
+        # user-specified array or list of per-class priors; copy as float64
+        # so later in-place normalization cannot truncate or mutate input
+        weight = np.array(class_weight, dtype=np.float64)
+        if len(weight) != len(classes):
+            raise ValueError("The number of entries in class_weight is %d, "
+                             "which does not match the number of classes %d."
+                             % (len(weight), len(classes)))
     return weight
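Usage sketch (illustrative only, not part of the patch): the snippet below exercises the three `class_weight` modes this diff wires into the discrete naive Bayes estimators. It assumes a scikit-learn checkout with the patch applied (the 0.14-era API, where `fit_prior` is merely deprecated); the toy `X`/`y` data are invented for the example.

```python
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils import compute_class_weight

# Hypothetical toy data: three samples of class 0, one of class 1.
X = np.array([[2, 1], [3, 0], [1, 1], [0, 4]])
y = np.array([0, 0, 0, 1])

# class_weight=None: uniform priors over all classes.
clf = MultinomialNB(class_weight=None).fit(X, y)
print(np.exp(clf.class_log_prior_))    # [ 0.5  0.5]

# An explicit array fixes the priors directly (one entry per class).
clf = MultinomialNB(class_weight=[0.8, 0.2]).fit(X, y)
print(np.exp(clf.class_log_prior_))    # [ 0.8  0.2]

# class_weight='auto' (the new default) delegates to compute_class_weight,
# which weights classes by *inverse* frequency, so after the normalization
# in fit() the rarer class receives the larger prior.
clf = MultinomialNB(class_weight='auto').fit(X, y)
print(np.exp(clf.class_log_prior_))    # [ 0.25  0.75] for this y

# The helper itself, before fit() renormalizes to a distribution
# ('auto' output is scaled to sum to n_classes):
print(compute_class_weight('auto', np.array([0, 1]), y))   # [ 0.5  1.5]
print(compute_class_weight(None, np.array([0, 1]), y))     # [ 1.  1.]
```

One design consequence worth flagging for review: as `compute_class_weight` stands, 'auto' yields inverse-frequency priors rather than the empirical class frequencies that the old `fit_prior=True` path computed, so "estimated from the data" in the new docstrings means something different from the previous default behavior.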