8000 Use more natural class_weight="auto" heuristic · scikit-learn/scikit-learn@3d7ad59 · GitHub
[go: up one dir, main page]

Skip to content

Commit 3d7ad59

Browse files
committed
Use more natural class_weight="auto" heuristic
1 parent 8dbe3f8 commit 3d7ad59

File tree

3 files changed

+30
-6
lines changed

3 files changed

+30
-6
lines changed

sklearn/utils/class_weight.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,9 @@ def compute_class_weight(class_weight, classes, y):
4747
raise ValueError("classes should have valid labels that are in y")
4848

4949
# inversely proportional to the number of samples in the class
50-
recip_freq = 1. / bincount(y_ind)
51-
weight = recip_freq[le.transform(classes)] / np.mean(recip_freq)
50+
recip_freq = len(y) / (len(le.classes_) *
51+
bincount(y_ind).astype(np.float64))
52+
weight = recip_freq[le.transform(classes)]
5253
else:
5354
# user-defined dictionary
5455
weight = np.ones(classes.shape[0], dtype=np.float64, order='C')

sklearn/utils/estimator_checks.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -905,10 +905,9 @@ def check_class_weight_auto_linear_classifier(name, Classifier):
905905
coef_auto = classifier.fit(X, y).coef_.copy()
906906

907907
# Count each label occurrence to reweight manually
908-
mean_weight = (1. / 3 + 1. / 2) / 2
909908
class_weight = {
910-
1: 1. / 3 / mean_weight,
911-
-1: 1. / 2 / mean_weight,
909+
1: 5. / (2 * 3),
910+
-1: 5. / (2 * 2)
912911
}
913912
classifier.set_params(class_weight=class_weight)
914913
coef_manual = classifier.fit(X, y).coef_.copy()

sklearn/utils/tests/test_class_weight.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
import numpy as np
22

3+
from sklearn.linear_model import LogisticRegression
4+
from sklearn.datasets import make_blobs
5+
36
from sklearn.utils.class_weight import compute_class_weight
47
from sklearn.utils.class_weight import compute_sample_weight
58

@@ -26,6 +29,27 @@ def test_compute_class_weight_not_present():
2629
assert_raises(ValueError, compute_class_weight, "auto", classes, y)
2730

2831

32+
def test_compute_class_weight_invariance():
33+
# test that results with class_weight="auto" are invariant against
34+
# class imbalance if the number of samples is identical
35+
X, y = make_blobs(centers=2, random_state=0)
36+
# create dataset where class 1 is duplicated twice
37+
X_1 = np.vstack([X] + [X[y == 1]] * 2)
38+
y_1 = np.hstack([y] + [y[y == 1]] * 2)
39+
# create dataset where class 0 is duplicated twice
40+
X_0 = np.vstack([X] + [X[y == 0]] * 2)
41+
y_0 = np.hstack([y] + [y[y == 0]] * 2)
42+
# duplicate everything
43+
X_ = np.vstack([X] * 2)
44+
y_ = np.hstack([y] * 2)
45+
# results should be identical
46+
logreg1 = LogisticRegression(class_weight="auto").fit(X_1, y_1)
47+
logreg0 = LogisticRegression(class_weight="auto").fit(X_0, y_0)
48+
logreg = LogisticRegression(class_weight="auto").fit(X_, y_)
49+
assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
50+
assert_array_almost_equal(logreg.coef_, logreg0.coef_)
51+
52+
2953
def test_compute_class_weight_auto_negative():
3054
"""Test compute_class_weight when labels are negative"""
3155
# Test with balanced class labels.
@@ -116,7 +140,7 @@ def test_compute_sample_weight_with_subsample():
116140
# Test with a bootstrap subsample
117141
y = np.asarray([1, 1, 1, 2, 2, 2])
118142
sample_weight = compute_sample_weight("auto", y, [0, 1, 1, 2, 2, 3])
119-
expected = np.asarray([1/3., 1/3., 1/3., 5/3., 5/3., 5/3.])
143+
expected = np.asarray([1 / 3., 1 / 3., 1 / 3., 5 / 3., 5 / 3., 5 / 3.])
120144
assert_array_almost_equal(sample_weight, expected)
121145

122146
# Test with a bootstrap subsample for multi-output

0 commit comments

Comments
 (0)
0