Common check for sample weight invariance with removed samples by rth · Pull Request #17176 · scikit-learn/scikit-learn · GitHub

Common check for sample weight invariance with removed samples #17176


Merged · 10 commits · Jun 3, 2020
8 changes: 8 additions & 0 deletions sklearn/calibration.py
@@ -284,6 +284,14 @@ class that has the highest probability, and can thus be different
check_is_fitted(self)
return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


class _CalibratedClassifier:
"""Probability calibration with isotonic regression or sigmoid.
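The same `_more_tags` override recurs in every file touched below: estimators that cannot treat a zero `sample_weight` as sample removal declare the new check under the `_xfail_checks` estimator tag, and the common test runner then marks that check as a known failure rather than a hard error. A minimal sketch of the pattern on a hypothetical third-party estimator (the class itself is illustrative, not part of this diff):

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class WeightedMeanRegressor(RegressorMixin, BaseEstimator):
    """Toy estimator: predicts the weighted mean of y."""

    def fit(self, X, y, sample_weight=None):
        if sample_weight is None:
            sample_weight = np.ones(len(y))
        self.mean_ = np.average(y, weights=sample_weight)
        return self

    def predict(self, X):
        return np.full(len(X), self.mean_)

    def _more_tags(self):
        # Keys name the check (including the kind=... suffix); values give
        # the reason reported when the check is marked an expected failure.
        return {
            '_xfail_checks': {
                'check_sample_weights_invariance(kind=zeros)':
                'zero sample_weight is not equivalent to removing samples',
            }
        }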
16 changes: 16 additions & 0 deletions sklearn/cluster/_kmeans.py
@@ -1215,6 +1215,14 @@ def score(self, X, y=None, sample_weight=None):
return -_labels_inertia(X, sample_weight, x_squared_norms,
self.cluster_centers_)[1]

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
old_center_buffer, compute_squared_diff,
@@ -1871,3 +1879,11 @@ def predict(self, X, sample_weight=None):

X = self._check_test_data(X)
return self._labels_inertia_minibatch(X, sample_weight)[0]

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}
8 changes: 8 additions & 0 deletions sklearn/ensemble/_iforest.py
@@ -447,6 +447,14 @@ def _compute_score_samples(self, X, subsample_features):
)
return scores

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


def _average_path_length(n_samples_leaf):
"""
8 changes: 8 additions & 0 deletions sklearn/linear_model/_logistic.py
@@ -2084,3 +2084,11 @@ def score(self, X, y, sample_weight=None):
scoring = get_scorer(scoring)

return scoring(self, X, y, sample_weight=sample_weight)

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}
8 changes: 8 additions & 0 deletions sklearn/linear_model/_ransac.py
@@ -502,3 +502,11 @@ def score(self, X, y):
check_is_fitted(self)

return self.estimator_.score(X, y)

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}
8 changes: 8 additions & 0 deletions sklearn/linear_model/_ridge.py
@@ -1909,3 +1909,11 @@ def fit(self, X, y, sample_weight=None):
@property
def classes_(self):
return self._label_binarizer.classes_

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}
16 changes: 16 additions & 0 deletions sklearn/linear_model/_stochastic_gradient.py
@@ -1095,6 +1095,14 @@ def predict_log_proba(self):
def _predict_log_proba(self, X):
return np.log(self.predict_proba(X))

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


class BaseSGDRegressor(RegressorMixin, BaseSGD):

@@ -1576,3 +1584,11 @@ def __init__(self, loss="squared_loss", *, penalty="l2", alpha=0.0001,
validation_fraction=validation_fraction,
n_iter_no_change=n_iter_no_change, warm_start=warm_start,
average=average)

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}
8 changes: 8 additions & 0 deletions sklearn/neighbors/_kde.py
@@ -274,3 +274,11 @@ def sample(self, n_samples=1, random_state=None):
correction = (gammainc(0.5 * dim, 0.5 * s_sq) ** (1. / dim)
* self.bandwidth / np.sqrt(s_sq))
return data[i] + X * correction[:, np.newaxis]

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'sample_weight must have positive values',
}
}
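`KernelDensity` is the one estimator here with a different reason string: its `fit` rejects non-positive weights outright, so the `kind=zeros` variant cannot even run. A quick illustration (assuming this scikit-learn version's validation message):

import numpy as np
from sklearn.neighbors import KernelDensity

X = np.random.RandomState(0).randn(10, 1)
weights = np.ones(10)
KernelDensity().fit(X, sample_weight=weights)  # fine: all weights positive

weights[-1] = 0.0
try:
    KernelDensity().fit(X, sample_weight=weights)
except ValueError as exc:
    print(exc)  # sample_weight must have positive values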
52 changes: 51 additions & 1 deletion sklearn/svm/_classes.py
@@ -244,6 +244,14 @@ def fit(self, X, y, sample_weight=None):

return self

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


class LinearSVR(RegressorMixin, LinearModel):
"""Linear Support Vector Regression.
@@ -424,6 +432,14 @@ def fit(self, X, y, sample_weight=None):

return self

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


class SVC(BaseSVC):
"""C-Support Vector Classification.
@@ -650,6 +666,14 @@ def __init__(self, *, C=1.0, kernel='rbf', degree=3, gamma='scale',
break_ties=break_ties,
random_state=random_state)

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


class NuSVC(BaseSVC):
"""Nu-Support Vector Classification.
@@ -866,7 +890,9 @@ def _more_tags(self):
'_xfail_checks': {
'check_methods_subset_invariance':
'fails for the decision_function method',
-            'check_class_weight_classifiers': 'class_weight is ignored.'
+            'check_class_weight_classifiers': 'class_weight is ignored.',
+            'check_sample_weights_invariance(kind=zeros)':
+            'zero sample_weight is not equivalent to removing samples',
}
}

@@ -1027,6 +1053,14 @@ def probA_(self):
def probB_(self):
return self._probB

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


class NuSVR(RegressorMixin, BaseLibSVM):
"""Nu Support Vector Regression.
@@ -1157,6 +1191,14 @@ def __init__(self, *, nu=0.5, C=1.0, kernel='rbf', degree=3,
probability=False, cache_size=cache_size, class_weight=None,
verbose=verbose, max_iter=max_iter, random_state=None)

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}


class OneClassSVM(OutlierMixin, BaseLibSVM):
"""Unsupervised Outlier Detection.
@@ -1371,3 +1413,11 @@ def probA_(self):
@property
def probB_(self):
return self._probB

def _more_tags(self):
return {
'_xfail_checks': {
'check_sample_weights_invariance(kind=zeros)':
'zero sample_weight is not equivalent to removing samples',
}
}
89 changes: 54 additions & 35 deletions sklearn/utils/estimator_checks.py
@@ -67,7 +67,12 @@ def _yield_checks(estimator):
yield check_sample_weights_not_an_array
yield check_sample_weights_list
yield check_sample_weights_shape
-    yield check_sample_weights_invariance
+    if (has_fit_parameter(estimator, "sample_weight")
+            and not (hasattr(estimator, "_pairwise")
+                     and estimator._pairwise)):
+        # We skip pairwise because the data is not pairwise
+        yield partial(check_sample_weights_invariance, kind='ones')
+        yield partial(check_sample_weights_invariance, kind='zeros')
yield check_estimators_fit_returns_self
yield partial(check_estimators_fit_returns_self, readonly_memmap=True)
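Two details of this hunk are worth spelling out. First, the pairwise guard now lives in `_yield_checks`: for estimators with `_pairwise = True`, X is a precomputed square kernel or distance matrix, so the row-wise weighting comparison does not apply. Second, the check is yielded twice through `functools.partial`, and the common-test machinery derives each check's id from the partial's keyword arguments, which is why the `_xfail_checks` keys throughout this PR read `check_sample_weights_invariance(kind=zeros)`. A rough sketch of that id construction (a hypothetical reimplementation, not the code scikit-learn ships):

from functools import partial

def check_id(check):
    """Build an id such as 'check_sample_weights_invariance(kind=zeros)'."""
    if isinstance(check, partial):
        kwargs = ",".join(f"{k}={v}" for k, v in check.keywords.items())
        return f"{check.func.__name__}({kwargs})"
    return check.__name__

def check_sample_weights_invariance(name, estimator_orig, kind="ones"):
    ...  # full body in the diff below

print(check_id(partial(check_sample_weights_invariance, kind="zeros")))
# check_sample_weights_invariance(kind=zeros)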

@@ -836,41 +841,55 @@ def check_sample_weights_shape(name, estimator_orig):


@ignore_warnings(category=FutureWarning)
-def check_sample_weights_invariance(name, estimator_orig):
-    # check that the estimators yield same results for
-    # unit weights and no weights
-    if (has_fit_parameter(estimator_orig, "sample_weight") and
-            not (hasattr(estimator_orig, "_pairwise")
-                 and estimator_orig._pairwise)):
-        # We skip pairwise because the data is not pairwise
-
-        estimator1 = clone(estimator_orig)
-        estimator2 = clone(estimator_orig)
-        set_random_state(estimator1, random_state=0)
-        set_random_state(estimator2, random_state=0)
-
-        X = np.array([[1, 3], [1, 3], [1, 3], [1, 3],
-                      [2, 1], [2, 1], [2, 1], [2, 1],
-                      [3, 3], [3, 3], [3, 3], [3, 3],
-                      [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float'))
-        y = np.array([1, 1, 1, 1, 2, 2, 2, 2,
-                      1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int'))
-        y = _enforce_estimator_tags_y(estimator1, y)
-
-        estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y)))
-        estimator2.fit(X, y=y, sample_weight=None)
-
-        for method in ["predict", "transform"]:
-            if hasattr(estimator_orig, method):
-                X_pred1 = getattr(estimator1, method)(X)
-                X_pred2 = getattr(estimator2, method)(X)
-                if sparse.issparse(X_pred1):
-                    X_pred1 = X_pred1.toarray()
-                    X_pred2 = X_pred2.toarray()
-                assert_allclose(X_pred1, X_pred2,
-                                err_msg="For %s sample_weight=None is not"
-                                        " equivalent to sample_weight=ones"
-                                        % name)
+def check_sample_weights_invariance(name, estimator_orig, kind="ones"):
+    # For kind="ones" check that the estimators yield same results for
+    # unit weights and no weights
+    # For kind="zeros" check that setting sample_weight to 0 is equivalent
+    # to removing corresponding samples.
+    estimator1 = clone(estimator_orig)
+    estimator2 = clone(estimator_orig)
+    set_random_state(estimator1, random_state=0)
+    set_random_state(estimator2, random_state=0)
+
+    X1 = np.array([[1, 3], [1, 3], [1, 3], [1, 3],
+                   [2, 1], [2, 1], [2, 1], [2, 1],
+                   [3, 3], [3, 3], [3, 3], [3, 3],
+                   [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.float64)
+    y1 = np.array([1, 1, 1, 1, 2, 2, 2, 2,
+                   1, 1, 1, 1, 2, 2, 2, 2], dtype=np.int)
+
+    if kind == 'ones':
+        X2 = X1
+        y2 = y1
+        sw2 = np.ones(shape=len(y1))
+        err_msg = (f"For {name} sample_weight=None is not equivalent to "
+                   f"sample_weight=ones")
+    elif kind == 'zeros':
+        # Construct a dataset that is very different to (X, y) if weights
+        # are disregarded, but identical to (X, y) given weights.
+        X2 = np.vstack([X1, X1 + 1])
+        y2 = np.hstack([y1, 3 - y1])
+        sw2 = np.ones(shape=len(y1) * 2)
+        sw2[len(y1):] = 0
+        X2, y2, sw2 = shuffle(X2, y2, sw2, random_state=0)
+
+        err_msg = (f"For {name}, a zero sample_weight is not equivalent "
+                   f"to removing the sample")
+    else:  # pragma: no cover
+        raise ValueError
+
+    y1 = _enforce_estimator_tags_y(estimator1, y1)
+    y2 = _enforce_estimator_tags_y(estimator2, y2)
+
+    estimator1.fit(X1, y=y1, sample_weight=None)
+    estimator2.fit(X2, y=y2, sample_weight=sw2)
+
+    for method in ["predict", "predict_proba",
+                   "decision_function", "transform"]:
+        if hasattr(estimator_orig, method):
+            X_pred1 = getattr(estimator1, method)(X1)
+            X_pred2 = getattr(estimator2, method)(X1)
+            assert_allclose_dense_sparse(X_pred1, X_pred2, err_msg=err_msg)


@ignore_warnings(category=(FutureWarning, UserWarning))
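The `kind='zeros'` branch above is the substance of the PR: the dataset is doubled with shifted points (`X1 + 1`) carrying flipped labels (`3 - y1` swaps classes 1 and 2), the new half gets zero weight, and everything is shuffled. An estimator that honors `sample_weight` must therefore reproduce exactly the fit it would obtain from `(X1, y1)` alone. The same idea as a standalone sketch against `LogisticRegression` (illustrative data; the real test is the check above):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

rng = np.random.RandomState(0)
X1 = rng.randn(20, 2)
y1 = (X1[:, 0] > 0).astype(int)

# Corrupted duplicates: shifted features, flipped labels, zero weight.
X2 = np.vstack([X1, X1 + 1])
y2 = np.hstack([y1, 1 - y1])
sw2 = np.hstack([np.ones(len(y1)), np.zeros(len(y1))])
X2, y2, sw2 = shuffle(X2, y2, sw2, random_state=0)

est1 = LogisticRegression().fit(X1, y1)
est2 = LogisticRegression().fit(X2, y2, sample_weight=sw2)

# A weight-respecting estimator ignores the zero-weighted half entirely.
np.testing.assert_allclose(est1.predict_proba(X1),
                           est2.predict_proba(X1), rtol=1e-4)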