Fix bugs in robust module and improve test coverage (#78) · rth/scikit-learn-extra@5b45d64 · GitHub


Commit 5b45d64

TimotheeMathieu and rth authored
Fix bugs in robust module and improve test coverage (scikit-learn-contrib#78)
* Creation of one class for each task: classification, regression and clustering. Update docs, tests, examples. Fix typo
* Update plot clustering
* Update test_common
* Fix tests and cross_val
* Fix doc of function RobustWeightedEstimators
* Fix _RobustWeightedEstimator doc and RobustWeightedKMeans doc
* Black reformatted
* Add Huber loss
* Reformat with black
* Changes suggested by @rth in PR
* Add stop criterion and change for faster examples
* Subsample clustering
* Fix clustering example
* Cython kmeans loss
* Fix black
* Type conversion in Cython
* Fix setup.py on Windows
* Fix black
* Add losses and fix docstrings
* Black and clean up prints
* Fix mistake in code
* Fix diverse small problems. Better test coverage
* Black
* Black and flake8
* Fix base estimator fitted error
* Fix test_common
* Add test for weights_
* Add more tests
* Add assert almost equal
* Add test vs HuberRegressor
* Flake8
* Fix verbose
* Fix API doc and add n_iter_no_change as a parameter in robust
* Update sklearn_extra/robust/robust_weighted_estimator.py
* Fix black
* Fix import error
* Change test kmedoids
* Revert commit 6ecdc1b
* Update sklearn_extra/robust/mean_estimators.py

Co-authored-by: Roman Yurchak <rth.yurchak@gmail.com>
1 parent 44ac430 commit 5b45d64
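The headline change splits the single robust estimator into one public class per task (regression, classification, clustering). Below is a minimal usage sketch, not part of the commit: it assumes RobustWeightedRegressor and RobustWeightedKMeans are importable from sklearn_extra.robust and that the KMeans variant accepts the same weighting/eta0 arguments; the regressor parameters mirror the example diff further down, while the toy data and n_clusters value are purely illustrative.

import numpy as np
from sklearn_extra.robust import RobustWeightedRegressor, RobustWeightedKMeans

rng = np.random.RandomState(42)
X = rng.normal(size=(200, 2))
y = X @ np.array([1.0, 2.0]) + rng.normal(scale=0.1, size=200)

# Regression: parameters taken from the updated California-houses example below.
reg = RobustWeightedRegressor(
    weighting="huber",
    c=0.1,
    eta0=1e-2,
    sgd_args={"learning_rate": "invscaling"},
)
reg.fit(X, y)

# Clustering: n_clusters is an assumed illustrative value.
km = RobustWeightedKMeans(n_clusters=2, weighting="huber", eta0=1e-2)
km.fit(X)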

5 files changed: +599 -110 lines

examples/robust/plot_robust_regression_california_houses.py

Lines changed: 3 additions & 4 deletions
@@ -54,11 +54,10 @@ def quadratic_loss(est, X, y, X_test, y_test):
         "RobustWeightedRegressor",
         RobustWeightedRegressor(
             weighting="huber",
-            c=0.01,
+            c=0.1,
             eta0=1e-2,
             sgd_args={
-                "learning_rate": "adaptive",
-                "eta0": 1e-3,
+                "learning_rate": "invscaling",
             },
         ),
     ),
@@ -89,7 +88,7 @@ def quadratic_loss(est, X, y, X_test, y_test):
         res[i, f, 0] = np.mean(cv)
         res[i, f, 1] = np.median(cv)

-fig, (axe1, axe2) = plt.subplots(1, 2)
+fig, (axe1, axe2) = plt.subplots(2, 1)
 names = [name for name, est in estimators]

 axe1.boxplot(res[:, :, 0].T, labels=names)
sklearn_extra/robust/_robust_weighted_estimator_helper.pyx

Lines changed: 260 additions & 0 deletions
@@ -10,6 +10,14 @@ cimport numpy as np
 from sklearn.utils.extmath import row_norms
 from cython cimport floating

+import sys
+from time import time
+
+from libc.math cimport exp, log, sqrt, pow, fabs
+cimport numpy as np
+from numpy.math cimport INFINITY
+
+
 # Modified from sklearn.cluster._k_means_fast.pyx
 np.import_array()

@@ -73,3 +81,255 @@ cpdef np.ndarray[floating] _kmeans_loss(np.ndarray[floating, ndim=2, mode='c'] X
         j = labels[i]
         inertias[i] = _euclidean_dense_dense(&X[i, 0], &centers[j, 0], n_features)
     return inertias
+
+
+# Regression and Classification losses, from scikit-learn.
+
+# ----------------------------------------
+# Extension Types for Loss Functions
+# ----------------------------------------
+
+cdef class LossFunction:
+    """Base class for convex loss functions"""
+
+    cdef double loss(self, double p, double y) nogil:
+        """Evaluate the loss function.
+
+        Parameters
+        ----------
+        p : double
+            The prediction, p = w^T x
+        y : double
+            The true value (aka target)
+
+        Returns
+        -------
+        double
+            The loss evaluated at `p` and `y`.
+        """
+        return 0.
+
+    def py_dloss(self, double p, double y):
+        """Python version of `dloss` for testing.
+
+        Pytest needs a python function and can't use cdef functions.
+        """
+        return self.dloss(p, y)
+
+    def py_loss(self, double p, double y):
+        """Python version of `loss` for testing.
+
+        Pytest needs a python function and can't use cdef functions.
+        """
+        return self.loss(p, y)
+
+    cdef double dloss(self, double p, double y) nogil:
+        """Evaluate the derivative of the loss function with respect to
+        the prediction `p`.
+
+        Parameters
+        ----------
+        p : double
+            The prediction, p = w^T x
+        y : double
+            The true value (aka target)
+
+        Returns
+        -------
+        double
+            The derivative of the loss function with respect to `p`.
+        """
+        return 0.
+
+
+cdef class Regression(LossFunction):
+    """Base class for loss functions for regression"""
+
+    cdef double loss(self, double p, double y) nogil:
+        return 0.
+
+    cdef double dloss(self, double p, double y) nogil:
+        return 0.
+
+
+cdef class Classification(LossFunction):
+    """Base class for loss functions for classification"""
+
+    cdef double loss(self, double p, double y) nogil:
+        return 0.
+
+    cdef double dloss(self, double p, double y) nogil:
+        return 0.
+
+
+cdef class ModifiedHuber(Classification):
+    """Modified Huber loss for binary classification with y in {-1, 1}
+
+    This is equivalent to quadratically smoothed SVM with gamma = 2.
+
+    See T. Zhang 'Solving Large Scale Linear Prediction Problems Using
+    Stochastic Gradient Descent', ICML'04.
+    """
+    cdef double loss(self, double p, double y) nogil:
+        cdef double z = p * y
+        if z >= 1.0:
+            return 0.0
+        elif z >= -1.0:
+            return (1.0 - z) * (1.0 - z)
+        else:
+            return -4.0 * z
+
+    cdef double dloss(self, double p, double y) nogil:
+        cdef double z = p * y
+        if z >= 1.0:
+            return 0.0
+        elif z >= -1.0:
+            return 2.0 * (1.0 - z) * -y
+        else:
+            return -4.0 * y
+
+    def __reduce__(self):
+        return ModifiedHuber, ()
+
+
+cdef class Hinge(Classification):
+    """Hinge loss for binary classification tasks with y in {-1, 1}
+
+    Parameters
+    ----------
+    threshold : float > 0.0
+        Margin threshold. When threshold=1.0, one gets the loss used by SVM.
+        When threshold=0.0, one gets the loss used by the Perceptron.
+    """
+
+    cdef double threshold
+
+    def __init__(self, double threshold=1.0):
+        self.threshold = threshold
+
+    cdef double loss(self, double p, double y) nogil:
+        cdef double z = p * y
+        if z <= self.threshold:
+            return self.threshold - z
+        return 0.0
+
+    cdef double dloss(self, double p, double y) nogil:
+        cdef double z = p * y
+        if z <= self.threshold:
+            return -y
+        return 0.0
+
+    def __reduce__(self):
+        return Hinge, (self.threshold,)
+
+
+cdef class SquaredHinge(Classification):
+    """Squared Hinge loss for binary classification tasks with y in {-1, 1}
+
+    Parameters
+    ----------
+    threshold : float > 0.0
+        Margin threshold. When threshold=1.0, one gets the loss used by
+        (quadratically penalized) SVM.
+    """
+
+    cdef double threshold
+
+    def __init__(self, double threshold=1.0):
+        self.threshold = threshold
+
+    cdef double loss(self, double p, double y) nogil:
+        cdef double z = self.threshold - p * y
+        if z > 0:
+            return z * z
+        return 0.0
+
+    cdef double dloss(self, double p, double y) nogil:
+        cdef double z = self.threshold - p * y
+        if z > 0:
+            return -2 * y * z
+        return 0.0
+
+    def __reduce__(self):
+        return SquaredHinge, (self.threshold,)
+
+
+cdef class Log(Classification):
+    """Logistic regression loss for binary classification with y in {-1, 1}"""
+
+    cdef double loss(self, double p, double y) nogil:
+        cdef double z = p * y
+        # approximately equal and saves the computation of the log
+        if z > 18:
+            return exp(-z)
+        if z < -18:
+            return -z
+        return log(1.0 + exp(-z))
+
+    cdef double dloss(self, double p, double y) nogil:
+        cdef double z = p * y
+        # approximately equal and saves the computation of the log
+        if z > 18.0:
+            return exp(-z) * -y
+        if z < -18.0:
+            return -y
+        return -y / (exp(z) + 1.0)
+
+    def __reduce__(self):
+        return Log, ()
+
+
+cdef class SquaredLoss(Regression):
+    """Squared loss traditionally used in linear regression."""
+    cdef double loss(self, double p, double y) nogil:
+        return 0.5 * (p - y) * (p - y)
+
+    cdef double dloss(self, double p, double y) nogil:
+        return p - y
+
+    def __reduce__(self):
+        return SquaredLoss, ()
+
+
+cdef class Huber(Regression):
+    """Huber regression loss
+
+    Variant of the SquaredLoss that is robust to outliers (quadratic near zero,
+    linear for large errors).
+
+    https://en.wikipedia.org/wiki/Huber_Loss_Function
+    """
+
+    cdef double c
+
+    def __init__(self, double c):
+        self.c = c
+
+    cdef double loss(self, double p, double y) nogil:
+        cdef double r = p - y
+        cdef double abs_r = fabs(r)
+        if abs_r <= self.c:
+            return 0.5 * r * r
+        else:
+            return self.c * abs_r - (0.5 * self.c * self.c)
+
+    cdef double dloss(self, double p, double y) nogil:
+        cdef double r = p - y
+        cdef double abs_r = fabs(r)
+        if abs_r <= self.c:
+            return r
+        elif r > 0.0:
+            return self.c
+        else:
+            return -self.c
+
+    def __reduce__(self):
+        return Huber, (self.c,)
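As a quick sanity check of the piecewise definition above, here is a small NumPy sketch (not part of the commit; function names are illustrative) that reproduces Huber.loss and Huber.dloss and could be compared against the py_loss / py_dloss test helpers:

import numpy as np

def huber_loss(p, y, c):
    # Quadratic (0.5 * r**2) for |r| <= c, linear with slope c beyond that.
    r = np.asarray(p, dtype=float) - y
    abs_r = np.abs(r)
    return np.where(abs_r <= c, 0.5 * r ** 2, c * abs_r - 0.5 * c ** 2)

def huber_dloss(p, y, c):
    # Derivative w.r.t. the prediction p: r in the quadratic region, clipped to +/- c outside.
    r = np.asarray(p, dtype=float) - y
    return np.clip(r, -c, c)

print(huber_loss([0.1, 2.0], 0.0, c=0.5))   # [0.005 0.875]
print(huber_dloss([0.1, 2.0], 0.0, c=0.5))  # [0.1 0.5]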

sklearn_extra/robust/mean_estimators.py

Lines changed: 4 additions & 3 deletions
@@ -118,9 +118,10 @@ def huber(X, c=1.35, T=20):

     def psisx(x, c):
         # Huber weight function.
-        res = np.ones(len(x))
-        res[x != 0] = (2 * (x[x != 0] > 0) - 1) * c / x[x != 0]
-        res[np.abs(x) < c] = 1
+        res = np.zeros(len(x))
+        mask = np.abs(x) <= c
+        res[mask] = 1
+        res[~mask] = (c / np.abs(x))[~mask]
         res[~np.isfinite(x)] = 0
         return res
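The rewritten psisx now computes the standard Huber weights directly: weight 1 on the central region |x| <= c, c/|x| in the tails, and 0 for non-finite values. A standalone sketch of the same logic (the function name is illustrative, not from the commit; the division is restricted to the tail region):

import numpy as np

def huber_weights(x, c=1.35):
    x = np.asarray(x, dtype=float)
    res = np.zeros(len(x))
    mask = np.abs(x) <= c                 # central (quadratic) region gets weight 1
    res[mask] = 1
    res[~mask] = c / np.abs(x)[~mask]     # tails are down-weighted as c / |x|
    res[~np.isfinite(x)] = 0              # non-finite observations get zero weight
    return res

print(huber_weights(np.array([0.0, 1.0, 2.7, np.inf])))
# -> [1.   1.   0.5  0. ]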
