fix edge cases and tests · scikit-learn/scikit-learn@4859b81 · GitHub
[go: up one dir, main page]

Skip to content

Commit 4859b81

Browse files
committed
fix edge cases and tests
1 parent c8e840d commit 4859b81

File tree

2 files changed

+107
-73
lines changed

2 files changed

+107
-73
lines changed

sklearn/metrics/pairwise.py

Lines changed: 83 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
from ..utils.validation import _num_samples
2323
from ..utils.validation import check_non_negative
24+
from ..utils.validation import check_consistent_length
2425
from ..utils import check_array
2526
from ..utils import gen_even_slices
2627
from ..utils import gen_batches, get_chunk_n_rows
@@ -35,6 +36,7 @@
3536
from ..utils.fixes import _object_dtype_isnan
3637
from ..preprocessing import MinMaxScaler
3738

39+
3840
# Utility Functions
3941
def _return_float_dtype(X, Y):
4042
"""
@@ -831,7 +833,33 @@ def cosine_distances(X, Y=None):
831833
return S
832834

833835

834-
def gower_distances(X, Y=None, categorical_features=None, scale=True):
836+
def _split_categorical_numerical(X, categorical_features):
837+
# the following bit is done before check_pairwise_array to avoid converting
838+
# numerical data to object dtype. First we split the data into categorical
839+
# and numerical, then we do check_array
840+
841+
# TODO: this should be more like check_array(..., accept_pandas=True)
842+
if (X is not None and not hasattr(X, 'iloc')
843+
and not hasattr(X, '__array__')):
844+
X = check_array(X, dtype=np.object, force_all_finite=False)
845+
846+
if callable(categorical_features) and X is not None:
847+
cols = categorical_features(X)
848+
else:
849+
cols = categorical_features
850+
if cols is None:
851+
cols = []
852+
853+
if X is not None:
854+
X_cat = _safe_indexing(X, cols, axis=1)
855+
X_num = _safe_indexing(X, cols, axis=1, inverse=True)
856+
else:
857+
X_cat = X_num = None
858+
return X_cat, X_num
859+
860+
861+
def gower_distances(X, Y=None, categorical_features=None, scale=True,
862+
min_values=None, scale_factor=None):
835863
"""Compute the distances between the observations in X and Y,
836864
that may contain mixed types of data, using an implementation
837865
of Gower formula.
@@ -857,7 +885,18 @@ def gower_distances(X, Y=None, categorical_features=None, scale=True):
857885
scale : bool, default=True
858886
Indicates if the numerical columns will be scaled between 0 and 1.
859887
If false, it is assumed the numerical columns are already scaled.
860-
The scaling factors, _i.e._ min and max, are taken from ``X``.
888+
The scaling factors, _i.e._ min and max, are taken from both ``X`` and
889+
``Y``.
890+
891+
min_values : ndarray of shape (n_features,), default=None
892+
Per feature adjustment for minimum. Equivalent to
893+
``min_values - X.min(axis=0) * scale_factor``
894+
If provided, ``scale_factor`` should be provided as well.
895+
896+
scale_factor : ndarray of shape (n_features,), default=None
897+
Per feature relative scaling of the data. Equivalent to
898+
``(max_values - min_values) / (X.max(axis=0) - X.min(axis=0))``
899+
If provided, ``min_values`` should be provided as well.
861900
862901
Returns
863902
-------
@@ -870,8 +909,6 @@ def gower_distances(X, Y=None, categorical_features=None, scale=True):
870909
871910
Notes
872911
-----
873-
The numeric feature ranges are determined from both X and Y.
874-
875912
Categorical ordinal attributes should be treated as numeric for the purpose
876913
of Gower similarity.
877914
@@ -897,7 +934,7 @@ def _n_cols(X):
897934
return np.asarray(X).shape[1]
898935

899936
def _nanmanhatan(x, y):
900-
return np.abs(np.nansum(x - y))
937+
return np.nansum(np.abs(x - y))
901938

902939
def _non_nans(x, y):
903940
return np.sum(~_object_dtype_isnan(x) & ~_object_dtype_isnan(y))
@@ -909,40 +946,34 @@ def _nanhamming(x, y):
909946
if issparse(X) or issparse(Y):
910947
raise TypeError("Gower distance does not support sparse matrices")
911948

912-
# TODO: this should be more like check_array(..., accept_pandas=True)
913-
if (X is not None and not hasattr(X, 'iloc')
914-
and not hasattr(X, '__array__')):
915-
X = check_array(X, dtype=np.object, force_all_finite=False)
916-
if (Y is not None and not hasattr(Y, 'iloc')
917-
and not hasattr(Y, '__array__')):
918-
Y = check_array(Y, dtype=np.object, force_all_finite=False)
919-
920949
if X is None or len(X) == 0:
921950
raise ValueError("X can not be None or empty")
922951

923-
if callable(categorical_features):
924-
cols = categorical_features(X)
925-
else:
926-
cols = categorical_features
927-
if cols is None:
928-
cols = []
952+
if scale:
953+
if (scale_factor is None) != (min_values is None):
954+
raise ValueError("min_value and scale_factor should be provided "
955+
"together.")
956+
X_cat, X_num = _split_categorical_numerical(X, categorical_features)
957+
Y_cat, Y_num = _split_categorical_numerical(Y, categorical_features)
929958

930-
X_cat = _safe_indexing(X, cols, axis=1)
931-
X_num = _safe_indexing(X, cols, axis=1, inverse=True)
932-
#print(X_cat)
933-
#print(X_num)
934-
if Y is not None:
935-
Y_cat = _safe_indexing(Y, cols, axis=1)
936-
Y_num = _safe_indexing(Y, cols, axis=1, inverse=True)
937-
else:
938-
Y_cat = Y_num = None
959+
if min_values is not None:
960+
min_values = np.asarray(min_values)
961+
scale_factor = np.asarray(scale_factor)
962+
check_consistent_length(min_values, scale_factor,
963+
np.ndarray(shape=(_n_cols(X_num), 0)))
939964

940965
if _n_cols(X_num):
941966
X_num, Y_num = check_pairwise_arrays(X_num, Y_num, precomputed=False,
942967
dtype=float,
943968
force_all_finite=False)
944969
if scale:
945-
trs = MinMaxScaler().fit(X_num)
970+
scale_data = X_num if Y_num is X_num else np.vstack((X_num, Y_num))
971+
if scale_factor is None:
972+
trs = MinMaxScaler().fit(scale_data)
973+
else:
974+
trs = MinMaxScaler()
975+
trs.scale_ = scale_factor
976+
trs.min_ = min_values
946977
X_num = trs.transform(X_num)
947978
Y_num = trs.transform(Y_num)
948979

@@ -960,11 +991,9 @@ def _nanhamming(x, y):
960991
else:
961992
nan_hamming = valid_cat = None
962993

963-
#print(nan_manhatan)
964-
#print(valid_num)
965-
#print(nan_hamming)
966-
#print(valid_cat)
967-
994+
# based on whether there are categorical and/or numerical data present,
995+
# we compute the distance metric
996+
# Division by zero and nans warnings are ignored since they are expected
968997
with np.errstate(divide='ignore', invalid='ignore'):
969998
if valid_num is not None and valid_cat is not None:
970999
D = (nan_manhatan + nan_hamming) / (valid_num + valid_cat)
@@ -1578,18 +1607,27 @@ def _precompute_metric_params(X, Y, metric=None, **kwds):
15781607
"""Precompute data-derived metric parameters if not provided
15791608
"""
15801609
if metric == 'gower':
1581-
categorical_features = None
1582-
if 'categorical_features' in kwds:
1583-
categorical_features = kwds['categorical_features']
1610+
categorical_features = kwds.get('categorical_features', None)
1611+
1612+
_, X_num = _split_categorical_numerical(X, categorical_features)
1613+
_, Y_num = _split_categorical_numerical(Y, categorical_features)
1614+
1615+
scale = kwds.get('scale', True)
1616+
if not scale:
1617+
return {'min_values': None, 'scale_factor': None, 'scale': False}
15841618

1585-
num_mask = ~ _detect_categorical_features(X, categorical_features)
1619+
scale_factor = kwds.get('scale_factor', None)
1620+
min_values = kwds.get('min_values', None)
1621+
if min_values is None:
1622+
data = X_num if Y is X or Y is None else np.vstack((X_num, Y_num))
1623+
trs = MinMaxScaler().fit(data)
1624+
min_values = trs.min_
1625+
scale_factor = trs.scale_
15861626

1587-
scale = None
1588-
if 'scale' in kwds:
1589-
scale = kwds['scale']
1590-
scale, _, _ = _precompute_gower_params(X, Y, scale, num_mask)
1627+
return {'min_values': min_values,
1628+
'scale_factor': scale_factor,
1629+
'scale': True}
15911630

1592-
return {'scale': scale}
15931631
if metric == "seuclidean" and 'V' not in kwds:
15941632
if X is Y:
15951633
V = np.var(X, axis=0, ddof=1)
@@ -1883,11 +1921,13 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
18831921
return X
18841922
elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
18851923
if metric == 'gower':
1924+
"""
18861925
# These conversions are necessary for matrices with string values
18871926
if not isinstance(X, np.ndarray):
18881927
X = np.asarray(X, dtype=np.object)
18891928
if Y is not None and not isinstance(Y, np.ndarray):
18901929
Y = np.asarray(Y, dtype=np.object)
1930+
"""
18911931
params = _precompute_metric_params(X, Y, metric=metric, **kwds)
18921932
kwds.update(**params)
18931933

sklearn/metrics/tests/test_pairwise.py

Lines changed: 24 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,15 +1132,13 @@ def test_gower_distances():
11321132
assert_array_almost_equal(D * 2, manhattan_distances(X))
11331133

11341134
# Test to obtain a non-squared distance matrix
1135-
X = np.array([['Syria', 1.0, 0.0, 0.0, True],
1136-
['Ireland', 0.181818, 0.0, 1, False],
1137-
['United Kingdom', 0.0, 0.0, 0.160377, False]],
1138-
dtype=object)
1135+
X = [['Syria', 1.0, 0.0, 0.0, True],
1136+
['Ireland', 0.181818, 0.0, 1, False],
1137+
['United Kingdom', 0.0, 0.0, 0.160377, False]]
11391138

1140-
Y = np.array([['United Kingdom', 0.090909, 0.0, 0.500109, True]],
1141-
dtype=object)
1139+
Y = [['United Kingdom', 0.090909, 0.0, 0.500109, True]]
11421140

1143-
D = gower_distances(X, Y, categorical_features=[0, 3], scale=True)
1141+
D = gower_distances(X, Y, categorical_features=[0, 4], scale=False)
11441142

11451143
# Simplified calculation of Gower distance for expected values
11461144
D_expected = np.zeros((3, 1))
@@ -1204,12 +1202,7 @@ def test_gower_distances():
12041202
D = gower_distances(X)
12051203
assert_array_almost_equal(D_expected, D)
12061204

1207-
# Test warnings for unexpected non-normalized data
1208-
X = [[1, 20], [0, -10.0]]
1209-
with pytest.raises(ValueError):
1210-
gower_distances(X, scale=False)
1211-
1212-
# Test X and Y with diferent ranges of numeric values
1205+
# Test X and Y with different ranges of numeric values
12131206
X = [[9222.22, -11],
12141207
[41934.0, -44],
12151208
[1, 1]]
@@ -1218,7 +1211,7 @@ def test_gower_distances():
12181211
[1934.0, 4],
12191212
[3000, 3000]]
12201213

1221-
D = gower_distances(X, Y)
1214+
D = gower_distances(X, Y, scale=True)
12221215

12231216
# The expected normalized values above are:
12241217
Xn = [[0.22403432, 0.010841],
@@ -1244,18 +1237,11 @@ def test_gower_distances():
12441237
D = gower_distances(X, Y)
12451238
assert_array_almost_equal(D_expected, D)
12461239

1247-
# Test gower robustness after slice the data, with its original ranges
1248-
D = gower_distances(X, Y[1:2], scale=[42156.22, 3044.0])
1249-
assert_array_almost_equal(D_expected[:, 1:2], D)
1250-
12511240
# an assertion error is expected here, because there is no scale
12521241
D = gower_distances(X, Y[1:2])
12531242
with pytest.raises(AssertionError):
12541243
assert_array_almost_equal(D_expected[:, 1:2], D)
12551244

1256-
D = gower_distances(X, Y[0:1], scale=[42156.22, 3044.0])
1257-
assert_array_almost_equal(D_expected[:, 0:1], D)
1258-
12591245
# an assertion error is expected here, because there is no scale
12601246
D = gower_distances(X, Y[0:1])
12611247
with pytest.raises(AssertionError):
@@ -1265,7 +1251,7 @@ def test_gower_distances():
12651251
D = pairwise_distances(X, Y, metric='gower', n_jobs=2)
12661252
assert_array_almost_equal(D_expected, D)
12671253

1268-
# Test X and Y with diferent ranges of numeric values, categorical values,
1254+
# Test X and Y with different ranges of numeric values, categorical values,
12691255
# and using pairwise_distances
12701256
X = [[9222.22, -11, 'M', 1],
12711257
[41934.0, -44, 'F', 1],
@@ -1295,11 +1281,17 @@ def test_gower_distances():
12951281
# by observation, attribute by attribute.
12961282
D_expected[i][j] = ((abs(Xn[i][0] - Yn[j][0]) +
12971283
abs(Xn[i][1] - Yn[j][1]) +
1298-
[1, 0][Xn[i][2] == Yn[j][2]] +
1284+
([1, 0][Xn[i][2] == Yn[j][2]]
1285+
if (Xn[i][2] == Xn[i][2] and
1286+
Yn[i][2] == Yn[i][2]) else 0) +
12991287
abs(Xn[i][3] - Yn[j][3])) /
13001288
non_missing_cols[i])
13011289

1302-
D = pairwise_distances(X, Y, metric='gower', n_jobs=2)
1290+
# pairwise_distances will convert the input to strings and np.nan would
1291+
# therefore be 'nan'. Passing DataFrames will avoid that.
1292+
D = pairwise_distances(pd.DataFrame(X), pd.DataFrame(Y), metric='gower',
1293+
n_jobs=2,
1294+
categorical_features=[2])
13031295
assert_array_almost_equal(D_expected, D)
13041296

13051297
# Test categorical_values passed in kwargs
@@ -1311,12 +1303,16 @@ def test_gower_distances():
13111303
# by observation, attribute by attribute.
13121304
D_expected[i][j] = ((abs(Xn[i][0] - Yn[j][0]) +
13131305
abs(Xn[i][1] - Yn[j][1]) +
1314-
[1, 0][Xn[i][2] == Yn[j][2]] +
1306+
([1, 0][Xn[i][2] == Yn[j][2]]
1307+
if (Xn[i][2] == Xn[i][2] and
1308+
Yn[i][2] == Yn[i][2]) else 0) +
13151309
[1, 0][Xn[i][3] == Yn[j][3]]) /
13161310
non_missing_cols[i])
13171311

1318-
D = pairwise_distances(X, Y, metric='gower', n_jobs=2,
1319-
categorical_features=[False, False, True, True])
1312+
# TODO: boolean array with safe_indexing doesn't work?
1313+
D = pairwise_distances(pd.DataFrame(X), pd.DataFrame(Y), metric='gower',
1314+
n_jobs=2,
1315+
categorical_features=[2, 3])
13201316

13211317
assert_array_almost_equal(D_expected, D)
13221318

@@ -1358,7 +1354,7 @@ def test_gower_distances():
13581354

13591355
X[0] = np.nan
13601356
Y[0] = np.nan
1361-
D = gower_distances(X, Y)
1357+
D = gower_distances(X, Y, categorical_features=[0])
13621358
assert_array_equal(D_expected, D)
13631359

13641360
X = np.full((15, 1), True, dtype=np.object)
@@ -1378,7 +1374,6 @@ def test_gower_distances():
13781374
assert_array_equal(D_expected, D)
13791375

13801376

1381-
13821377
def test_haversine_distances():
13831378
# Check haversine distance with distances computation
13841379
def slow_haversine_distances(x, y):
@@ -1402,7 +1397,6 @@ def slow_haversine_distances(x, y):
14021397
haversine_distances(X)
14031398

14041399

1405-
14061400
# Paired distances
14071401

14081402
def test_paired_euclidean_distances():

0 commit comments

Comments
 (0)
0