scikit-learn
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
Lines changed: 83 additions & 43 deletions
diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
Lines changed: 24 additions & 30 deletions
@@ -21,6 +21,7 @@
 
 from ..utils.validation import _num_samples
 from ..utils.validation import check_non_negative
+from ..utils.validation import check_consistent_length
 from ..utils import check_array
 from ..utils import gen_even_slices
 from ..utils import gen_batches, get_chunk_n_rows
@@ -35,6 +36,7 @@
 from ..utils.fixes import _object_dtype_isnan
 from ..preprocessing import MinMaxScaler
 
+
 # Utility Functions
 def _return_float_dtype(X, Y):
     """
@@ -831,7 +833,33 @@ def cosine_distances(X, Y=None):
     return S
 
 
-def gower_distances(X, Y=None, categorical_features=None, scale=True):
+def _split_categorical_numerical(X, categorical_features):
+    # Split X column-wise into (X_cat, X_num) using ``categorical_features``.
+    # This is done before check_pairwise_arrays to avoid converting numerical
+    # data to object dtype. Returns (None, None) when X is None.
+
+    # TODO: this should be more like check_array(..., accept_pandas=True)
+    if (X is not None and not hasattr(X, 'iloc')
+            and not hasattr(X, '__array__')):
+        X = check_array(X, dtype=np.object, force_all_finite=False)
+
+    if callable(categorical_features) and X is not None:
+        cols = categorical_features(X)
+    else:
+        cols = categorical_features
+    if cols is None:
+        cols = []
+
+    if X is not None:
+        X_cat = _safe_indexing(X, cols, axis=1)
+        X_num = _safe_indexing(X, cols, axis=1, inverse=True)
+    else:
+        X_cat = X_num = None
+    return X_cat, X_num
+
+
+def gower_distances(X, Y=None, categorical_features=None, scale=True,
+                    min_values=None, scale_factor=None):
     """Compute the distances between the observations in X and Y,
     that may contain mixed types of data, using an implementation
     of Gower formula.
@@ -857,7 +885,18 @@ def gower_distances(X, Y=None, categorical_features=None, scale=True):
     scale : bool, default=True
         Indicates if the numerical columns will be scaled between 0 and 1.
         If false, it is assumed the numerical columns are already scaled.
-        The scaling factors, _i.e._ min and max, are taken from ``X``.
+        The scaling factors, _i.e._ min and max, are taken from both ``X`` and
+        ``Y``.
+
+    min_values : ndarray of shape (n_features,), default=None
+        Per feature additive offset, i.e. the ``min_`` attribute of a fitted
+        ``MinMaxScaler``; data is scaled as ``X * scale_factor + min_values``.
+        If provided, ``scale_factor`` should be provided as well.
+
+    scale_factor : ndarray of shape (n_features,), default=None
+        Per feature multiplicative scaling, i.e. the ``scale_`` attribute of a
+        fitted ``MinMaxScaler``, ``1 / (data_max - data_min)`` by default.
+        If provided, ``min_values`` should be provided as well.
 
     Returns
     -------
@@ -870,8 +909,6 @@ def gower_distances(X, Y=None, categorical_features=None, scale=True):
 
     Notes
     -----
-    The numeric feature ranges are determined from both X and Y.
-
     Categorical ordinal attributes should be treated as numeric for the purpose
     of Gower similarity.
 
@@ -897,7 +934,7 @@ def _n_cols(X):
         return np.asarray(X).shape[1]
 
     def _nanmanhatan(x, y):
-        return np.abs(np.nansum(x - y))
+        return np.nansum(np.abs(x - y))
 
     def _non_nans(x, y):
         return np.sum(~_object_dtype_isnan(x) & ~_object_dtype_isnan(y))
@@ -909,40 +946,34 @@ def _nanhamming(x, y):
     if issparse(X) or issparse(Y):
         raise TypeError("Gower distance does not support sparse matrices")
 
-    # TODO: this should be more like check_array(..., accept_pandas=True)
-    if (X is not None and not hasattr(X, 'iloc')
-            and not hasattr(X, '__array__')):
-        X = check_array(X, dtype=np.object, force_all_finite=False)
-    if (Y is not None and not hasattr(Y, 'iloc')
-            and not hasattr(Y, '__array__')):
-        Y = check_array(Y, dtype=np.object, force_all_finite=False)
-
     if X is None or len(X) == 0:
         raise ValueError("X can not be None or empty")
 
-    if callable(categorical_features):
-        cols = categorical_features(X)
-    else:
-        cols = categorical_features
-    if cols is None:
-        cols = []
+    if scale:
+        if (scale_factor is None) != (min_values is None):
+            raise ValueError("min_value and scale_factor should be provided "
+                             "together.")
+    X_cat, X_num = _split_categorical_numerical(X, categorical_features)
+    Y_cat, Y_num = _split_categorical_numerical(Y, categorical_features)
 
-    X_cat = _safe_indexing(X, cols, axis=1)
-    X_num = _safe_indexing(X, cols, axis=1, inverse=True)
-    #print(X_cat)
-    #print(X_num)
-    if Y is not None:
-        Y_cat = _safe_indexing(Y, cols, axis=1)
-        Y_num = _safe_indexing(Y, cols, axis=1, inverse=True)
-    else:
-        Y_cat = Y_num = None
+    if min_values is not None:
+        min_values = np.asarray(min_values)
+        scale_factor = np.asarray(scale_factor)
+        check_consistent_length(min_values, scale_factor,
+                                np.ndarray(shape=(_n_cols(X_num), 0)))
 
     if _n_cols(X_num):
         X_num, Y_num = check_pairwise_arrays(X_num, Y_num, precomputed=False,
                                              dtype=float,
                                              force_all_finite=False)
         if scale:
-            trs = MinMaxScaler().fit(X_num)
+            scale_data = X_num if Y_num is X_num else np.vstack((X_num, Y_num))
+            if scale_factor is None:
+                trs = MinMaxScaler().fit(scale_data)
+            else:
+                trs = MinMaxScaler()
+                trs.scale_ = scale_factor
+                trs.min_ = min_values
             X_num = trs.transform(X_num)
             Y_num = trs.transform(Y_num)
 
@@ -960,11 +991,9 @@ def _nanhamming(x, y):
     else:
         nan_hamming = valid_cat = None
 
-    #print(nan_manhatan)
-    #print(valid_num)
-    #print(nan_hamming)
-    #print(valid_cat)
-
+    # based on whether there are categorical and/or numerical data present,
+    # we compute the distance metric
+    # Division by zero and nans warnings are ignored since they are expected
     with np.errstate(divide='ignore', invalid='ignore'):
         if valid_num is not None and valid_cat is not None:
             D = (nan_manhatan + nan_hamming) / (valid_num + valid_cat)
@@ -1578,18 +1607,27 @@ def _precompute_metric_params(X, Y, metric=None, **kwds):
     """Precompute data-derived metric parameters if not provided
     """
     if metric == 'gower':
-        categorical_features = None
-        if 'categorical_features' in kwds:
-            categorical_features = kwds['categorical_features']
+        categorical_features = kwds.get('categorical_features', None)
+
+        _, X_num = _split_categorical_numerical(X, categorical_features)
+        _, Y_num = _split_categorical_numerical(Y, categorical_features)
+
+        scale = kwds.get('scale', True)
+        if not scale:
+            return {'min_values': None, 'scale_factor': None, 'scale': False}
 
-        num_mask = ~ _detect_categorical_features(X, categorical_features)
+        scale_factor = kwds.get('scale_factor', None)
+        min_values = kwds.get('min_values', None)
+        if min_values is None:
+            data = X_num if Y is X or Y is None else np.vstack((X_num, Y_num))
+            trs = MinMaxScaler().fit(data)
+            min_values = trs.min_
+            scale_factor = trs.scale_
 
-        scale = None
-        if 'scale' in kwds:
-            scale = kwds['scale']
-        scale, _, _ = _precompute_gower_params(X, Y, scale, num_mask)
+        return {'min_values': min_values,
+                'scale_factor': scale_factor,
+                'scale': True}
 
-        return {'scale': scale}
     if metric == "seuclidean" and 'V' not in kwds:
         if X is Y:
             V = np.var(X, axis=0, ddof=1)
@@ -1883,11 +1921,13 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
         return X
     elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
         if metric == 'gower':
+            """
             # These convertions are necessary for matrices with string values
             if not isinstance(X, np.ndarray):
                 X = np.asarray(X, dtype=np.object)
             if Y is not None and not isinstance(Y, np.ndarray):
                 Y = np.asarray(Y, dtype=np.object)
+            """
             params = _precompute_metric_params(X, Y, metric=metric, **kwds)
             kwds.update(**params)
 
 
@@ -1132,15 +1132,13 @@ def test_gower_distances():
     assert_array_almost_equal(D * 2, manhattan_distances(X))
 
     # Test to obtain a non-squared distance matrix
-    X = np.array([['Syria', 1.0, 0.0, 0.0, True],
-                  ['Ireland', 0.181818, 0.0, 1, False],
-                  ['United Kingdom', 0.0, 0.0, 0.160377, False]],
-                 dtype=object)
+    X = [['Syria', 1.0, 0.0, 0.0, True],
+         ['Ireland', 0.181818, 0.0, 1, False],
+         ['United Kingdom', 0.0, 0.0, 0.160377, False]]
 
-    Y = np.array([['United Kingdom', 0.090909, 0.0, 0.500109, True]],
-                 dtype=object)
+    Y = [['United Kingdom', 0.090909, 0.0, 0.500109, True]]
 
-    D = gower_distances(X, Y, categorical_features=[0, 3], scale=True)
+    D = gower_distances(X, Y, categorical_features=[0, 4], scale=False)
 
     # Simplified calculation of Gower distance for expected values
     D_expected = np.zeros((3, 1))
@@ -1204,12 +1202,7 @@ def test_gower_distances():
     D = gower_distances(X)
     assert_array_almost_equal(D_expected, D)
 
-    # Test warnings for unexpected non-normalized data
-    X = [[1, 20], [0, -10.0]]
-    with pytest.raises(ValueError):
-        gower_distances(X, scale=False)
-
-    # Test X and Y with diferent ranges of numeric values
+    # Test X and Y with different ranges of numeric values
     X = [[9222.22, -11],
          [41934.0, -44],
          [1, 1]]
@@ -1218,7 +1211,7 @@ def test_gower_distances():
          [1934.0, 4],
          [3000, 3000]]
 
-    D = gower_distances(X, Y)
+    D = gower_distances(X, Y, scale=True)
 
     # The expected normalized values above are:
     Xn = [[0.22403432, 0.010841],
@@ -1244,18 +1237,11 @@ def test_gower_distances():
     D = gower_distances(X, Y)
     assert_array_almost_equal(D_expected, D)
 
-    # Test gower robustness after slice the data, with its original ranges
-    D = gower_distances(X, Y[1:2],  scale=[42156.22, 3044.0])
-    assert_array_almost_equal(D_expected[:, 1:2], D)
-
     # an assertion error is expected here, because there is no scale
     D = gower_distances(X, Y[1:2])
     with pytest.raises(AssertionError):
         assert_array_almost_equal(D_expected[:, 1:2], D)
 
-    D = gower_distances(X, Y[0:1], scale=[42156.22, 3044.0])
-    assert_array_almost_equal(D_expected[:, 0:1], D)
-
     # an assertion error is expected here, because there is no scale
     D = gower_distances(X, Y[0:1])
     with pytest.raises(AssertionError):
@@ -1265,7 +1251,7 @@ def test_gower_distances():
     D = pairwise_distances(X, Y, metric='gower', n_jobs=2)
     assert_array_almost_equal(D_expected, D)
 
-    # Test X and Y with diferent ranges of numeric values, categorical values,
+    # Test X and Y with different ranges of numeric values, categorical values,
     # and using pairwise_distances
     X = [[9222.22, -11, 'M', 1],
          [41934.0, -44, 'F', 1],
@@ -1295,11 +1281,17 @@ def test_gower_distances():
             # by observation, attribute by attribute.
             D_expected[i][j] = ((abs(Xn[i][0] - Yn[j][0]) +
                                  abs(Xn[i][1] - Yn[j][1]) +
-                                 [1, 0][Xn[i][2] == Yn[j][2]] +
+                                 ([1, 0][Xn[i][2] == Yn[j][2]]
+                                  if (Xn[i][2] == Xn[i][2] and
+                                      Yn[i][2] == Yn[i][2]) else 0) +
                                  abs(Xn[i][3] - Yn[j][3])) /
                                 non_missing_cols[i])
 
-    D = pairwise_distances(X, Y, metric='gower', n_jobs=2)
+    # pairwise_distances will convert the input to strings and np.nan would
+    # therefore be 'nan'. Passing DataFrames will avoid that.
+    D = pairwise_distances(pd.DataFrame(X), pd.DataFrame(Y), metric='gower',
+                           n_jobs=2,
+                           categorical_features=[2])
     assert_array_almost_equal(D_expected, D)
 
     # Test categorical_values passed in kwargs
@@ -1311,12 +1303,16 @@ def test_gower_distances():
             # by observation, attribute by attribute.
             D_expected[i][j] = ((abs(Xn[i][0] - Yn[j][0]) +
                                  abs(Xn[i][1] - Yn[j][1]) +
-                                 [1, 0][Xn[i][2] == Yn[j][2]] +
+                                 ([1, 0][Xn[i][2] == Yn[j][2]]
+                                  if (Xn[i][2] == Xn[i][2] and
+                                      Yn[i][2] == Yn[i][2]) else 0) +
                                  [1, 0][Xn[i][3] == Yn[j][3]]) /
                                 non_missing_cols[i])
 
-    D = pairwise_distances(X, Y, metric='gower', n_jobs=2,
-                           categorical_features=[False, False, True, True])
+    # TODO: boolean array with safe_indexing doesn't work?
+    D = pairwise_distances(pd.DataFrame(X), pd.DataFrame(Y), metric='gower',
+                           n_jobs=2,
+                           categorical_features=[2, 3])
 
     assert_array_almost_equal(D_expected, D)
 
@@ -1358,7 +1354,7 @@ def test_gower_distances():
 
     X[0] = np.nan
     Y[0] = np.nan
-    D = gower_distances(X, Y)
+    D = gower_distances(X, Y, categorical_features=[0])
     assert_array_equal(D_expected, D)
 
     X = np.full((15, 1), True, dtype=np.object)
@@ -1378,7 +1374,6 @@ def test_gower_distances():
     assert_array_equal(D_expected, D)
 
 
-
 def test_haversine_distances():
     # Check haversine distance with distances computation
     def slow_haversine_distances(x, y):
@@ -1402,7 +1397,6 @@ def slow_haversine_distances(x, y):
         haversine_distances(X)
 
 
-
 # Paired distances
 
 def test_paired_euclidean_distances():