21
21
22
22
from ..utils .validation import _num_samples
23
23
from ..utils .validation import check_non_negative
24
+ from ..utils .validation import check_consistent_length
24
25
from ..utils import check_array
25
26
from ..utils import gen_even_slices
26
27
from ..utils import gen_batches , get_chunk_n_rows
35
36
from ..utils .fixes import _object_dtype_isnan
36
37
from ..preprocessing import MinMaxScaler
37
38
39
+
38
40
# Utility Functions
39
41
def _return_float_dtype (X , Y ):
40
42
"""
@@ -831,7 +833,33 @@ def cosine_distances(X, Y=None):
831
833
return S
832
834
833
835
834
- def gower_distances (X , Y = None , categorical_features = None , scale = True ):
836
+ def _split_categorical_numerical (X , categorical_features ):
837
+ # the following bit is done before check_pairwise_array to avoid converting
838
+ # numerical data to object dtype. First we split the data into categorical
839
+ # and numerical, then we do check_array
840
+
841
+ # TODO: this should be more like check_array(..., accept_pandas=True)
842
+ if (X is not None and not hasattr (X , 'iloc' )
843
+ and not hasattr (X , '__array__' )):
844
+ X = check_array (X , dtype = np .object , force_all_finite = False )
845
+
846
+ if callable (categorical_features ) and X is not None :
847
+ cols = categorical_features (X )
848
+ else :
849
+ cols = categorical_features
850
+ if cols is None :
851
+ cols = []
852
+
853
+ if X is not None :
854
+ X_cat = _safe_indexing (X , cols , axis = 1 )
855
+ X_num = _safe_indexing (X , cols , axis = 1 , inverse = True )
856
+ else :
857
+ X_cat = X_num = None
858
+ return X_cat , X_num
859
+
860
+
861
+ def gower_distances (X , Y = None , categorical_features = None , scale = True ,
862
+ min_values = None , scale_factor = None ):
835
863
"""Compute the distances between the observations in X and Y,
836
864
that may contain mixed types of data, using an implementation
837
865
of Gower formula.
@@ -857,7 +885,18 @@ def gower_distances(X, Y=None, categorical_features=None, scale=True):
857
885
scale : bool, default=True
858
886
Indicates if the numerical columns will be scaled between 0 and 1.
859
887
If false, it is assumed the numerical columns are already scaled.
860
- The scaling factors, _i.e._ min and max, are taken from ``X``.
888
+ The scaling factors, _i.e._ min and max, are taken from both ``X`` and
889
+ ``Y``.
890
+
891
+ min_values : ndarray of shape (n_features,), default=None
892
+ Per feature adjustment for minimum. Equivalent to
893
+ ``min_values - X.min(axis=0) * scale_factor``
894
+ If provided, ``scale_factor`` should be provided as well.
895
+
896
+ scale_factor : ndarray of shape (n_features,), default=None
897
+ Per feature relative scaling of the data. Equivalent to
898
+ ``(max_values - min_values) / (X.max(axis=0) - X.min(axis=0))``
899
+ If provided, ``min_values`` should be provided as well.
861
900
862
901
Returns
863
902
-------
@@ -870,8 +909,6 @@ def gower_distances(X, Y=None, categorical_features=None, scale=True):
870
909
871
910
Notes
872
911
-----
873
- The numeric feature ranges are determined from both X and Y.
874
-
875
912
Categorical ordinal attributes should be treated as numeric for the purpose
876
913
of Gower similarity.
877
914
@@ -897,7 +934,7 @@ def _n_cols(X):
897
934
return np .asarray (X ).shape [1 ]
898
935
899
936
def _nanmanhatan (x , y ):
900
- return np .abs (np .nansum (x - y ))
937
+ return np .nansum (np .abs (x - y ))
901
938
902
939
def _non_nans (x , y ):
903
940
return np .sum (~ _object_dtype_isnan (x ) & ~ _object_dtype_isnan (y ))
@@ -909,40 +946,34 @@ def _nanhamming(x, y):
909
946
if issparse (X ) or issparse (Y ):
910
947
raise TypeError ("Gower distance does not support sparse matrices" )
911
948
912
- # TODO: this should be more like check_array(..., accept_pandas=True)
913
- if (X is not None and not hasattr (X , 'iloc' )
914
- and not hasattr (X , '__array__' )):
915
- X = check_array (X , dtype = np .object , force_all_finite = False )
916
- if (Y is not None and not hasattr (Y , 'iloc' )
917
- and not hasattr (Y , '__array__' )):
918
- Y = check_array (Y , dtype = np .object , force_all_finite = False )
919
-
920
949
if X is None or len (X ) == 0 :
921
950
raise ValueError ("X can not be None or empty" )
922
951
923
- if callable ( categorical_features ) :
924
- cols = categorical_features ( X )
925
- else :
926
- cols = categorical_features
927
- if cols is None :
928
- cols = []
952
+ if scale :
953
+ if ( scale_factor is None ) != ( min_values is None ):
954
+ raise ValueError ( "min_value and scale_factor should be provided "
955
+ "together." )
956
+ X_cat , X_num = _split_categorical_numerical ( X , categorical_features )
957
+ Y_cat , Y_num = _split_categorical_numerical ( Y , categorical_features )
929
958
930
- X_cat = _safe_indexing (X , cols , axis = 1 )
931
- X_num = _safe_indexing (X , cols , axis = 1 , inverse = True )
932
- #print(X_cat)
933
- #print(X_num)
934
- if Y is not None :
935
- Y_cat = _safe_indexing (Y , cols , axis = 1 )
936
- Y_num = _safe_indexing (Y , cols , axis = 1 , inverse = True )
937
- else :
938
- Y_cat = Y_num = None
959
+ if min_values is not None :
960
+ min_values = np .asarray (min_values )
961
+ scale_factor = np .asarray (scale_factor )
962
+ check_consistent_length (min_values , scale_factor ,
963
+ np .ndarray (shape = (_n_cols (X_num ), 0 )))
939
964
940
965
if _n_cols (X_num ):
941
966
X_num , Y_num = check_pairwise_arrays (X_num , Y_num , precomputed = False ,
942
967
dtype = float ,
943
968
force_all_finite = False )
944
969
if scale :
945
- trs = MinMaxScaler ().fit (X_num )
970
+ scale_data = X_num if Y_num is X_num else np .vstack ((X_num , Y_num ))
971
+ if scale_factor is None :
972
+ trs = MinMaxScaler ().fit (scale_data )
973
+ else :
974
+ trs = MinMaxScaler ()
975
+ trs .scale_ = scale_factor
976
+ trs .min_ = min_values
946
977
X_num = trs .transform (X_num )
947
978
Y_num = trs .transform (Y_num )
948
979
@@ -960,11 +991,9 @@ def _nanhamming(x, y):
960
991
else :
961
992
nan_hamming = valid_cat = None
962
993
963
- #print(nan_manhatan)
964
- #print(valid_num)
965
- #print(nan_hamming)
966
- #print(valid_cat)
967
-
994
+ # based on whether there are categorical and/or numerical data present,
995
+ # we compute the distance metric
996
+ # Division by zero and nans warnings are ignored since they are expected
968
997
with np .errstate (divide = 'ignore' , invalid = 'ignore' ):
969
998
if valid_num is not None and valid_cat is not None :
970
999
D = (nan_manhatan + nan_hamming ) / (valid_num + valid_cat )
@@ -1578,18 +1607,27 @@ def _precompute_metric_params(X, Y, metric=None, **kwds):
1578
1607
"""Precompute data-derived metric parameters if not provided
1579
1608
"""
1580
1609
if metric == 'gower' :
1581
- categorical_features = None
1582
- if 'categorical_features' in kwds :
1583
- categorical_features = kwds ['categorical_features' ]
1610
+ categorical_features = kwds .get ('categorical_features' , None )
1611
+
1612
+ _ , X_num = _split_categorical_numerical (X , categorical_features )
1613
+ _ , Y_num = _split_categorical_numerical (Y , categorical_features )
1614
+
1615
+ scale = kwds .get ('scale' , True )
1616
+ if not scale :
1617
+ return {'min_values' : None , 'scale_factor' : None , 'scale' : False }
1584
1618
1585
- num_mask = ~ _detect_categorical_features (X , categorical_features )
1619
+ scale_factor = kwds .get ('scale_factor' , None )
1620
+ min_values = kwds .get ('min_values' , None )
1621
+ if min_values is None :
1622
+ data = X_num if Y is X or Y is None else np .vstack ((X_num , Y_num ))
1623
+ trs = MinMaxScaler ().fit (data )
1624
+ min_values = trs .min_
1625
+ scale_factor = trs .scale_
1586
1626
1587
- scale = None
1588
- if 'scale' in kwds :
1589
- scale = kwds ['scale' ]
1590
- scale , _ , _ = _precompute_gower_params (X , Y , scale , num_mask )
1627
+ return {'min_values' : min_values ,
1628
+ 'scale_factor' : scale_factor ,
1629
+ 'scale' : True }
1591
1630
1592
- return {'scale' : scale }
1593
1631
if metric == "seuclidean" and 'V' not in kwds :
1594
1632
if X is Y :
1595
1633
V = np .var (X , axis = 0 , ddof = 1 )
@@ -1883,11 +1921,13 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
1883
1921
return X
1884
1922
elif metric in PAIRWISE_DISTANCE_FUNCTIONS :
1885
1923
if metric == 'gower' :
1924
+ """
1886
1925
# These conversions are necessary for matrices with string values
1887
1926
if not isinstance(X, np.ndarray):
1888
1927
X = np.asarray(X, dtype=np.object)
1889
1928
if Y is not None and not isinstance(Y, np.ndarray):
1890
1929
Y = np.asarray(Y, dtype=np.object)
1930
+ """
1891
1931
params = _precompute_metric_params (X , Y , metric = metric , ** kwds )
1892
1932
kwds .update (** params )
1893
1933
0 commit comments