1
-
2
1
"""
3
2
Logistic Regression
4
3
"""
28
27
from ..utils .extmath import row_norms
29
28
from ..utils .optimize import newton_cg
30
29
from ..utils .validation import check_X_y
31
- from ..exceptions import DataConversionWarning
32
30
from ..exceptions import NotFittedError
33
31
from ..utils .fixes import expit
34
32
from ..utils .multiclass import check_classification_targets
@@ -925,9 +923,6 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
925
923
y_test = np .ones (y_test .shape , dtype = np .float64 )
926
924
y_test [~ mask ] = - 1.
927
925
928
- # To deal with object dtypes, we need to convert into an array of floats.
929
- y_test = check_array (y_test , dtype = np .float64 , ensure_2d = False )
930
-
931
926
scores = list ()
932
927
933
928
if isinstance (scoring , six .string_types ):
@@ -1561,64 +1556,67 @@ def fit(self, X, y, sample_weight=None):
1561
1556
1562
1557
X , y = check_X_y (X , y , accept_sparse = 'csr' , dtype = np .float64 ,
1563
1558
order = "C" )
1564
-
1559
+ check_classification_targets ( y )
1565
1560
if self .solver == 'sag' :
1566
1561
max_squared_sum = row_norms (X , squared = True ).max ()
1567
1562
else :
1568
1563
max_squared_sum = None
1569
1564
1570
- check_classification_targets (y )
1565
+ # Encode for string labels
1566
+ label_encoder = LabelEncoder ().fit (y )
1567
+ y = label_encoder .transform (y )
1568
+ self .classes_ = label_encoder .classes_
1571
1569
1572
- if y .ndim == 2 and y .shape [1 ] == 1 :
1573
- warnings .warn (
1574
- "A column-vector y was passed when a 1d array was"
1575
- " expected. Please change the shape of y to "
1576
- "(n_samples, ), for example using ravel()." ,
1577
- DataConversionWarning )
1578
- y = np .ravel (y )
1570
+ enc_labels = label_encoder .transform (label_encoder .classes_ )
1571
+ cls_labels = self .classes_ # The original class labels
1579
1572
1580
- check_consistent_length (X , y )
1573
+ class_weight = self .class_weight
1574
+ if isinstance (class_weight , dict ):
1575
+ old_keys = list (class_weight .keys ())
1576
+ new_keys = label_encoder .transform (old_keys )
1577
+ # Don't modify the original class_weight dict.
1578
+ class_weight = dict ()
1579
+ for new_key , old_key in zip (new_keys , old_keys ):
1580
+ class_weight [new_key ] = self .class_weight [old_key ]
1581
1581
1582
1582
# init cross-validation generator
1583
1583
cv = check_cv (self .cv , y , classifier = True )
1584
1584
folds = list (cv .split (X , y ))
1585
1585
1586
- self ._enc = LabelEncoder ()
1587
- self ._enc .fit (y )
1588
-
1589
- labels = self .classes_ = np .unique (y )
1590
- n_classes = len (labels )
1586
+ # Use the label encoded classes
1587
+ n_classes = len (enc_labels )
1591
1588
1592
1589
if n_classes < 2 :
1593
1590
raise ValueError ("This solver needs samples of at least 2 classes"
1594
1591
" in the data, but the data contains only one"
1595
1592
" class: %r" % self .classes_ [0 ])
1593
+
1596
1594
if n_classes == 2 :
1597
1595
# OvR in case of binary problems is as good as fitting
1598
1596
# the higher label
1599
1597
n_classes = 1
1600
- labels = labels [1 :]
1598
+ enc_labels = enc_labels [1 :]
1599
+ cls_labels = cls_labels [1 :]
1601
1600
1602
1601
# We need this hack to iterate only once over labels, in the case of
1603
1602
# multi_class = multinomial, without changing the value of the labels.
1604
- iter_labels = labels
1605
1603
if self .multi_class == 'multinomial' :
1606
- iter_labels = [None ]
1604
+ iter_labels = iter_classes = [None ]
1605
+ else :
1606
+ iter_labels = enc_labels
1607
+ iter_classes = cls_labels
1607
1608
1608
- if self .class_weight and not (isinstance (self .class_weight , dict ) or
1609
- self .class_weight in
1610
- ['balanced' , 'auto' ]):
1609
+ if class_weight and not (isinstance (class_weight , dict ) or
1610
+ class_weight in ['balanced' , 'auto' ]):
1611
1611
# 'auto' is deprecated and will be removed in 0.19
1612
1612
raise ValueError ("class_weight provided should be a "
1613
1613
"dict or 'balanced'" )
1614
1614
1615
1615
# compute the class weights for the entire dataset y
1616
- if self . class_weight in ("auto" , "balanced" ):
1617
- classes = np .unique ( y )
1618
- class_weight = compute_class_weight (self . class_weight , classes , y )
1616
+ if class_weight in ("auto" , "balanced" ):
1617
+ classes = np .arange ( len ( self . classes_ ) )
1618
+ class_weight = compute_class_weight (class_weight , classes , y )
1619
1619
class_weight = dict (zip (classes , class_weight ))
1620
- else :
1621
- class_weight = self .class_weight
1622
1620
1623
1621
path_func = delayed (_log_reg_scoring_path )
1624
1622
@@ -1669,9 +1667,9 @@ def fit(self, X, y, sample_weight=None):
1669
1667
self .n_iter_ = np .reshape (n_iter_ , (n_classes , len (folds ),
1670
1668
len (self .Cs_ )))
1671
1669
1672
- self .coefs_paths_ = dict (zip (labels , coefs_paths ))
1670
+ self .coefs_paths_ = dict (zip (cls_labels , coefs_paths ))
1673
1671
scores = np .reshape (scores , (n_classes , len (folds ), - 1 ))
1674
- self .scores_ = dict (zip (labels , scores ))
1672
+ self .scores_ = dict (zip (cls_labels , scores ))
1675
1673
1676
1674
self .C_ = list ()
1677
1675
self .coef_ = np .empty ((n_classes , X .shape [1 ]))
@@ -1682,10 +1680,14 @@ def fit(self, X, y, sample_weight=None):
1682
1680
scores = multi_scores
1683
1681
coefs_paths = multi_coefs_paths
1684
1682
1685
- for index , label in enumerate (iter_labels ):
1683
+ for index , (cls_lbl , enc_lbl ) in enumerate (
1684
+ zip (iter_classes , iter_labels )):
1685
+
1686
1686
if self .multi_class == 'ovr' :
1687
- scores = self .scores_ [label ]
1688
- coefs_paths = self .coefs_paths_ [label ]
1687
+ # The scores_ / coefs_paths_ dict have unencoded class
1688
+ # labels as their keys
1689
+ scores = self .scores_ [cls_lbl ]
1690
+ coefs_paths = self .coefs_paths_ [cls_lbl ]
1689
1691
1690
1692
if self .refit :
1691
1693
best_index = scores .sum (axis = 0 ).argmax ()
@@ -1698,8 +1700,10 @@ def fit(self, X, y, sample_weight=None):
1698
1700
else :
1699
1701
coef_init = np .mean (coefs_paths [:, best_index , :], axis = 0 )
1700
1702
1703
+ # Note that y is label encoded and hence pos_class must be
1704
+ # the encoded label / None (for 'multinomial')
1701
1705
w , _ , _ = logistic_regression_path (
1702
- X , y , pos_class = label , Cs = [C_ ], solver = self .solver ,
1706
+ X , y , pos_class = enc_lbl , Cs = [C_ ], solver = self .solver ,
1703
1707
fit_intercept = self .fit_intercept , coef = coef_init ,
1704
1708
max_iter = self .max_iter , tol = self .tol ,
1705
1709
penalty = self .penalty , copy = False ,
0 commit comments