1
-
2
1
"""
3
2
Logistic Regression
4
3
"""
28
27
from ..utils .extmath import row_norms
29
28
from ..utils .optimize import newton_cg
30
29
from ..utils .validation import check_X_y
31
- from ..exceptions import DataConversionWarning
32
30
from ..exceptions import NotFittedError
33
31
from ..utils .fixes import expit
34
32
from ..utils .multiclass import check_classification_targets
@@ -925,9 +923,6 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
925
923
y_test = np .ones (y_test .shape , dtype = np .float64 )
926
924
y_test [~ mask ] = - 1.
927
925
928
- # To deal with object dtypes, we need to convert into an array of floats.
929
- y_test = check_array (y_test , dtype = np .float64 , ensure_2d = False )
930
-
931
926
scores = list ()
932
927
933
928
if isinstance (scoring , six .string_types ):
@@ -1561,64 +1556,64 @@ def fit(self, X, y, sample_weight=None):
1561
1556
1562
1557
X , y = check_X_y (X , y , accept_sparse = 'csr' , dtype = np .float64 ,
1563
1558
order = "C" )
1559
+ check_classification_targets (y )
1560
+
1561
+ class_weight = self .class_weight
1562
+ if class_weight and not (isinstance (class_weight , dict ) or
1563
+ class_weight in ['balanced' , 'auto' ]):
1564
+ # 'auto' is deprecated and will be removed in 0.19
1565
+ raise ValueError ("class_weight provided should be a "
1566
+ "dict or 'balanced'" )
1567
+
1568
+ # Encode for string labels
1569
+ label_encoder = LabelEncoder ().fit (y )
1570
+ y = label_encoder .transform (y )
1571
+ if isinstance (class_weight , dict ):
1572
+ class_weight = dict ((label_encoder .transform ([cls ])[0 ], v )
1573
+ for cls , v in class_weight .items ())
1574
+
1575
+ # The original class labels
1576
+ classes = self .classes_ = label_encoder .classes_
1577
+ encoded_labels = label_encoder .transform (label_encoder .classes_ )
1564
1578
1565
1579
if self .solver == 'sag' :
1566
1580
max_squared_sum = row_norms (X , squared = True ).max ()
1567
1581
else :
1568
1582
max_squared_sum = None
1569
1583
1570
- check_classification_targets (y )
1571
-
1572
- if y .ndim == 2 and y .shape [1 ] == 1 :
1573
- warnings .warn (
1574
- "A column-vector y was passed when a 1d array was"
1575
- " expected. Please change the shape of y to "
1576
- "(n_samples, ), for example using ravel()." ,
1577
- DataConversionWarning )
1578
- y = np .ravel (y )
1579
-
1580
- check_consistent_length (X , y )
1581
-
1582
1584
# init cross-validation generator
1583
1585
cv = check_cv (self .cv , y , classifier = True )
1584
1586
folds = list (cv .split (X , y ))
1585
1587
1586
- self ._enc = LabelEncoder ()
1587
- self ._enc .fit (y )
1588
-
1589
- labels = self .classes_ = np .unique (y )
1590
- n_classes = len (labels )
1588
+ # Use the label encoded classes
1589
+ n_classes = len (encoded_labels )
1591
1590
1592
1591
if n_classes < 2 :
1593
1592
raise ValueError ("This solver needs samples of at least 2 classes"
1594
1593
" in the data, but the data contains only one"
1595
- " class: %r" % self .classes_ [0 ])
1594
+ " class: %r" % classes [0 ])
1595
+
1596
1596
if n_classes == 2 :
1597
1597
# OvR in case of binary problems is as good as fitting
1598
1598
# the higher label
1599
1599
n_classes = 1
1600
- labels = labels [1 :]
1600
+ encoded_labels = encoded_labels [1 :]
1601
+ classes = classes [1 :]
1601
1602
1602
1603
# We need this hack to iterate only once over labels, in the case of
1603
1604
# multi_class = multinomial, without changing the value of the labels.
1604
- iter_labels = labels
1605
1605
if self .multi_class == 'multinomial' :
1606
- iter_labels = [None ]
1607
-
1608
- if self .class_weight and not (isinstance (self .class_weight , dict ) or
1609
- self .class_weight in
1610
- ['balanced' , 'auto' ]):
1611
- # 'auto' is deprecated and will be removed in 0.19
1612
- raise ValueError ("class_weight provided should be a "
1613
- "dict or 'balanced'" )
1606
+ iter_encoded_labels = iter_classes = [None ]
1607
+ else :
1608
+ iter_encoded_labels = encoded_labels
1609
+ iter_classes = classes
1614
1610
1615
1611
# compute the class weights for the entire dataset y
1616
- if self .class_weight in ("auto" , "balanced" ):
1617
- classes = np .unique (y )
1618
- class_weight = compute_class_weight (self .class_weight , classes , y )
1619
- class_weight = dict (zip (classes , class_weight ))
1620
- else :
1621
- class_weight = self .class_weight
1612
+ if class_weight in ("auto" , "balanced" ):
1613
+ class_weight = compute_class_weight (class_weight ,
1614
+ np .arange (len (self .classes_ )),
1615
+ y )
1616
+ class_weight = dict (enumerate (class_weight ))
1622
1617
1623
1618
path_func = delayed (_log_reg_scoring_path )
1624
1619
@@ -1638,7 +1633,7 @@ def fit(self, X, y, sample_weight=None):
1638
1633
max_squared_sum = max_squared_sum ,
1639
1634
sample_weight = sample_weight
1640
1635
)
1641
- for label in iter_labels
1636
+ for label in iter_encoded_labels
1642
1637
for train , test in folds )
1643
1638
1644
1639
if self .multi_class == 'multinomial' :
@@ -1669,9 +1664,9 @@ def fit(self, X, y, sample_weight=None):
1669
1664
self .n_iter_ = np .reshape (n_iter_ , (n_classes , len (folds ),
1670
1665
len (self .Cs_ )))
1671
1666
1672
- self .coefs_paths_ = dict (zip (labels , coefs_paths ))
1667
+ self .coefs_paths_ = dict (zip (classes , coefs_paths ))
1673
1668
scores = np .reshape (scores , (n_classes , len (folds ), - 1 ))
1674
- self .scores_ = dict (zip (labels , scores ))
1669
+ self .scores_ = dict (zip (classes , scores ))
1675
1670
1676
1671
self .C_ = list ()
1677
1672
self .coef_ = np .empty ((n_classes , X .shape [1 ]))
@@ -1682,10 +1677,14 @@ def fit(self, X, y, sample_weight=None):
1682
1677
scores = multi_scores
1683
1678
coefs_paths = multi_coefs_paths
1684
1679
1685
- for index , label in enumerate (iter_labels ):
1680
+ for index , (cls , encoded_label ) in enumerate (
1681
+ zip (iter_classes , iter_encoded_labels )):
1682
+
1686
1683
if self .multi_class == 'ovr' :
1687
- scores = self .scores_ [label ]
1688
- coefs_paths = self .coefs_paths_ [label ]
1684
+ # The scores_ / coefs_paths_ dicts have unencoded class
1685
+ # labels as their keys
1686
+ scores = self .scores_ [cls ]
1687
+ coefs_paths = self .coefs_paths_ [cls ]
1689
1688
1690
1689
if self .refit :
1691
1690
best_index = scores .sum (axis = 0 ).argmax ()
@@ -1698,8 +1697,10 @@ def fit(self, X, y, sample_weight=None):
1698
1697
else :
1699
1698
coef_init = np .mean (coefs_paths [:, best_index , :], axis = 0 )
1700
1699
1700
+ # Note that y is label encoded and hence pos_class must be
1701
+ # the encoded label / None (for 'multinomial')
1701
1702
w , _ , _ = logistic_regression_path (
1702
- X , y , pos_class = label , Cs = [C_ ], solver = self .solver ,
1703
+ X , y , pos_class = encoded_label , Cs = [C_ ], solver = self .solver ,
1703
1704
fit_intercept = self .fit_intercept , coef = coef_init ,
1704
1705
max_iter = self .max_iter , tol = self .tol ,
1705
1706
penalty = self .penalty , copy = False ,
0 commit comments