8000 Fix scaling of LogisticRegression objective for LBFGS · Issue #24752 · scikit-learn/scikit-learn · GitHub
[go: up one dir, main page]

Skip to content

Fix scaling of LogisticRegression objective for LBFGS #24752

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
lorentzenchr opened this issue Oct 25, 2022 · 11 comments · Fixed by #26721
Closed

Fix scaling of LogisticRegression objective for LBFGS #24752

lorentzenchr opened this issue Oct 25, 2022 · 11 comments · Fixed by #26721

Comments

@lorentzenchr
Copy link
Member

Description

The objective function of LogisticRegression is C * sum(log_loss) + penalty. For LBFGS, the reformulation (having the same argmin) is much more favorable: 1/N * sum(log_loss) + 1/(N*C)*penalty.

Note that the division by 1/C is already done for all solvers but "liblinear".

Proposed Action

Similar to _GeneralizedLinearRegressor, internally in _logistic_regression_path`, use

if solver == "lbfgs":
    if sample_weight is None
        sample_weight = np.full_like(y, 1/y.shape[0])
    else:
        sample_weight = sample_weight / sample_weight.sum()

Detailed Background

This unfavorable behaviour was noted in #23314 (comment).

image
image

import warnings
from pathlib import Path
import numpy as np
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn._loss import HalfBinomialLoss
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.linear_model._glm.glm import _GeneralizedLinearRegressor
from time import perf_counter
import pandas as pd
import joblib


class BinomialRegressor(_GeneralizedLinearRegressor):
    def _get_loss(self):
        return HalfBinomialLoss()


@joblib.Memory(location=".").cache
def prepare_data():
    df = fetch_openml(data_id=41214).frame
    df["Frequency"] = df["ClaimNb"] / df["Exposure"]
    log_scale_transformer = make_pipeline(
        FunctionTransformer(np.log, validate=False), StandardScaler()
    )
    linear_model_preprocessor = ColumnTransformer(
        [
            ("passthrough_numeric", "passthrough", ["BonusMalus"]),
            (
                "binned_numeric",
                KBinsDiscretizer(n_bins=10, subsample=None),
                ["VehAge", "DrivAge"],
            ),
            ("log_scaled_numeric", log_scale_transformer, ["Density"]),
            (
                "onehot_categorical",
                OneHotEncoder(),
                ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
            ),
        ],
        remainder="drop",
    )
    y = df["Frequency"]
    w = df["Exposure"]
    X = linear_model_preprocessor.fit_transform(df)
    return X, y, w


X, y, w = prepare_data()
y = y.values
y = (y > np.quantile(y, q=0.95)).astype(np.float64)
print(y.mean())

# Make sure, we use equivalent penalties
alpha = 1e2
C = 1 / alpha / X.shape[0]
lr1 = LogisticRegression(C=C, tol=1e-12, max_iter=10000).fit(X, y)
lr2 = BinomialRegressor(alpha=alpha, tol=1e-12, solver="newton-cholesky").fit(X, y)
assert np.allclose(lr1.intercept_, lr2.intercept_)

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, train_size=50_000, test_size=10_000, random_state=0
)
print(f"{X_train.shape = }")

results = []
loss_sw = np.full_like(y_train, fill_value=(1.0 / y_train.shape[0]))
# loss_sw = None
alpha = 1e-12
C = 1. / alpha / X.shape[0]
max_iter = 10_000
for tol in np.logspace(-1, -10, 10):
    for solver in ["lbfgs", "lbfgs2", "newton-cg", "newton-cholesky"]:
        tic = perf_counter()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            if solver in ["lbfgs", "newton-cg"]:
                model = LogisticRegression(
                    C=C, solver=solver, tol=tol, max_iter=max_iter
                ).fit(X_train, y_train)
                toc = perf_counter()
            else:
                if solver == "lbfgs2":
                    binomial_solver = "lbfgs"
                else:
                    binomial_solver = solver
                model = BinomialRegressor(
                    solver=binomial_solver, alpha=alpha, tol=tol, max_iter=max_iter
                ).fit(X_train, y_train)
                toc = perf_counter()
        train_time = toc - tic

        lml = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=True)
        train_loss = lml.loss(
            coef=np.r_[model.coef_.ravel(), model.intercept_.ravel()],
            X=X_train,
            y=y_train.astype(np.float64),
            sample_weight=loss_sw,
            l2_reg_strength=alpha,
        )
        result = {
            "solver": solver,
            "tol": tol,
            "train_time": train_time,
            "train_loss": train_loss,
            "n_iter": int(np.squeeze(model.n_iter_)),
            "converged": np.squeeze(model.n_iter_ < model.max_iter).all(),
            "coef_sq_norm": (model.coef_**2).sum(),
        }
        print(result)
        results.append(result)

results = pd.DataFrame.from_records(results)

# %%
import pandas as pd
from pathlib import Path

#results = pd.read_csv(Path(__file__).parent / "bench_quantile_clf.csv")
results["suboptimality"] = results["train_loss"] - results["train_loss"].min() + 1e-12
results

# %%
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
for label, group in results.groupby("solver"):
    group.sort_values("tol").plot(x="n_iter", y="suboptimality", loglog=True, marker="o", label=label, ax=ax)
ax.set_ylabel("suboptimality")
ax.set_title(f"LogReg Solver Profile alpha={alpha}")

# %%
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
for label, group in results.groupby("solver"):
    group.sort_values("tol").plot(x="train_time", y="suboptimality", loglog=True, marker="o", label=label, ax=ax)
ax.set_ylabel("suboptimality")
ax.set_title(f"LogReg Solver Profile alpha={alpha}")
where
  • "lbfgs" is LogisticRegression(solver="lbfgs")
  • "lbfgs2" is using
    class BinomialRegressor(_GeneralizedLinearRegressor):
        def _get_loss(self):
            return HalfBinomialLoss()
    with solver="lbfgs"

The same kind of solver performance profile on the kicks dataset, see https://github.com/lorentzenchr/notebooks/blob/master/bench_glm_newton_solvers_cholesky_lsmr.ipynb.

@github-actions github-actions bot added the Needs Triage Issue requires triage label Oct 25, 2022
@OmarManzoor
Copy link
Contributor

@lorentzenchr Can I work on this?

@lorentzenchr
Copy link
Member Author
lorentzenchr commented Oct 27, 2022

@OmarManzoor This is not an easy task! The contributor is expected to produce similar performance benchmarks as above.

@ogrisel
Copy link
Member
ogrisel commented Jun 27, 2023

I think we should bump the priority of the resolution of this problem so as to be able to take a better informed decision on the inclusion of a competing solver for LogisticRegression (e.g. in #25462) and maybe fix #18074 as well.

@lorentzenchr
Copy link
Member Author
lorentzenchr commented Jun 27, 2023

Some time ago, I started a branch to do the 1/n factor in LinearModelLoss. This worked excellent for the GLMs but got too complicated in logistic regression that I stopped. This is related to #11865.

TLDR I‘m happy to assist as reviewer (or I might find time in autumn).

Edit: I published this branch in PR #26721.

@ogrisel
Copy link
Member
ogrisel commented Jun 28, 2023

@OmarManzoor if you are still interested in this, I can help give you guidance.

@OmarManzoor
Copy link
Contributor

@OmarManzoor if you are still interested in this, I can help give you guidance.

I checked the draft PR and it contains quite a bit of the work already. I can try this out if you can provide some guidance because currently I don't really understand why the tests don't pass or what else needs to be done in order to ensure correctness of the results.

@lorentzenchr
Copy link
Member Author

@OmarManzoor It‘s too long ago to remember details. But note that some tests of the logistic regression are a bit poor. So sometimes one has to change those tests.

@ogrisel
Copy link
Member
ogrisel commented Jul 13, 2023

@OmarManzoor sorry I now realize that I won't have time to help you work on this before the second half of August.

@OmarManzoor
Copy link
Contributor

@OmarManzoor sorry I now realize that I won't have time to help you work on this before the second half of August.

No problem! I myself am occupied with other areas.

@ogrisel
Copy link
Member
ogrisel commented Aug 30, 2023

While working on #27191, If found the problem in the script when changing the value of alpha: we should plot the penalized objective function instead just the training loss term.

I updated the script below:

# %%
import warnings
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_spli
8000
t
from sklearn._loss import HalfBinomialLoss
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.linear_model._glm.glm import _GeneralizedLinearRegressor
from time import perf_counter
import pandas as pd
import joblib


class BinomialRegressor(_GeneralizedLinearRegressor):
    def _get_loss(self):
        return HalfBinomialLoss()


@joblib.Memory(location=".").cache
def prepare_data():
    df = fetch_openml(data_id=41214).frame
    df["Frequency"] = df["ClaimNb"] / df["Exposure"]
    log_scale_transformer = make_pipeline(
        FunctionTransformer(np.log, validate=False), StandardScaler()
    )
    linear_model_preprocessor = ColumnTransformer(
        [
            ("passthrough_numeric", "passthrough", ["BonusMalus"]),
            (
                "binned_numeric",
                KBinsDiscretizer(n_bins=10, subsample=None),
                ["VehAge", "DrivAge"],
            ),
            ("log_scaled_numeric", log_scale_transformer, ["Density"]),
            (
                "onehot_categorical",
                OneHotEncoder(),
                ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
            ),
        ],
        remainder="drop",
    )
    y = df["Frequency"]
    w = df["Exposure"]
    X = linear_model_preprocessor.fit_transform(df)
    return X, y, w


X, y, w = prepare_data()
y = (y.values > 0).astype(np.float64)
print(f"{X.shape = }")
print(f"{y.mean() = }")

# Make sure, we use equivalent penalties
# alpha = 1e2
# C = 1 / alpha / X.shape[0]
# lr1 = LogisticRegression(C=C, tol=1e-12, max_iter=10000).fit(X, y)
# lr2 = BinomialRegressor(alpha=alpha, tol=1e-12, solver="newton-cholesky").fit(X, y)
# assert np.allclose(lr1.intercept_, lr2.intercept_)

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, train_size=50_000, test_size=10_000, random_state=0
)
print(f"{X_train.shape = }")

results = []
loss_sw = np.full_like(y_train, fill_value=(1.0 / y_train.shape[0]))
alpha = 1e-6
C = 1.0 / alpha / X_train.shape[0]
max_iter = 10_000
for tol in np.logspace(-1, -10, 10):
    for solver in [
        "lbfgs",
        "lbfgs2",
        "newton-cg",
        "newton-cholesky",
        "newton-cholesky2",
    ]:
        tic = perf_counter()
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            if solver in ["lbfgs", "newton-cg", "newton-cholesky"]:
                model = LogisticRegression(
                    C=C, solver=solver, tol=tol, max_iter=max_iter
                ).fit(X_train, y_train)
                toc = perf_counter()
            else:
                if solver == "lbfgs2":
                    binomial_solver = "lbfgs"
                elif solver == "newton-cholesky2":
                    binomial_solver = "newton-cholesky"
                else:
                    binomial_solver = solver
                model = BinomialRegressor(
                    solver=binomial_solver, alpha=alpha, tol=tol, max_iter=max_iter
                ).fit(X_train, y_train)
                toc = perf_counter()
        train_time = toc - tic

        lml = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=True)
        train_loss = lml.loss(
            coef=np.r_[model.coef_.ravel(), model.intercept_.ravel()],
            X=X_train,
            y=y_train.astype(np.float64),
            sample_weight=loss_sw,
            l2_reg_strength=alpha,
        )
        obj = train_loss + alpha * np.linalg.norm(model.coef_) ** 2
        result = {
            "solver": solver,
            "tol": tol,
            "train_time": train_time,
            "train_loss": train_loss,
            "obj": obj,
            "n_iter": int(np.squeeze(model.n_iter_)),
            "converged": np.squeeze(model.n_iter_ < model.max_iter).all(),
            "coef_sq_norm": (model.coef_**2).sum(),
        }
        print(result)
        results.append(result)

results = pd.DataFrame.from_records(results)

# %%
import pandas as pd
from pathlib import Path

# results = pd.read_csv(Path(__file__).parent / "bench_quantile_clf.csv")
results["suboptimality"] = results["obj"] - results["obj"].min() + 1e-12
results

# %%
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
for label, group in results.groupby("solver"):
    group.sort_values("tol").plot(
        x="n_iter", y="suboptimality", loglog=True, marker="o", label=label, ax=ax
    )
ax.set_ylabel("suboptimality")
ax.set_title(f"LogReg Solver Profile alpha={alpha}");

# %%
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))
for label, group in results.groupby("solver"):
    group.sort_values("tol").plot(
        x="train_time", y="suboptimality", logy=True, marker="o", label=label, ax=ax
    )
ax.set_ylabel("suboptimality")
ax.set_title(f"LogReg Solver Profile alpha={alpha}");
# %%

I also changed other things:

  • now y is just (y_original > 0).astype(np.float64), the data is still imbalanced, but the target more natural,
  • I added included newton-cholesky for LogisticRegression(solver="newton-cholesky") and newton-cholesky2 for BinomialRegressor(solver="newton-cholesky") (similarly to lbfgs vs lbfgs2.

I will further explore tweaks in #27191 before reporting updated plots.

@lorentzenchr
Copy link
Member Author

I found the problem in the script when changing the value of alpha: we should plot the penalized objective function instead just the training loss term.

The LinearModelLoss.loss already contains the penalty term. So your new script is double counting it.
If I remember correctly, I checked it against the verbose output/print.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging a pull request may close this issue.

3 participants
0