feat: support data split for evaluation in linear and ensemble models by shobsi · Pull Request #1081 · googleapis/python-bigquery-dataframes · GitHub
[go: up one dir, main page]

Skip to content
105 changes: 92 additions & 13 deletions bigframes/ml/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from __future__ import annotations

from typing import Dict, List, Literal, Optional, Union
from typing import List, Literal, Optional, Union

import bigframes_vendored.sklearn.ensemble._forest
import bigframes_vendored.xgboost.sklearn
Expand Down Expand Up @@ -47,6 +47,9 @@
"max_iterations": "maxIterations",
"enable_global_explain": "enableGlobalExplain",
"xgboost_version": "xgboostVersion",
"data_split_method": "dataSplitMethod",
"data_split_eval_fraction": "dataSplitEvalFraction",
"data_split_col": "dataSplitColumn",
}


Expand Down Expand Up @@ -78,6 +81,15 @@ def __init__(
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.n_estimators = n_estimators
self.booster = booster
Expand All @@ -97,6 +109,9 @@ def __init__(
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -115,11 +130,11 @@ def _from_bq(
return model

@property
def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
return {
options = {
"model_type": "BOOSTED_TREE_REGRESSOR",
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"early_stop": True,
"num_parallel_tree": self.n_estimators,
"booster_type": self.booster,
Expand All @@ -140,6 +155,13 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"xgboost_version": self.xgboost_version,
}

if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand Down Expand Up @@ -227,6 +249,15 @@ def __init__(
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.n_estimators = n_estimators
self.booster = booster
Expand All @@ -246,6 +277,9 @@ def __init__(
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -264,11 +298,11 @@ def _from_bq(
return model

@property
def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
return {
options = {
"model_type": "BOOSTED_TREE_CLASSIFIER",
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"early_stop": True,
"num_parallel_tree": self.n_estimators,
"booster_type": self.booster,
Expand All @@ -289,6 +323,13 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"xgboost_version": self.xgboost_version,
}

if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand Down Expand Up @@ -370,6 +411,15 @@ def __init__(
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.n_estimators = n_estimators
self.tree_method = tree_method
Expand All @@ -385,6 +435,9 @@ def __init__(
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -403,9 +456,9 @@ def _from_bq(
return model

@property
def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
return {
options = {
"model_type": "RANDOM_FOREST_REGRESSOR",
"early_stop": True,
"num_parallel_tree": self.n_estimators,
Expand All @@ -420,11 +473,18 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"l1_reg": self.reg_alpha,
"l2_reg": self.reg_lambda,
"min_rel_progress": self.tol,
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"enable_global_explain": self.enable_global_explain,
"xgboost_version": self.xgboost_version,
}

if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand Down Expand Up @@ -526,6 +586,15 @@ def __init__(
tol: float = 0.01,
enable_global_explain: bool = False,
xgboost_version: Literal["0.9", "1.1"] = "0.9",
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.n_estimators = n_estimators
self.tree_method = tree_method
Expand All @@ -541,6 +610,9 @@ def __init__(
self.tol = tol
self.enable_global_explain = enable_global_explain
self.xgboost_version = xgboost_version
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -559,9 +631,9 @@ def _from_bq(
return model

@property
def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
return {
options = {
"model_type": "RANDOM_FOREST_CLASSIFIER",
"early_stop": True,
"num_parallel_tree": self.n_estimators,
Expand All @@ -576,11 +648,18 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]:
"l1_reg": self.reg_alpha,
"l2_reg": self.reg_lambda,
"min_rel_progress": self.tol,
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"enable_global_explain": self.enable_global_explain,
"xgboost_version": self.xgboost_version,
}

if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

def _fit(
self,
X: Union[bpd.DataFrame, bpd.Series],
Expand Down
39 changes: 37 additions & 2 deletions bigframes/ml/linear_model.py
"auto_split",
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
"warm_start": "warmStart",
"calculate_p_values": "calculatePValues",
"enable_global_explain": "enableGlobalExplain",
"data_split_method": "dataSplitMethod",
"data_split_eval_fraction": "dataSplitEvalFraction",
"data_split_col": "dataSplitColumn",
}


Expand Down Expand Up @@ -69,6 +72,15 @@ def __init__(
ls_init_learning_rate: Optional[float] = None,
calculate_p_values: bool = False,
enable_global_explain: bool = False,
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.optimize_strategy = optimize_strategy
self.fit_intercept = fit_intercept
Expand All @@ -82,6 +94,9 @@ def __init__(
self.ls_init_learning_rate = ls_init_learning_rate
self.calculate_p_values = calculate_p_values
self.enable_global_explain = enable_global_explain
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()

Expand All @@ -104,7 +119,7 @@ def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
options = {
"model_type": "LINEAR_REG",
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"optimize_strategy": self.optimize_strategy,
"fit_intercept": self.fit_intercept,
"l2_reg": self.l2_reg,
Expand All @@ -123,6 +138,10 @@ def _bqml_options(self) -> dict:
# Even presenting warm_start returns error for NORMAL_EQUATION optimizer
if self.warm_start:
options["warm_start"] = self.warm_start
if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

Expand Down Expand Up @@ -209,6 +228,15 @@ def __init__(
calculate_p_values: bool = False,
enable_global_explain: bool = False,
class_weight: Optional[Union[Literal["balanced"], Dict[str, float]]] = None,
data_split_method: Literal[
"auto_split",
"random",
"custom",
"seq",
"no_split",
] = "no_split",
data_split_eval_fraction: Optional[float] = None,
data_split_col: Optional[str] = None,
):
self.optimize_strategy = optimize_strategy
self.fit_intercept = fit_intercept
Expand All @@ -223,6 +251,9 @@ def __init__(
self.calculate_p_values = calculate_p_values
self.enable_global_explain = enable_global_explain
self.class_weight = class_weight
self.data_split_method = data_split_method
self.data_split_eval_fraction = data_split_eval_fraction
self.data_split_col = data_split_col
self._auto_class_weight = class_weight == "balanced"
self._bqml_model: Optional[core.BqmlModel] = None
self._bqml_model_factory = globals.bqml_model_factory()
Expand Down Expand Up @@ -253,7 +284,7 @@ def _bqml_options(self) -> dict:
"""The model options as they will be set for BQML"""
options = {
"model_type": "LOGISTIC_REG",
"data_split_method": "NO_SPLIT",
"data_split_method": self.data_split_method,
"fit_intercept": self.fit_intercept,
"auto_class_weights": self._auto_class_weight,
"optimize_strategy": self.optimize_strategy,
Expand All @@ -275,6 +306,10 @@ def _bqml_options(self) -> dict:
# Even presenting warm_start returns error for NORMAL_EQUATION optimizer
if self.warm_start:
options["warm_start"] = self.warm_start
if self.data_split_eval_fraction is not None:
options["data_split_eval_fraction"] = self.data_split_eval_fraction
if self.data_split_col is not None:
options["data_split_col"] = self.data_split_col

return options

Expand Down
Loading
0