From ce61a068f685d977b7414493d7aac289e1220b69 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Sat, 21 Dec 2024 00:54:32 +0000 Subject: [PATCH 1/2] docs: add ml.model_selection examples --- bigframes/ml/model_selection.py | 2 + .../sklearn/model_selection/_split.py | 109 ++++++++++++++++++ .../sklearn/model_selection/_validation.py | 17 +++ 3 files changed, 128 insertions(+) diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 9aeacef35e..abb4b0f26c 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -115,6 +115,8 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra @log_adapter.class_logger class KFold(vendored_model_selection_split.KFold): + __doc__ = inspect.getdoc(vendored_model_selection_split.KFold) + def __init__(self, n_splits: int = 5, *, random_state: Union[int, None] = None): if n_splits < 2: raise ValueError(f"n_splits must be at least 2. Got {n_splits}") diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_split.py b/third_party/bigframes_vendored/sklearn/model_selection/_split.py index 280962473e..ec16fa8cf9 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_split.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_split.py @@ -65,6 +65,80 @@ class KFold(_BaseKFold): Each fold is then used once as a validation while the k - 1 remaining folds form the training set. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> from bigframes.ml.model_selection import KFold + >>> bpd.options.display.progress_bar = None + >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) + >>> y = bpd.DataFrame({"label": [1, 2, 3]}) + >>> kf = KFold(n_splits=3, random_state=42) + >>> for i, (X_train, X_test, y_train, y_test) in enumerate(kf.split(X, y)): + ... print(f"Fold {i}:") + ... print(f" X_train: {X_train}") + ... print(f" X_test: {X_test}") + ... print(f" y_train: {y_train}") + ... print(f" y_test: {y_test}") + ... + Fold 0: + X_train: feat0 feat1 + 1 3 4 + 2 5 6 + + [2 rows x 2 columns] + X_test: feat0 feat1 + 0 1 2 + + [1 rows x 2 columns] + y_train: label + 1 2 + 2 3 + + [2 rows x 1 columns] + y_test: label + 0 1 + + [1 rows x 1 columns] + Fold 1: + X_train: feat0 feat1 + 0 1 2 + 2 5 6 + + [2 rows x 2 columns] + X_test: feat0 feat1 + 1 3 4 + + [1 rows x 2 columns] + y_train: label + 0 1 + 2 3 + + [2 rows x 1 columns] + y_test: label + 1 2 + + [1 rows x 1 columns] + Fold 2: + X_train: feat0 feat1 + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + X_test: feat0 feat1 + 2 5 6 + + [1 rows x 2 columns] + y_train: label + 0 1 + 1 2 + + [2 rows x 1 columns] + y_test: label + 2 3 + + [1 rows x 1 columns] + + Args: n_splits (int): Number of folds. Must be at least 2. Default to 5. @@ -84,6 +158,41 @@ def train_test_split( ): """Splits dataframes or series into random train and test subsets. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> from bigframes.ml.model_selection import train_test_split + >>> bpd.options.display.progress_bar = None + >>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]}) + >>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]}) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) + >>> X_train + feat0 feat1 + 0 0 1 + 1 2 3 + 4 8 9 + + [3 rows x 2 columns] + >>> y_train + label + 0 0 + 1 1 + 4 4 + + [3 rows x 1 columns] + >>> X_test + feat0 feat1 + 2 4 5 + 3 6 7 + + [2 rows x 2 columns] + >>> y_test + label + 2 2 + 3 3 + + [2 rows x 1 columns] + Args: *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series): A sequence of BigQuery DataFrames or Series that can be joined on diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py index 43e155da7d..bc4bf90c8a 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py @@ -14,6 +14,23 @@ def cross_validate(estimator, X, y=None, *, cv=None): """Evaluate metric(s) by cross-validation and also record fit/score times. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> from bigframes.ml.model_selection import cross_validate, KFold + >>> from bigframes.ml.linear_model import LinearRegression + >>> bpd.options.display.progress_bar = None + >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) + >>> y = bpd.DataFrame({"label": [1, 2, 3]}) + >>> model = LinearRegression() + >>> scores = cross_validate(model, X, y, cv=KFold(n_splits=3, random_state=42)) + >>> for score in scores["test_score"]: + ... print(score["mean_squared_error"][0]) + ... + 5.218167286047954e-19 + 2.726229944928669e-18 + 1.6197635612324266e-17 + Args: estimator: bigframes.ml model that implements fit(). From 0b4db2421c912dfd1373dbea1840419a082671e6 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Mon, 23 Dec 2024 19:28:21 +0000 Subject: [PATCH 2/2] fix --- .../bigframes_vendored/sklearn/model_selection/_validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py index bc4bf90c8a..b93c47ea04 100644 --- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py +++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py @@ -23,8 +23,8 @@ def cross_validate(estimator, X, y=None, *, cv=None): >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]}) >>> y = bpd.DataFrame({"label": [1, 2, 3]}) >>> model = LinearRegression() - >>> scores = cross_validate(model, X, y, cv=KFold(n_splits=3, random_state=42)) - >>> for score in scores["test_score"]: + >>> scores = cross_validate(model, X, y, cv=3) # doctest: +SKIP + >>> for score in scores["test_score"]: # doctest: +SKIP ... print(score["mean_squared_error"][0]) ... 5.218167286047954e-19