From ce61a068f685d977b7414493d7aac289e1220b69 Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Sat, 21 Dec 2024 00:54:32 +0000
Subject: [PATCH 1/2] docs: add ml.model_selection examples

---
 bigframes/ml/model_selection.py               |   2 +
 .../sklearn/model_selection/_split.py         | 109 ++++++++++++++++++
 .../sklearn/model_selection/_validation.py    |  17 +++
 3 files changed, 128 insertions(+)
diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py
index 9aeacef35e..abb4b0f26c 100644
--- a/bigframes/ml/model_selection.py
+++ b/bigframes/ml/model_selection.py
@@ -115,6 +115,8 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
 
 @log_adapter.class_logger
 class KFold(vendored_model_selection_split.KFold):
+    __doc__ = inspect.getdoc(vendored_model_selection_split.KFold)
+
     def __init__(self, n_splits: int = 5, *, random_state: Union[int, None] = None):
         if n_splits < 2:
             raise ValueError(f"n_splits must be at least 2. Got {n_splits}")
diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_split.py b/third_party/bigframes_vendored/sklearn/model_selection/_split.py
index 280962473e..ec16fa8cf9 100644
--- a/third_party/bigframes_vendored/sklearn/model_selection/_split.py
+++ b/third_party/bigframes_vendored/sklearn/model_selection/_split.py
@@ -65,6 +65,80 @@ class KFold(_BaseKFold):
     Each fold is then used once as a validation while the k - 1 remaining
     folds form the training set.
 
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> from bigframes.ml.model_selection import KFold
+        >>> bpd.options.display.progress_bar = None
+        >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]})
+        >>> y = bpd.DataFrame({"label": [1, 2, 3]})
+        >>> kf = KFold(n_splits=3, random_state=42)
+        >>> for i, (X_train, X_test, y_train, y_test) in enumerate(kf.split(X, y)):
+        ...     print(f"Fold {i}:")
+        ...     print(f"  X_train: {X_train}")
+        ...     print(f"  X_test: {X_test}")
+        ...     print(f"  y_train: {y_train}")
+        ...     print(f"  y_test: {y_test}")
+        ...
+        Fold 0:
+          X_train:    feat0  feat1
+        1      3      4
+        2      5      6
+        <BLANKLINE>
+        [2 rows x 2 columns]
+          X_test:    feat0  feat1
+        0      1      2
+        <BLANKLINE>
+        [1 rows x 2 columns]
+          y_train:    label
+        1      2
+        2      3
+        <BLANKLINE>
+        [2 rows x 1 columns]
+          y_test:    label
+        0      1
+        <BLANKLINE>
+        [1 rows x 1 columns]
+        Fold 1:
+          X_train:    feat0  feat1
+        0      1      2
+        2      5      6
+        <BLANKLINE>
+        [2 rows x 2 columns]
+          X_test:    feat0  feat1
+        1      3      4
+        <BLANKLINE>
+        [1 rows x 2 columns]
+          y_train:    label
+        0      1
+        2      3
+        <BLANKLINE>
+        [2 rows x 1 columns]
+          y_test:    label
+        1      2
+        <BLANKLINE>
+        [1 rows x 1 columns]
+        Fold 2:
+          X_train:    feat0  feat1
+        0      1      2
+        1      3      4
+        <BLANKLINE>
+        [2 rows x 2 columns]
+          X_test:    feat0  feat1
+        2      5      6
+        <BLANKLINE>
+        [1 rows x 2 columns]
+          y_train:    label
+        0      1
+        1      2
+        <BLANKLINE>
+        [2 rows x 1 columns]
+          y_test:    label
+        2      3
+        <BLANKLINE>
+        [1 rows x 1 columns]
+
+
     Args:
         n_splits (int):
             Number of folds. Must be at least 2. Default to 5.
@@ -84,6 +158,41 @@ def train_test_split(
 ):
     """Splits dataframes or series into random train and test subsets.
 
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> from bigframes.ml.model_selection import train_test_split
+        >>> bpd.options.display.progress_bar = None
+        >>> X = bpd.DataFrame({"feat0": [0, 2, 4, 6, 8], "feat1": [1, 3, 5, 7, 9]})
+        >>> y = bpd.DataFrame({"label": [0, 1, 2, 3, 4]})
+        >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
+        >>> X_train
+           feat0  feat1
+        0      0      1
+        1      2      3
+        4      8      9
+        <BLANKLINE>
+        [3 rows x 2 columns]
+        >>> y_train
+           label
+        0      0
+        1      1
+        4      4
+        <BLANKLINE>
+        [3 rows x 1 columns]
+        >>> X_test
+           feat0  feat1
+        2      4      5
+        3      6      7
+        <BLANKLINE>
+        [2 rows x 2 columns]
+        >>> y_test
+           label
+        2      2
+        3      3
+        <BLANKLINE>
+        [2 rows x 1 columns]
+
     Args:
         *arrays (bigframes.dataframe.DataFrame or bigframes.series.Series):
             A sequence of BigQuery DataFrames or Series that can be joined on
diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py
index 43e155da7d..bc4bf90c8a 100644
--- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py
+++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py
@@ -14,6 +14,23 @@
 def cross_validate(estimator, X, y=None, *, cv=None):
     """Evaluate metric(s) by cross-validation and also record fit/score times.
 
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> from bigframes.ml.model_selection import cross_validate, KFold
+        >>> from bigframes.ml.linear_model import LinearRegression
+        >>> bpd.options.display.progress_bar = None
+        >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]})
+        >>> y = bpd.DataFrame({"label": [1, 2, 3]})
+        >>> model = LinearRegression()
+        >>> scores = cross_validate(model, X, y, cv=KFold(n_splits=3, random_state=42))
+        >>> for score in scores["test_score"]:
+        ...   print(score["mean_squared_error"][0])
+        ...
+        5.218167286047954e-19
+        2.726229944928669e-18
+        1.6197635612324266e-17
+
     Args:
         estimator:
             bigframes.ml model that implements fit().

From 0b4db2421c912dfd1373dbea1840419a082671e6 Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Mon, 23 Dec 2024 19:28:21 +0000
Subject: [PATCH 2/2] docs: skip non-deterministic cross_validate doctest and use cv=3

---
 .../bigframes_vendored/sklearn/model_selection/_validation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py
index bc4bf90c8a..b93c47ea04 100644
--- a/third_party/bigframes_vendored/sklearn/model_selection/_validation.py
+++ b/third_party/bigframes_vendored/sklearn/model_selection/_validation.py
@@ -23,8 +23,8 @@ def cross_validate(estimator, X, y=None, *, cv=None):
         >>> X = bpd.DataFrame({"feat0": [1, 3, 5], "feat1": [2, 4, 6]})
         >>> y = bpd.DataFrame({"label": [1, 2, 3]})
         >>> model = LinearRegression()
-        >>> scores = cross_validate(model, X, y, cv=KFold(n_splits=3, random_state=42))
-        >>> for score in scores["test_score"]:
+        >>> scores = cross_validate(model, X, y, cv=3) # doctest: +SKIP
+        >>> for score in scores["test_score"]: # doctest: +SKIP
         ...   print(score["mean_squared_error"][0])
         ...
         5.218167286047954e-19