From c8b596045fbd20db66fbfa36aba06093c86a52bd Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Tue, 23 Apr 2024 17:03:12 +0000
Subject: [PATCH 1/4] feat: support the score method for PaLM2TextGenerator

---
 bigframes/ml/core.py                          | 11 ++++
 bigframes/ml/ensemble.py                      |  2 +-
 bigframes/ml/llm.py                           | 58 +++++++++++++++++++
 bigframes/ml/sql.py                           | 14 +++++
 tests/system/load/test_llm.py                 | 31 +++++++---
 tests/unit/ml/test_sql.py                     | 14 +++++
 .../sklearn/ensemble/_forest.py               |  4 +-
 .../bigframes_vendored/xgboost/sklearn.py     |  4 +-
 8 files changed, 126 insertions(+), 12 deletions(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index b94ae39687..dde6ea42f3 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -187,6 +187,17 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
 
         return self._session.read_gbq(sql)
 
+    def llm_evaluate(
+        self,
+        input_data: Optional[bpd.DataFrame] = None,
+        task_type: Optional[str] = None,
+    ):
+        sql = self._model_manipulation_sql_generator.ml_llm_evaluate(
+            input_data, task_type
+        )
+
+        return self._session.read_gbq(sql)
+
     def arima_evaluate(self, show_all_candidate_models: bool = False):
         sql = self._model_manipulation_sql_generator.ml_arima_evaluate(
             show_all_candidate_models
diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py
index b248c295f4..6849dac99a 100644
--- a/bigframes/ml/ensemble.py
+++ b/bigframes/ml/ensemble.py
@@ -472,7 +472,7 @@ def predict(
     def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
-        y: Union[bpd.DataFrame, bpd.Series],
+        y=None,  # ignored
     ):
         """Calculate evaluation metrics of the model.
 
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 37a38cdd5c..03a5240a18 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -310,6 +310,64 @@ def predict(
 
         return df
 
+    def score(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y=None,  # ignored
+        task_type: Literal[
+            "TEXT_GENERATION", "CLASSIFICATION", "SUMMARIZATION", "QUESTION_ANSWERING"
+        ] = "TEXT_GENERATION",
+    ) -> bpd.DataFrame:
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
+            Service Specific Terms (https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
+            and might have limited support. For more information, see the launch stage descriptions
+            (https://cloud.google.com/products#product-launch-stages).
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm
+            for the outputs relevant to this model type.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation data. X must have a column named
+                ``input_text`` that contains the prompt text to use when evaluating the model.
+                X must also have a column named ``output_text`` that contains the generated
+                text that you would expect to be returned by the model.
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation labels.
+            task_type (Optional[str]):
+                The type of the task for LLM model. Default to "TEXT_GENERATION".
+                Possible values: "TEXT_GENERATION", "CLASSIFICATION", "SUMMARIZATION", and
+                "QUESTION_ANSWERING".
+
+        Returns:
+            bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        columns = X.columns.to_list()
+        if "input_text" not in columns:
+            raise ValueError(
+                """Must contain a column named input_text that contains the prompt
+            text to use when evaluating the model."""
+            )
+        if "output_text" not in columns:
+            raise ValueError(
+                """Must contain a column named output_text that contains the generated
+            text that you would expect to be returned by the model."""
+            )
+        (X,) = utils.convert_to_dataframe(X)
+        refined_X = X[["input_text", "output_text"]].copy()
+
+        return self._bqml_model.llm_evaluate(refined_X, task_type)
+
     def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator:
         """Save the model to BigQuery.
 
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index 59c768ce81..205139a89e 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -318,6 +318,20 @@ def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str:
         return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`,
   ({source_sql}))"""
 
+    # ML evaluation TVFs
+    def ml_llm_evaluate(
+        self, source_df: Optional[bpd.DataFrame] = None, task_type: Optional[str] = None
+    ) -> str:
+        """Encode ML.EVALUATE for BQML"""
+        if source_df is None:
+            source_sql = None
+        else:
+            # Note: don't need index as evaluate returns a new table
+            source_sql, _, _ = source_df._to_sql_query(include_index=False)
+
+        return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`,
+  ({source_sql}), STRUCT("{task_type}" AS task_type))"""
+
     # ML evaluation TVFs
     def ml_arima_evaluate(self, show_all_candidate_models: bool = False) -> str:
         """Encode ML.ARMIA_EVALUATE for BQML"""
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index d56f6100c1..23b3e791d4 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -22,13 +22,12 @@
 def llm_fine_tune_df_default_index(
     session: bigframes.Session,
 ) -> bigframes.dataframe.DataFrame:
-    sql = """
-SELECT
-  CONCAT("Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: ", text) as prompt,
-  CAST(label AS STRING) as label
-FROM `llm_tuning.emotion_classification_train`
-"""
-    return session.read_gbq(sql)
+    training_table_name = "llm_tuning.emotion_classification_train"
+    df = session.read_gbq(training_table_name)
+    prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
+    df["prompt"] = prefix + df["text"]
+    df["label"] = df["label"].astype("string")
+    return df
 
 
 @pytest.fixture(scope="session")
@@ -69,3 +68,21 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_
     assert all(series.str.len() == 1)
 
     # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept
+
+
+def test_llm_palm_score(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.PaLM2TextGenerator(model_name="text-bison")
+    eval_df = llm_fine_tune_df_default_index.rename(
+        columns={"prompt": "input_text", "label": "output_text"}
+    )
+    # Check score to ensure the model was fitted
+    score_result = model.score(eval_df).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "bleu4_score",
+        "rouge-l_precision",
+        "rouge-l_recall",
+        "rouge-l_f1_score",
+        "evaluation_status",
+    ]
+    assert all(col in score_result_col for col in expected_col)
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 3560f05cb6..1a5e8fe962 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -319,6 +319,20 @@ def test_ml_predict_correct(
     )
 
 
+def test_ml_llm_evaluate_correct(
+    model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
+    mock_df: bpd.DataFrame,
+):
+    sql = model_manipulation_sql_generator.ml_llm_evaluate(
+        source_df=mock_df, task_type="CLASSIFICATION"
+    )
+    assert (
+        sql
+        == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`,
+  (input_X_sql), STRUCT("CLASSIFICATION" AS task_type))"""
+    )
+
+
 def test_ml_evaluate_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,
diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
index 53a211dd7f..a55b7b80d3 100644
--- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
+++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
@@ -95,7 +95,7 @@ class RandomForestRegressor(ForestRegressor):
             Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2.
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.
@@ -160,7 +160,7 @@ class RandomForestClassifier(ForestClassifier):
             Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2.
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
            Minimum sum of instance weight(hessian) needed in a child. Default to 1.
diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py
index 424b17a371..5a2a69dff4 100644
--- a/third_party/bigframes_vendored/xgboost/sklearn.py
+++ b/third_party/bigframes_vendored/xgboost/sklearn.py
@@ -63,7 +63,7 @@ class XGBRegressor(XGBModel, XGBRegressorBase):
             Type of normalization algorithm for DART booster. Possible values: "TREE", "FOREST". Default to "TREE".
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.
@@ -110,7 +110,7 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
             Type of normalization algorithm for DART booster. Possible values: "TREE", "FOREST". Default to "TREE".
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.

From 58ded5db497edf23ac9f5f30fe8b2d56c6ad7230 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Tue, 23 Apr 2024 22:26:23 +0000
Subject: [PATCH 2/4] address comments

---
 bigframes/ml/ensemble.py      |  2 +-
 bigframes/ml/llm.py           | 34 +++++++++++++++++++---------------
 tests/system/load/test_llm.py | 28 +++++++++++++++++++++++++++-
 3 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py
index 6849dac99a..b248c295f4 100644
--- a/bigframes/ml/ensemble.py
+++ b/bigframes/ml/ensemble.py
@@ -472,7 +472,7 @@ def predict(
     def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
-        y=None,  # ignored
+        y: Union[bpd.DataFrame, bpd.Series],
     ):
         """Calculate evaluation metrics of the model.
 
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 03a5240a18..f5729805ed 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -313,10 +313,10 @@ def predict(
 
     def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
-        y=None,  # ignored
+        y: Union[bpd.DataFrame, bpd.Series],
         task_type: Literal[
-            "TEXT_GENERATION", "CLASSIFICATION", "SUMMARIZATION", "QUESTION_ANSWERING"
-        ] = "TEXT_GENERATION",
+            "text_generation", "classification", "summarization", "question_answering"
+        ] = "text_generation",
     ) -> bpd.DataFrame:
         """Calculate evaluation metrics of the model.
@@ -337,14 +337,12 @@ def score(
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 A BigQuery DataFrame as evaluation data. X must have a column named
                 ``input_text`` that contains the prompt text to use when evaluating the model.
-                X must also have a column named ``output_text`` that contains the generated
-                text that you would expect to be returned by the model.
             y (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                A BigQuery DataFrame as evaluation labels.
+                A BigQuery DataFrame as evaluation labels. y must also have a column named ``output_text`` that contains the generated
+                text that you would expect to be returned by the model.
             task_type (Optional[str]):
-                The type of the task for LLM model. Default to "TEXT_GENERATION".
-                Possible values: "TEXT_GENERATION", "CLASSIFICATION", "SUMMARIZATION", and
-                "QUESTION_ANSWERING".
+                The type of the task for LLM model. Default to "text_generation".
+                Possible values: "text_generation", "classification", "summarization", and "question_answering".
 
         Returns:
             bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
@@ -352,21 +350,27 @@ def score(
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before score")
 
-        columns = X.columns.to_list()
-        if "input_text" not in columns:
+        X, y = utils.convert_to_dataframe(X, y)
+
+        X_columns = X.columns.to_list()
+        y_columns = y.columns.to_list()
+        if "input_text" not in X_columns:
             raise ValueError(
                 """Must contain a column named input_text that contains the prompt
             text to use when evaluating the model."""
             )
-        if "output_text" not in columns:
+        if "output_text" not in y_columns:
             raise ValueError(
                 """Must contain a column named output_text that contains the generated
             text that you would expect to be returned by the model."""
             )
-        (X,) = utils.convert_to_dataframe(X)
-        refined_X = X[["input_text", "output_text"]].copy()
+
+        input_data = (
+            X.join(y, how="outer") if (X is not None) and (y is not None) else None
+        )
+        refined_data = input_data[["input_text", "output_text"]].copy()
 
-        return self._bqml_model.llm_evaluate(refined_X, task_type)
+        return self._bqml_model.llm_evaluate(refined_data, task_type)
 
     def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator:
         """Save the model to BigQuery.
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index 23b3e791d4..6d548a2e78 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -76,7 +76,9 @@ def test_llm_palm_score(llm_fine_tune_df_default_index):
         columns={"prompt": "input_text", "label": "output_text"}
     )
     # Check score to ensure the model was fitted
-    score_result = model.score(eval_df).to_pandas()
+    score_result = model.score(
+        X=eval_df[["input_text"]], y=eval_df[["output_text"]]
+    ).to_pandas()
     score_result_col = score_result.columns.to_list()
     expected_col = [
         "bleu4_score",
@@ -86,3 +88,27 @@ def test_llm_palm_score(llm_fine_tune_df_default_index):
         "evaluation_status",
     ]
     assert all(col in score_result_col for col in expected_col)
+
+
+def test_llm_palm_score_params(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.PaLM2TextGenerator(
+        model_name="text-bison", max_iterations=1
+    )
+    eval_df = llm_fine_tune_df_default_index.rename(
+        columns={"prompt": "input_text", "label": "output_text"}
+    )
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=eval_df["input_text"], y=eval_df["output_text"], task_type="classification"
+    ).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "trial_id",
+        "precision",
+        "recall",
+        "accuracy",
+        "f1_score",
+        "log_loss",
+        "roc_auc",
+    ]
+    assert all(col in score_result_col for col in expected_col)

From 562cf8dec3e6151cfb801e3c3283deb0b9bc46b6 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Wed, 24 Apr 2024 22:23:47 +0000
Subject: [PATCH 3/4] address additional comments

---
 bigframes/ml/core.py          |  2 +-
 bigframes/ml/llm.py           | 35 +++++++++++++++--------------------
 bigframes/ml/sql.py           | 10 +++-------
 tests/system/load/test_llm.py | 15 +++++++--------
 4 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index dde6ea42f3..12c881c19a 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -189,7 +189,7 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
 
     def llm_evaluate(
         self,
-        input_data: Optional[bpd.DataFrame] = None,
+        input_data: bpd.DataFrame,
         task_type: Optional[str] = None,
     ):
         sql = self._model_manipulation_sql_generator.ml_llm_evaluate(
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index f5729805ed..d14445866d 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -220,7 +220,7 @@ def predict(
 
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                Input DataFrame or Series, which needs to contain a column with name "prompt". Only the column will be used as input.
+                Input DataFrame or Series, which contains only one column of prompts.
                 Prompts can include preamble, questions, suggestions, instructions, or examples.
 
             temperature (float, default 0.0):
@@ -335,11 +335,11 @@ def score(
 
         Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                A BigQuery DataFrame as evaluation data. X must have a column named
-                ``input_text`` that contains the prompt text to use when evaluating the model.
+                A BigQuery DataFrame as evaluation data, which contains only one column of input_text
+                that contains the prompt text to use when evaluating the model.
             y (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                A BigQuery DataFrame as evaluation labels. y must also have a column named ``output_text`` that contains the generated
-                text that you would expect to be returned by the model.
+                A BigQuery DataFrame as evaluation labels, which contains only one column of output_text
+                that you would expect to be returned by the model.
             task_type (Optional[str]):
                 The type of the task for LLM model. Default to "text_generation".
                 Possible values: "text_generation", "classification", "summarization", and "question_answering".
@@ -352,25 +352,20 @@ def score(
 
         X, y = utils.convert_to_dataframe(X, y)
 
-        X_columns = X.columns.to_list()
-        y_columns = y.columns.to_list()
-        if "input_text" not in X_columns:
-            raise ValueError(
-                """Must contain a column named input_text that contains the prompt
-            text to use when evaluating the model."""
-            )
-        if "output_text" not in y_columns:
+        if len(X.columns) != 1 or len(y.columns) != 1:
             raise ValueError(
-                """Must contain a column named output_text that contains the generated
-            text that you would expect to be returned by the model."""
+                f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}"
             )
 
-        input_data = (
-            X.join(y, how="outer") if (X is not None) and (y is not None) else None
-        )
-        refined_data = input_data[["input_text", "output_text"]].copy()
+        # BQML identifies the column by name
+        X_col_label = cast(blocks.Label, X.columns[0])
+        y_col_label = cast(blocks.Label, y.columns[0])
+        X = X.rename(columns={X_col_label: "input_text"})
+        y = y.rename(columns={y_col_label: "output_text"})
+
+        input_data = X.join(y, how="outer")
 
-        return self._bqml_model.llm_evaluate(refined_data, task_type)
+        return self._bqml_model.llm_evaluate(input_data, task_type)
 
     def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator:
         """Save the model to BigQuery.
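
[Illustrative usage sketch, not part of the patch: after this commit, score()
accepts any single-column X and y and renames them internally to the
input_text/output_text columns that BQML's ML.EVALUATE expects. The table and
column names below are made up.]

    import bigframes.pandas as bpd
    from bigframes.ml import llm

    # Hypothetical evaluation table with one prompt column and one label column.
    df = bpd.read_gbq("my_project.my_dataset.eval_examples")  # placeholder table

    model = llm.PaLM2TextGenerator(model_name="text-bison")
    metrics = model.score(
        X=df[["question"]],     # prompts; any single column label works
        y=df[["gold_answer"]],  # expected outputs; any single column label works
        task_type="classification",
    ).to_pandas()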
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 205139a89e..3679be16c6 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -320,15 +320,11 @@ def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str: # ML evaluation TVFs def ml_llm_evaluate( - self, source_df: Optional[bpd.DataFrame] = None, task_type: Optional[str] = None + self, source_df: bpd.DataFrame, task_type: Optional[str] = None ) -> str: """Encode ML.EVALUATE for BQML""" - if source_df is None: - source_sql = None - else: - # Note: don't need index as evaluate returns a new table - source_sql, _, _ = source_df._to_sql_query(include_index=False) - + # Note: don't need index as evaluate returns a new table + source_sql, _, _ = source_df._to_sql_query(include_index=False) return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`, ({source_sql}), STRUCT("{task_type}" AS task_type))""" diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 6d548a2e78..835b31955e 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -72,12 +72,11 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_ def test_llm_palm_score(llm_fine_tune_df_default_index): model = bigframes.ml.llm.PaLM2TextGenerator(model_name="text-bison") - eval_df = llm_fine_tune_df_default_index.rename( - columns={"prompt": "input_text", "label": "output_text"} - ) + # Check score to ensure the model was fitted score_result = model.score( - X=eval_df[["input_text"]], y=eval_df[["output_text"]] + X=llm_fine_tune_df_default_index[["prompt"]], + y=llm_fine_tune_df_default_index[["label"]], ).to_pandas() score_result_col = score_result.columns.to_list() expected_col = [ @@ -94,12 +93,12 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index): model = bigframes.ml.llm.PaLM2TextGenerator( model_name="text-bison", max_iterations=1 ) - eval_df = llm_fine_tune_df_default_index.rename( - columns={"prompt": "input_text", "label": "output_text"} - ) + # Check score to ensure the model was fitted score_result = model.score( - X=eval_df["input_text"], y=eval_df["output_text"], task_type="classification" + X=llm_fine_tune_df_default_index["prompt"], + y=llm_fine_tune_df_default_index["label"], + task_type="classification", ).to_pandas() score_result_col = score_result.columns.to_list() expected_col = [ From 3e65eab677a0f165649675cb3e7474ada7911f60 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Wed, 24 Apr 2024 23:23:03 +0000 Subject: [PATCH 4/4] address minor comments --- bigframes/ml/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index d14445866d..4a58152d14 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -340,7 +340,7 @@ def score( y (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation labels, which contains only one column of output_text that you would expect to be returned by the model. - task_type (Optional[str]): + task_type (str): The type of the task for LLM model. Default to "text_generation". Possible values: "text_generation", "classification", "summarization", and "question_answering".
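
[Illustrative note, not part of the patch: a sketch of the query shape that the
final ml_llm_evaluate generator emits, using placeholder names that mirror the
unit test expectation above. In the generator itself the subquery comes from
source_df._to_sql_query(include_index=False).]

    # Placeholder values, for illustration only.
    model_name = "my_project_id.my_dataset_id.my_model_id"
    source_sql = "SELECT input_text, output_text FROM eval_data"
    task_type = "classification"

    sql = f"""SELECT * FROM ML.EVALUATE(MODEL `{model_name}`,
      ({source_sql}), STRUCT("{task_type}" AS task_type))"""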