From c8b596045fbd20db66fbfa36aba06093c86a52bd Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Tue, 23 Apr 2024 17:03:12 +0000
Subject: [PATCH 1/4] feat: support the score method for PaLM2TextGenerator

---
 bigframes/ml/core.py                          | 11 ++++
 bigframes/ml/ensemble.py                      |  2 +-
 bigframes/ml/llm.py                           | 58 +++++++++++++++++++
 bigframes/ml/sql.py                           | 14 +++++
 tests/system/load/test_llm.py                 | 31 +++++++---
 tests/unit/ml/test_sql.py                     | 14 +++++
 .../sklearn/ensemble/_forest.py               |  4 +-
 .../bigframes_vendored/xgboost/sklearn.py     |  4 +-
 8 files changed, 126 insertions(+), 12 deletions(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index b94ae39687..dde6ea42f3 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -187,6 +187,17 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
 
         return self._session.read_gbq(sql)
 
+    def llm_evaluate(
+        self,
+        input_data: Optional[bpd.DataFrame] = None,
+        task_type: Optional[str] = None,
+    ):
+        sql = self._model_manipulation_sql_generator.ml_llm_evaluate(
+            input_data, task_type
+        )
+
+        return self._session.read_gbq(sql)
+
     def arima_evaluate(self, show_all_candidate_models: bool = False):
         sql = self._model_manipulation_sql_generator.ml_arima_evaluate(
             show_all_candidate_models
diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py
index b248c295f4..6849dac99a 100644
--- a/bigframes/ml/ensemble.py
+++ b/bigframes/ml/ensemble.py
@@ -472,7 +472,7 @@ def predict(
     def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
-        y: Union[bpd.DataFrame, bpd.Series],
+        y=None,  # ignored
     ):
         """Calculate evaluation metrics of the model.
 
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 37a38cdd5c..03a5240a18 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -310,6 +310,64 @@ def predict(
 
         return df
 
+    def score(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y=None,  # ignored
+        task_type: Literal[
+            "TEXT_GENERATION", "CLASSIFICATION", "SUMMARIZATION", "QUESTION_ANSWERING"
+        ] = "TEXT_GENERATION",
+    ) -> bpd.DataFrame:
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
+            Service Specific Terms (https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
+            and might have limited support. For more information, see the launch stage descriptions
+            (https://cloud.google.com/products#product-launch-stages).
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm
+            for the outputs relevant to this model type.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation data. X must have a column named
+                ``input_text`` that contains the prompt text to use when evaluating the model.
+                X must also have a column named ``output_text`` that contains the generated
+                text that you would expect to be returned by the model.
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation labels.
+            task_type (Optional[str]):
+                The type of the task for LLM model. Default to "TEXT_GENERATION".
+                Possible values: "TEXT_GENERATION", "CLASSIFICATION", "SUMMARIZATION", and
+                "QUESTION_ANSWERING".
+
+        Returns:
+            bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        columns = X.columns.to_list()
+        if "input_text" not in columns:
+            raise ValueError(
+                """Must contain a column named input_text that contains the prompt
+            text to use when evaluating the model."""
+            )
+        if "output_text" not in columns:
+            raise ValueError(
+                """Must contain a column named output_text that contains the generated
+            text that you would expect to be returned by the model."""
+            )
+        (X,) = utils.convert_to_dataframe(X)
+        refined_X = X[["input_text", "output_text"]].copy()
+
+        return self._bqml_model.llm_evaluate(refined_X, task_type)
+
     def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator:
         """Save the model to BigQuery.
 
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index 59c768ce81..205139a89e 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -318,6 +318,20 @@ def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str:
         return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`,
   ({source_sql}))"""
 
+    # ML evaluation TVFs
+    def ml_llm_evaluate(
+        self, source_df: Optional[bpd.DataFrame] = None, task_type: Optional[str] = None
+    ) -> str:
+        """Encode ML.EVALUATE for BQML"""
+        if source_df is None:
+            source_sql = None
+        else:
+            # Note: don't need index as evaluate returns a new table
+            source_sql, _, _ = source_df._to_sql_query(include_index=False)
+
+        return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`,
+  ({source_sql}), STRUCT("{task_type}" AS task_type))"""
+
     # ML evaluation TVFs
     def ml_arima_evaluate(self, show_all_candidate_models: bool = False) -> str:
         """Encode ML.ARMIA_EVALUATE for BQML"""
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index d56f6100c1..23b3e791d4 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -22,13 +22,12 @@
 def llm_fine_tune_df_default_index(
     session: bigframes.Session,
 ) -> bigframes.dataframe.DataFrame:
-    sql = """
-SELECT
-  CONCAT("Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: ", text) as prompt,
-  CAST(label AS STRING) as label
-FROM `llm_tuning.emotion_classification_train`
-"""
-    return session.read_gbq(sql)
+    training_table_name = "llm_tuning.emotion_classification_train"
+    df = session.read_gbq(training_table_name)
+    prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
+    df["prompt"] = prefix + df["text"]
+    df["label"] = df["label"].astype("string")
+    return df
 
 
 @pytest.fixture(scope="session")
@@ -69,3 +68,21 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_
     assert all(series.str.len() == 1)
 
     # TODO(ashleyxu b/335492787): After bqml rolled out version control: save, load, check parameters to ensure configuration was kept
+
+
+def test_llm_palm_score(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.PaLM2TextGenerator(model_name="text-bison")
+    eval_df = llm_fine_tune_df_default_index.rename(
+        columns={"prompt": "input_text", "label": "output_text"}
+    )
+    # Check score to ensure the model was fitted
+    score_result = model.score(eval_df).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "bleu4_score",
+        "rouge-l_precision",
+        "rouge-l_recall",
+        "rouge-l_f1_score",
+        "evaluation_status",
+    ]
+    assert all(col in score_result_col for col in expected_col)
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 3560f05cb6..1a5e8fe962 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -319,6 +319,20 @@ def test_ml_predict_correct(
     )
 
 
+def test_ml_llm_evaluate_correct(
+    model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
+    mock_df: bpd.DataFrame,
+):
+    sql = model_manipulation_sql_generator.ml_llm_evaluate(
+        source_df=mock_df, task_type="CLASSIFICATION"
+    )
+    assert (
+        sql
+        == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`,
+  (input_X_sql), STRUCT("CLASSIFICATION" AS task_type))"""
+    )
+
+
 def test_ml_evaluate_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,
diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
index 53a211dd7f..a55b7b80d3 100644
--- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
+++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
@@ -95,7 +95,7 @@ class RandomForestRegressor(ForestRegressor):
             Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2.
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.
@@ -160,7 +160,7 @@ class RandomForestClassifier(ForestClassifier):
             Number of parallel trees constructed during each iteration. Default to 100. Minimum value is 2.
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
            Minimum sum of instance weight(hessian) needed in a child. Default to 1.
diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py
index 424b17a371..5a2a69dff4 100644
--- a/third_party/bigframes_vendored/xgboost/sklearn.py
+++ b/third_party/bigframes_vendored/xgboost/sklearn.py
@@ -63,7 +63,7 @@ class XGBRegressor(XGBModel, XGBRegressorBase):
             Type of normalization algorithm for DART booster. Possible values: "TREE", "FOREST". Default to "TREE".
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.
@@ -110,7 +110,7 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
             Type of normalization algorithm for DART booster. Possible values: "TREE", "FOREST". Default to "TREE".
         tree_method (Optional[str]):
             Specify which tree method to use. Default to "auto". If this parameter is set to
-            default, XGBoost will choose the most conservative option available. Possible values: ""exact", "approx",
+            default, XGBoost will choose the most conservative option available. Possible values: "exact", "approx",
             "hist".
         min_child_weight (Optional[float]):
             Minimum sum of instance weight(hessian) needed in a child. Default to 1.

From 58ded5db497edf23ac9f5f30fe8b2d56c6ad7230 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Tue, 23 Apr 2024 22:26:23 +0000
Subject: [PATCH 2/4] address comments

---
 bigframes/ml/ensemble.py      |  2 +-
 bigframes/ml/llm.py           | 34 +++++++++++++++++++---------------
 tests/system/load/test_llm.py | 28 +++++++++++++++++++++++++++-
 3 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py
index 6849dac99a..b248c295f4 100644
--- a/bigframes/ml/ensemble.py
+++ b/bigframes/ml/ensemble.py
@@ -472,7 +472,7 @@ def predict(
     def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
-        y=None,  # ignored
+        y: Union[bpd.DataFrame, bpd.Series],
     ):
         """Calculate evaluation metrics of the model.
 
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 03a5240a18..f5729805ed 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -313,10 +313,10 @@ def predict(
 
     def score(
         self,
         X: Union[bpd.DataFrame, bpd.Series],
-        y=None,  # ignored
+        y: Union[bpd.DataFrame, bpd.Series],
         task_type: Literal[
-            "TEXT_GENERATION", "CLASSIFICATION", "SUMMARIZATION", "QUESTION_ANSWERING"
-        ] = "TEXT_GENERATION",
+            "text_generation", "classification", "summarization", "question_answering"
+        ] = "text_generation",
     ) -> bpd.DataFrame:
         """Calculate evaluation metrics of the model.
@@ -337,14 +337,12 @@ def score(
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                 A BigQuery DataFrame as evaluation data. X must have a column named
                 ``input_text`` that contains the prompt text to use when evaluating the model.
-                X must also have a column named ``output_text`` that contains the generated
-                text that you would expect to be returned by the model.
             y (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                A BigQuery DataFrame as evaluation labels.
+                A BigQuery DataFrame as evaluation labels. y must also have a column named ``output_text`` that contains the generated
+                text that you would expect to be returned by the model.
             task_type (Optional[str]):
-                The type of the task for LLM model. Default to "TEXT_GENERATION".
-                Possible values: "TEXT_GENERATION", "CLASSIFICATION", "SUMMARIZATION", and
-                "QUESTION_ANSWERING".
+                The type of the task for LLM model. Default to "text_generation".
+                Possible values: "text_generation", "classification", "summarization", and "question_answering".
 
         Returns:
             bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
@@ -352,21 +350,27 @@ def score(
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before score")
 
-        columns = X.columns.to_list()
-        if "input_text" not in columns:
+        X, y = utils.convert_to_dataframe(X, y)
+
+        X_columns = X.columns.to_list()
+        y_columns = y.columns.to_list()
+        if "input_text" not in X_columns:
             raise ValueError(
                 """Must contain a column named input_text that contains the prompt
             text to use when evaluating the model."""
             )
-        if "output_text" not in columns:
+        if "output_text" not in y_columns:
             raise ValueError(
                 """Must contain a column named output_text that contains the generated
             text that you would expect to be returned by the model."""
             )
-        (X,) = utils.convert_to_dataframe(X)
-        refined_X = X[["input_text", "output_text"]].copy()
+
+        input_data = (
+            X.join(y, how="outer") if (X is not None) and (y is not None) else None
+        )
+        refined_data = input_data[["input_text", "output_text"]].copy()
 
-        return self._bqml_model.llm_evaluate(refined_X, task_type)
+        return self._bqml_model.llm_evaluate(refined_data, task_type)
 
     def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator:
         """Save the model to BigQuery.
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index 23b3e791d4..6d548a2e78 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -76,7 +76,9 @@ def test_llm_palm_score(llm_fine_tune_df_default_index):
         columns={"prompt": "input_text", "label": "output_text"}
     )
     # Check score to ensure the model was fitted
-    score_result = model.score(eval_df).to_pandas()
+    score_result = model.score(
+        X=eval_df[["input_text"]], y=eval_df[["output_text"]]
+    ).to_pandas()
     score_result_col = score_result.columns.to_list()
     expected_col = [
         "bleu4_score",
@@ -86,3 +88,27 @@ def test_llm_palm_score(llm_fine_tune_df_default_index):
         "evaluation_status",
     ]
     assert all(col in score_result_col for col in expected_col)
+
+
+def test_llm_palm_score_params(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.PaLM2TextGenerator(
+        model_name="text-bison", max_iterations=1
+    )
+    eval_df = llm_fine_tune_df_default_index.rename(
+        columns={"prompt": "input_text", "label": "output_text"}
+    )
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=eval_df["input_text"], y=eval_df["output_text"], task_type="classification"
+    ).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "trial_id",
+        "precision",
+        "recall",
+        "accuracy",
+        "f1_score",
+        "log_loss",
+        "roc_auc",
+    ]
+    assert all(col in score_result_col for col in expected_col)

From 562cf8dec3e6151cfb801e3c3283deb0b9bc46b6 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Wed, 24 Apr 2024 22:23:47 +0000
Subject: [PATCH 3/4] address additional comments

---
 bigframes/ml/core.py          |  2 +-
 bigframes/ml/llm.py           | 35 +++++++++++++++--------------------
 bigframes/ml/sql.py           | 10 +++-------
 tests/system/load/test_llm.py | 15 +++++++--------
 4 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index dde6ea42f3..12c881c19a 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -189,7 +189,7 @@ def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
 
     def llm_evaluate(
         self,
-        input_data: Optional[bpd.DataFrame] = None,
+        input_data: bpd.DataFrame,
         task_type: Optional[str] = None,
     ):
         sql = self._model_manipulation_sql_generator.ml_llm_evaluate(
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index f5729805ed..d14445866d 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -220,7 +220,7 @@ def predict(
 
         Args:
             X (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                Input DataFrame or Series, which needs to contain a column with name "prompt". Only the column will be used as input.
+                Input DataFrame or Series, which contains only one column of prompts.
                 Prompts can include preamble, questions, suggestions, instructions, or examples.
 
             temperature (float, default 0.0):
@@ -335,11 +335,11 @@ def score(
 
         Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                A BigQuery DataFrame as evaluation data. X must have a column named
-                ``input_text`` that contains the prompt text to use when evaluating the model.
+                A BigQuery DataFrame as evaluation data, which contains only one column of input_text
+                that contains the prompt text to use when evaluating the model.
             y (bigframes.dataframe.DataFrame or bigframes.series.Series):
-                A BigQuery DataFrame as evaluation labels. y must also have a column named ``output_text`` that contains the generated
-                text that you would expect to be returned by the model.
+                A BigQuery DataFrame as evaluation labels, which contains only one column of output_text
+                that you would expect to be returned by the model.
             task_type (Optional[str]):
                 The type of the task for LLM model. Default to "text_generation".
                 Possible values: "text_generation", "classification", "summarization", and "question_answering".
@@ -352,25 +352,20 @@ def score(
 
         X, y = utils.convert_to_dataframe(X, y)
 
-        X_columns = X.columns.to_list()
-        y_columns = y.columns.to_list()
-        if "input_text" not in X_columns:
-            raise ValueError(
-                """Must contain a column named input_text that contains the prompt
-            text to use when evaluating the model."""
-            )
-        if "output_text" not in y_columns:
+        if len(X.columns) != 1 or len(y.columns) != 1:
             raise ValueError(
-                """Must contain a column named output_text that contains the generated
-            text that you would expect to be returned by the model."""
+                f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}"
             )
 
-        input_data = (
-            X.join(y, how="outer") if (X is not None) and (y is not None) else None
-        )
-        refined_data = input_data[["input_text", "output_text"]].copy()
+        # BQML identifies the column by name
+        X_col_label = cast(blocks.Label, X.columns[0])
+        y_col_label = cast(blocks.Label, y.columns[0])
+        X = X.rename(columns={X_col_label: "input_text"})
+        y = y.rename(columns={y_col_label: "output_text"})
+
+        input_data = X.join(y, how="outer")
 
-        return self._bqml_model.llm_evaluate(refined_data, task_type)
+        return self._bqml_model.llm_evaluate(input_data, task_type)
 
     def to_gbq(self, model_name: str, replace: bool = False) -> PaLM2TextGenerator:
         """Save the model to BigQuery.
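
[Illustrative usage sketch, not part of the patch: after this commit, score()
accepts any single-column X and y and renames them internally to the
input_text/output_text columns that BQML's ML.EVALUATE expects. The table and
column names below are made up.]

    import bigframes.pandas as bpd
    from bigframes.ml import llm

    # Hypothetical evaluation table with one prompt column and one label column.
    df = bpd.read_gbq("my_project.my_dataset.eval_examples")  # placeholder table

    model = llm.PaLM2TextGenerator(model_name="text-bison")
    metrics = model.score(
        X=df[["question"]],     # prompts; any single column label works
        y=df[["gold_answer"]],  # expected outputs; any single column label works
        task_type="classification",
    ).to_pandas()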
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 205139a89e..3679be16c6 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -320,15 +320,11 @@ def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str: # ML evaluation TVFs def ml_llm_evaluate( - self, source_df: Optional[bpd.DataFrame] = None, task_type: Optional[str] = None + self, source_df: bpd.DataFrame, task_type: Optional[str] = None ) -> str: """Encode ML.EVALUATE for BQML""" - if source_df is None: - source_sql = None - else: - # Note: don't need index as evaluate returns a new table - source_sql, _, _ = source_df._to_sql_query(include_index=False) - + # Note: don't need index as evaluate returns a new table + source_sql, _, _ = source_df._to_sql_query(include_index=False) return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`, ({source_sql}), STRUCT("{task_type}" AS task_type))""" diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py index 6d548a2e78..835b31955e 100644 --- a/tests/system/load/test_llm.py +++ b/tests/system/load/test_llm.py @@ -72,12 +72,11 @@ def test_llm_palm_configure_fit(llm_fine_tune_df_default_index, llm_remote_text_ def test_llm_palm_score(llm_fine_tune_df_default_index): model = bigframes.ml.llm.PaLM2TextGenerator(model_name="text-bison") - eval_df = llm_fine_tune_df_default_index.rename( - columns={"prompt": "input_text", "label": "output_text"} - ) + # Check score to ensure the model was fitted score_result = model.score( - X=eval_df[["input_text"]], y=eval_df[["output_text"]] + X=llm_fine_tune_df_default_index[["prompt"]], + y=llm_fine_tune_df_default_index[["label"]], ).to_pandas() score_result_col = score_result.columns.to_list() expected_col = [ @@ -94,12 +93,12 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index): model = bigframes.ml.llm.PaLM2TextGenerator( model_name="text-bison", max_iterations=1 ) - eval_df = llm_fine_tune_df_default_index.rename( - columns={"prompt": "input_text", "label": "output_text"} - ) + # Check score to ensure the model was fitted score_result = model.score( - X=eval_df["input_text"], y=eval_df["output_text"], task_type="classification" + X=llm_fine_tune_df_default_index["prompt"], + y=llm_fine_tune_df_default_index["label"], + task_type="classification", ).to_pandas() score_result_col = score_result.columns.to_list() expected_col = [ From 3e65eab677a0f165649675cb3e7474ada7911f60 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Wed, 24 Apr 2024 23:23:03 +0000 Subject: [PATCH 4/4] address minor comments --- bigframes/ml/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index d14445866d..4a58152d14 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -340,7 +340,7 @@ def score( y (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation labels, which contains only one column of output_text that you would expect to be returned by the model. - task_type (Optional[str]): + task_type (str): The type of the task for LLM model. Default to "text_generation". Possible values: "text_generation", "classification", "summarization", and "question_answering".
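
[Illustrative note, not part of the patch: a sketch of the query shape that the
final ml_llm_evaluate generator emits, using placeholder names that mirror the
unit test expectation above. In the generator itself the subquery comes from
source_df._to_sql_query(include_index=False).]

    # Placeholder values, for illustration only.
    model_name = "my_project_id.my_dataset_id.my_model_id"
    source_sql = "SELECT input_text, output_text FROM eval_data"
    task_type = "classification"

    sql = f"""SELECT * FROM ML.EVALUATE(MODEL `{model_name}`,
      ({source_sql}), STRUCT("{task_type}" AS task_type))"""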