From 126f1a77e164e2afbc398c39ec729ea16d0cf675 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 15 Oct 2024 00:46:08 +0000 Subject: [PATCH 1/7] fix: model.fit metric not collected issue. --- bigframes/ml/core.py | 1 + tests/system/large/ml/test_linear_model.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 4bc61c5015..8e789386f1 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -273,6 +273,7 @@ def _create_model_ref( def _create_model_with_sql(self, session: bigframes.Session, sql: str) -> BqmlModel: # fit the model, synchronously _, job = session._start_query_ml_ddl(sql) + session._metrics.count_job_stats(job) # real model path in the session specific hidden dataset and table prefix model_name_full = f"{job.destination.project}.{job.destination.dataset_id}.{job.destination.table_id}" diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index f593ac2983..145dbb462c 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -31,8 +31,14 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase ] ] y_train = df[["body_mass_g"]] + + start_execution_count = df._block._expr.session._metrics.execution_count + model.fit(X_train, y_train) + end_execution_count = df._block._expr.session._metrics.execution_count + assert end_execution_count - start_execution_count == 2 + # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() utils.check_pandas_df_schema_and_index( From f343e9e4c5d12a495233a05241a7f10b9c923073 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 15 Oct 2024 01:24:43 +0000 Subject: [PATCH 2/7] update --- bigframes/ml/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 8e789386f1..c93bde83d1 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -273,7 +273,8 @@ def _create_model_ref( def _create_model_with_sql(self, session: bigframes.Session, sql: str) -> BqmlModel: # fit the model, synchronously _, job = session._start_query_ml_ddl(sql) - session._metrics.count_job_stats(job) + if session._metrics is not None: + session._metrics.count_job_stats(job) # real model path in the session specific hidden dataset and table prefix model_name_full = f"{job.destination.project}.{job.destination.dataset_id}.{job.destination.table_id}" From f64eba2a8b283286589e54d733ef95e34271a9bc Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 15 Oct 2024 17:11:50 +0000 Subject: [PATCH 3/7] fix unit test --- tests/unit/ml/test_golden_sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 65f079852e..ce05011546 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -36,6 +36,7 @@ def mock_session(): TEMP_MODEL_ID.project, TEMP_MODEL_ID.dataset_id ) mock_session._bq_kms_key_name = None + mock_session._metrics = None query_job = mock.create_autospec(bigquery.QueryJob) type(query_job).destination = mock.PropertyMock( From a10a5475b596a3ddacfdeea5e4a9ed9ab4314f3f Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 15 Oct 2024 20:21:45 +0000 Subject: [PATCH 4/7] update code and test --- bigframes/ml/core.py | 2 -- bigframes/session/__init__.py | 9 ++++++++- tests/system/small/ml/test_core.py | 6 ++++++ 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index c93bde83d1..4bc61c5015 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -273,8 +273,6 @@ def _create_model_ref( def _create_model_with_sql(self, session: bigframes.Session, sql: str) -> BqmlModel: # fit the model, synchronously _, job = session._start_query_ml_ddl(sql) - if session._metrics is not None: - session._metrics.count_job_stats(job) # real model path in the session specific hidden dataset and table prefix model_name_full = f"{job.destination.project}.{job.destination.dataset_id}.{job.destination.table_id}" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 072bcc5781..9611d938b1 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1373,7 +1373,14 @@ def _start_query_ml_ddl( # https://cloud.google.com/bigquery/docs/customer-managed-encryption#encrypt-model job_config.destination_encryption_configuration = None - return bf_io_bigquery.start_query_with_client(self.bqclient, sql, job_config) + results_iterator, query_job = bf_io_bigquery.start_query_with_client( + self.bqclient, sql, job_config + ) + + if self._metrics is not None: + self._metrics.count_job_stats(query_job) + + return results_iterator, query_job def _export( self, diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 6b852e87af..65540e7e81 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -383,8 +383,14 @@ def test_model_forecast(time_series_bqml_arima_plus_model: core.BqmlModel): def test_model_register(ephemera_penguins_bqml_linear_model: core.BqmlModel): model = ephemera_penguins_bqml_linear_model + + start_execution_count = model.session._metrics.execution_count + model.register() + end_execution_count = model.session._metrics.execution_count + assert end_execution_count - start_execution_count == 1 + assert model.model.model_id is not None model_name = "bigframes_" + model.model.model_id # Only registered model contains the field, and the field includes project/dataset. Here only check model_id. From 8e7937b05c0c55381cded85d4e8b1f75ce55f243 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 15 Oct 2024 21:50:29 +0000 Subject: [PATCH 5/7] update code --- bigframes/session/__init__.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 9611d938b1..6cfaf48493 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1373,15 +1373,10 @@ def _start_query_ml_ddl( # https://cloud.google.com/bigquery/docs/customer-managed-encryption#encrypt-model job_config.destination_encryption_configuration = None - results_iterator, query_job = bf_io_bigquery.start_query_with_client( - self.bqclient, sql, job_config + return bf_io_bigquery.start_query_with_client( + self.bqclient, sql, job_config, metrics=self._metrics ) - if self._metrics is not None: - self._metrics.count_job_stats(query_job) - - return results_iterator, query_job - def _export( self, array_value: core.ArrayValue, From 70f96b0e11a440e8116dc86d17baf4f58a1567d0 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 15 Oct 2024 22:00:14 +0000 Subject: [PATCH 6/7] update comment --- tests/system/large/ml/test_linear_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 145dbb462c..6189a69857 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -37,6 +37,8 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase model.fit(X_train, y_train) end_execution_count = df._block._expr.session._metrics.execution_count + # The fit function initiates two queries: the first generates and caches + # the training data, while the second creates and fits the model. assert end_execution_count - start_execution_count == 2 # Check score to ensure the model was fitted From 0a40d195d2b20973ae022bf3eb9ad9f7a6629a3f Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Tue, 15 Oct 2024 22:11:26 +0000 Subject: [PATCH 7/7] update test --- tests/system/large/ml/test_linear_model.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 6189a69857..5c3f7631d9 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -31,16 +31,8 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase ] ] y_train = df[["body_mass_g"]] - - start_execution_count = df._block._expr.session._metrics.execution_count - model.fit(X_train, y_train) - end_execution_count = df._block._expr.session._metrics.execution_count - # The fit function initiates two queries: the first generates and caches - # the training data, while the second creates and fits the model. - assert end_execution_count - start_execution_count == 2 - # Check score to ensure the model was fitted result = model.score(X_train, y_train).to_pandas() utils.check_pandas_df_schema_and_index( @@ -136,10 +128,20 @@ def test_unordered_mode_linear_regression_configure_fit_score_predict( ] ] y_train = df[["body_mass_g"]] + + start_execution_count = df._block._expr.session._metrics.execution_count model.fit(X_train, y_train) + end_execution_count = df._block._expr.session._metrics.execution_count + # The fit function initiates two queries: the first generates and caches + # the training data, while the second creates and fits the model. + assert end_execution_count - start_execution_count == 2 # Check score to ensure the model was fitted + start_execution_count = end_execution_count result = model.score(X_train, y_train).to_pandas() + end_execution_count = df._block._expr.session._metrics.execution_count + assert end_execution_count - start_execution_count == 1 + utils.check_pandas_df_schema_and_index( result, columns=utils.ML_REGRESSION_METRICS, index=1 ) @@ -162,7 +164,10 @@ def test_unordered_mode_linear_regression_configure_fit_score_predict( assert reloaded_model.max_iterations == 20 assert reloaded_model.tol == 0.01 + start_execution_count = df._block._expr.session._metrics.execution_count pred = reloaded_model.predict(df) + end_execution_count = df._block._expr.session._metrics.execution_count + assert end_execution_count - start_execution_count == 1 utils.check_pandas_df_schema_and_index( pred, columns=("predicted_body_mass_g",),