From ca61da582ad61685c832a0feaa9a4aa90c877a8f Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 7 Feb 2024 22:13:35 +0000 Subject: [PATCH 1/4] feat: add ml.metrics.pairwise.cosine_similarity function --- bigframes/ml/core.py | 85 +++++++++++++------ bigframes/ml/metrics/__init__.py | 37 ++++++++ .../ml/{metrics.py => metrics/_metrics.py} | 0 bigframes/ml/metrics/pairwise.py | 34 ++++++++ bigframes/ml/sql.py | 15 +++- docs/templates/toc.yml | 4 + .../system/small/ml/test_metrics_pairwise.py | 35 ++++++++ tests/unit/ml/test_sql.py | 55 +++++++----- .../sklearn/metrics/pairwise.py | 27 ++++++ 9 files changed, 242 insertions(+), 50 deletions(-) create mode 100644 bigframes/ml/metrics/__init__.py rename bigframes/ml/{metrics.py => metrics/_metrics.py} (100%) create mode 100644 bigframes/ml/metrics/pairwise.py create mode 100644 tests/system/small/ml/test_metrics_pairwise.py create mode 100644 third_party/bigframes_vendored/sklearn/metrics/pairwise.py diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 266ab1b058..c6621f9f1e 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -17,7 +17,7 @@ from __future__ import annotations import datetime -from typing import Callable, cast, Iterable, Mapping, Optional, Union +from typing import Callable, cast, Iterable, Literal, Mapping, Optional, Union import uuid from google.cloud import bigquery @@ -28,34 +28,12 @@ import bigframes.pandas as bpd -class BqmlModel: - """Represents an existing BQML model in BigQuery. - - Wraps the BQML API and SQL interface to expose the functionality needed for - BigQuery DataFrames ML. 
- """ +class BaseBqml: + """Base class for BQML functionalities.""" - def __init__(self, session: bigframes.Session, model: bigquery.Model): + def __init__(self, session: bigframes.Session): self._session = session - self._model = model - self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator( - self.model_name - ) - - @property - def session(self) -> bigframes.Session: - """Get the BigQuery DataFrames session that this BQML model wrapper is tied to""" - return self._session - - @property - def model_name(self) -> str: - """Get the fully qualified name of the model, i.e. project_id.dataset_id.model_id""" - return f"{self._model.project}.{self._model.dataset_id}.{self._model.model_id}" - - @property - def model(self) -> bigquery.Model: - """Get the BQML model associated with this wrapper""" - return self._model + self._base_sql_generator = ml_sql.BaseSqlGenerator() def _apply_sql( self, @@ -84,6 +62,59 @@ def _apply_sql( return df + def distance( + self, + x: bpd.DataFrame, + y: bpd.DataFrame, + type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"], + name: str, + ) -> bpd.DataFrame: + assert len(x.columns) == 1 and len(y.columns) == 1 + + input_data = x._cached().join(y._cached(), how="outer") + x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0] + + return self._apply_sql( + input_data, + lambda source_df: self._base_sql_generator.ml_distance( + x_column_id, + y_column_id, + type=type, + source_df=source_df, + name=name, + ), + ) + + +class BqmlModel(BaseBqml): + """Represents an existing BQML model in BigQuery. + + Wraps the BQML API and SQL interface to expose the functionality needed for + BigQuery DataFrames ML. 
+ """ + + def __init__(self, session: bigframes.Session, model: bigquery.Model): + self._session = session + self._model = model + self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator( + self.model_name + ) + + @property + def session(self) -> bigframes.Session: + """Get the BigQuery DataFrames session that this BQML model wrapper is tied to""" + return self._session + + @property + def model_name(self) -> str: + """Get the fully qualified name of the model, i.e. project_id.dataset_id.model_id""" + return f"{self._model.project}.{self._model.dataset_id}.{self._model.model_id}" + + @property + def model(self) -> bigquery.Model: + """Get the BQML model associated with this wrapper""" + return self._model + def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( diff --git a/bigframes/ml/metrics/__init__.py b/bigframes/ml/metrics/__init__.py new file mode 100644 index 0000000000..7629c1e948 --- /dev/null +++ b/bigframes/ml/metrics/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from bigframes.ml.metrics import pairwise +from bigframes.ml.metrics._metrics import ( + accuracy_score, + auc, + confusion_matrix, + f1_score, + precision_score, + r2_score, + roc_auc_score, + roc_curve, +) + +__all__ = [ + "r2_score", + "accuracy_score", + "roc_curve", + "roc_auc_score", + "auc", + "confusion_matrix", + "precision_score", + "f1_score", + "pairwise", +] diff --git a/bigframes/ml/metrics.py b/bigframes/ml/metrics/_metrics.py similarity index 100% rename from bigframes/ml/metrics.py rename to bigframes/ml/metrics/_metrics.py diff --git a/bigframes/ml/metrics/pairwise.py b/bigframes/ml/metrics/pairwise.py new file mode 100644 index 0000000000..04577c89d3 --- /dev/null +++ b/bigframes/ml/metrics/pairwise.py @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Union + +from bigframes.ml import core, utils +import bigframes.pandas as bpd +import third_party.bigframes_vendored.sklearn.metrics.pairwise as vendored_metrics_pairwise + + +def cosine_similarity( + X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series] +) -> bpd.DataFrame: + X, Y = utils.convert_to_dataframe(X, Y) + if len(X.columns) != 1 or len(Y.columns) != 1: + raise ValueError("Inputs X and Y can only contain 1 column.") + + base_bqml = core.BaseBqml(session=X._session) + return base_bqml.distance(X, Y, type="COSINE", name="cosine_similarity") + + +cosine_similarity.__doc__ = inspect.getdoc(vendored_metrics_pairwise.cosine_similarity) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 152f881ec0..f45a5fec91 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -16,7 +16,7 @@ Generates SQL queries needed for BigQuery DataFrames ML """ -from typing import Iterable, Mapping, Optional, Union +from typing import Iterable, Literal, Mapping, Optional, Union import google.cloud.bigquery @@ -133,6 +133,19 @@ def ml_label_encoder( https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params.""" return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}""" + def ml_distance( + self, + col_x: str, + col_y: str, + type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"], + source_df: bpd.DataFrame, + name: str, + ) -> str: + """Encode ML.DISTANCE for BQML. + https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance""" + source_sql, _, _ = source_df._to_sql_query(include_index=True) + return f"""SELECT *, ML.DISTANCE({col_x}, {col_y}, '{type}') AS {name} FROM ({source_sql})""" + class ModelCreationSqlGenerator(BaseSqlGenerator): """Sql generator for creating a model entity. 
Model id is the standalone id without project id and dataset id.""" diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index b680a5fc1a..a76994686c 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -112,6 +112,10 @@ - name: metrics uid: bigframes.ml.metrics name: metrics + - items: + - name: metrics.pairwise + uid: bigframes.ml.metrics.pairwise + name: metrics.pairwise - items: - name: model_selection uid: bigframes.ml.model_selection diff --git a/tests/system/small/ml/test_metrics_pairwise.py b/tests/system/small/ml/test_metrics_pairwise.py new file mode 100644 index 0000000000..c02a36abbc --- /dev/null +++ b/tests/system/small/ml/test_metrics_pairwise.py @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +from bigframes.ml import metrics +import bigframes.pandas as bpd + + +def test_cosine_similarity(): + x_col = [np.array([4.1, 0.5, 1.0])] + y_col = [np.array([3.0, 0.0, 2.5])] + X = bpd.read_pandas(pd.DataFrame({"X": x_col})) + Y = bpd.read_pandas(pd.DataFrame({"Y": y_col})) + + result = metrics.pairwise.cosine_similarity(X, Y) + expected_pd_df = pd.DataFrame( + {"X": x_col, "Y": y_col, "cosine_similarity": [0.108199]} + ) + + pd.testing.assert_frame_equal( + result.to_pandas(), expected_pd_df, check_dtype=False, check_index_type=False + ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 37cc33d33e..34cf7f0b96 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -47,7 +47,7 @@ def mock_df(): return mock_df -def test_options_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator): +def test_options_correct(base_sql_generator: ml_sql.BaseSqlGenerator): sql = base_sql_generator.options( model_type="lin_reg", input_label_cols=["col_a"], l1_reg=0.6 ) @@ -60,7 +60,7 @@ def test_options_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerato ) -def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator): +def test_transform_correct(base_sql_generator: ml_sql.BaseSqlGenerator): sql = base_sql_generator.transform( "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a", "ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b", @@ -75,35 +75,35 @@ def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenera ) -def test_standard_scaler_produces_correct_sql( +def test_standard_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_standard_scaler("col_a", "scaled_col_a") assert sql == "ML.STANDARD_SCALER(col_a) OVER() AS scaled_col_a" -def test_max_abs_scaler_produces_correct_sql( +def test_max_abs_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = 
base_sql_generator.ml_max_abs_scaler("col_a", "scaled_col_a") assert sql == "ML.MAX_ABS_SCALER(col_a) OVER() AS scaled_col_a" -def test_min_max_scaler_produces_correct_sql( +def test_min_max_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_min_max_scaler("col_a", "scaled_col_a") assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a" -def test_k_bins_discretizer_produces_correct_sql( +def test_k_bins_discretizer_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_bucketize("col_a", [1, 2, 3, 4], "scaled_col_a") assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a" -def test_one_hot_encoder_produces_correct_sql( +def test_one_hot_encoder_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_one_hot_encoder( @@ -114,14 +114,25 @@ def test_one_hot_encoder_produces_correct_sql( ) -def test_label_encoder_produces_correct_sql( +def test_label_encoder_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_label_encoder("col_a", 1000000, 0, "encoded_col_a") assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a" -def test_create_model_produces_correct_sql( +def test_distance_correct( + base_sql_generator: ml_sql.BaseSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = base_sql_generator.ml_distance("col_a", "col_b", "COSINE", mock_df, "cosine") + assert ( + sql + == "SELECT col_a, col_b, ML.DISTANCE(col_a, col_b, 'COSINE') AS cosine FROM input_X_y_sql" + ) + + +def test_create_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -142,7 +153,7 @@ def test_create_model_produces_correct_sql( ) -def test_create_model_transform_produces_correct_sql( +def test_create_model_transform_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -170,7 +181,7 @@ def 
test_create_model_transform_produces_correct_sql( ) -def test_create_remote_model_produces_correct_sql( +def test_create_remote_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): sql = model_creation_sql_generator.create_remote_model( @@ -190,7 +201,7 @@ def test_create_remote_model_produces_correct_sql( ) -def test_create_remote_model_with_params_produces_correct_sql( +def test_create_remote_model_with_params_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): sql = model_creation_sql_generator.create_remote_model( @@ -216,7 +227,7 @@ def test_create_remote_model_with_params_produces_correct_sql( ) -def test_create_imported_model_produces_correct_sql( +def test_create_imported_model_correct( model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator, ): sql = model_creation_sql_generator.create_imported_model( @@ -249,7 +260,7 @@ def test_alter_model_correct_sql( ) -def test_ml_predict_produces_correct_sql( +def test_ml_predict_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -261,7 +272,7 @@ def test_ml_predict_produces_correct_sql( ) -def test_ml_evaluate_produces_correct_sql( +def test_ml_evaluate_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -273,7 +284,7 @@ def test_ml_evaluate_produces_correct_sql( ) -def test_ml_arima_evaluate_produces_correct_sql( +def test_ml_arima_evaluate_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_arima_evaluate( @@ -286,7 +297,7 @@ def test_ml_arima_evaluate_produces_correct_sql( ) -def test_ml_evaluate_no_source_produces_correct_sql( +def test_ml_evaluate_no_source_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_evaluate() @@ -296,7 +307,7 @@ def test_ml_evaluate_no_source_produces_correct_sql( 
) -def test_ml_centroids_produces_correct_sql( +def test_ml_centroids_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_centroids() @@ -322,7 +333,7 @@ def test_forecast_correct_sql( ) -def test_ml_generate_text_produces_correct_sql( +def test_ml_generate_text_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -339,7 +350,7 @@ def test_ml_generate_text_produces_correct_sql( ) -def test_ml_generate_text_embedding_produces_correct_sql( +def test_ml_generate_text_embedding_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, mock_df: bpd.DataFrame, ): @@ -356,7 +367,7 @@ def test_ml_generate_text_embedding_produces_correct_sql( ) -def test_ml_principal_components_produces_correct_sql( +def test_ml_principal_components_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_principal_components() @@ -366,7 +377,7 @@ def test_ml_principal_components_produces_correct_sql( ) -def test_ml_principal_component_info_produces_correct_sql( +def test_ml_principal_component_info_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): sql = model_manipulation_sql_generator.ml_principal_component_info() diff --git a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py new file mode 100644 index 0000000000..d320dce863 --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py @@ -0,0 +1,27 @@ +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Robert Layton +# Andreas Mueller +# Philippe Gervais +# Lars Buitinck +# Joel Nothman +# License: BSD 3 clause + + +def cosine_similarity(X, Y): + """Compute cosine similarity between samples in X and Y. 
+ + Cosine similarity, or the cosine kernel, computes similarity as the + normalized dot product of X and Y: + + K(X, Y) = <X, Y> / (||X||*||Y||) + + Args: + X (Series or single column DataFrame of array of numeric type): + Input data. + Y (Series or single column DataFrame of array of numeric type): + Input data. X and Y are mapped by indexes, must have the same index. + + Returns: + DataFrame with columns of X, Y and cosine_similarity + """ From 8984158d526b6c75c613e348fe7f9811a5e135f3 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Wed, 7 Feb 2024 22:21:53 +0000 Subject: [PATCH 2/4] fix tests --- tests/unit/ml/test_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 34cf7f0b96..d68c248738 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -128,7 +128,7 @@ def test_distance_correct( sql = base_sql_generator.ml_distance("col_a", "col_b", "COSINE", mock_df, "cosine") assert ( sql - == "SELECT col_a, col_b, ML.DISTANCE(col_a, col_b, 'COSINE') AS cosine FROM input_X_y_sql" + == "SELECT *, ML.DISTANCE(col_a, col_b, 'COSINE') AS cosine FROM (input_X_sql)" ) From 17883a9648aa946aa46fb187d67b6039d5237fec Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Thu, 8 Feb 2024 01:37:35 +0000 Subject: [PATCH 3/4] fix tests --- bigframes/ml/metrics/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigframes/ml/metrics/__init__.py b/bigframes/ml/metrics/__init__.py index 7629c1e948..6b0a243426 100644 --- a/bigframes/ml/metrics/__init__.py +++ b/bigframes/ml/metrics/__init__.py @@ -20,12 +20,14 @@ f1_score, precision_score, r2_score, + recall_score, roc_auc_score, roc_curve, ) __all__ = [ "r2_score", + "recall_score", "accuracy_score", "roc_curve", "roc_auc_score", From 9a1b5e6b7ded767d4874b0dd7ae10a411ec028c7 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 13 Feb 2024 00:32:31 +0000 Subject: [PATCH 4/4] add docs --- bigframes/ml/core.py | 12 ++++++++++++ 
.../bigframes_vendored/sklearn/metrics/pairwise.py | 8 ++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index c6621f9f1e..2df96383ef 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -69,6 +69,18 @@ def distance( type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"], name: str, ) -> bpd.DataFrame: + """Calculate ML.DISTANCE from DataFrame inputs. + + Args: + x: + input DataFrame + y: + input DataFrame + type: + Distance types, accept values are "EUCLIDEAN", "MANHATTAN", "COSINE". + name: + name of the output result column + """ assert len(x.columns) == 1 and len(y.columns) == 1 input_data = x._cached().join(y._cached(), how="outer") diff --git a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py index d320dce863..3ef5431178 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py +++ b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py @@ -7,8 +7,11 @@ # Joel Nothman # License: BSD 3 clause +from bigframes import constants +import bigframes.pandas as bpd -def cosine_similarity(X, Y): + +def cosine_similarity(X, Y) -> bpd.DataFrame: """Compute cosine similarity between samples in X and Y. Cosine similarity, or the cosine kernel, computes similarity as the @@ -23,5 +26,6 @@ def cosine_similarity(X, Y): Input data. X and Y are mapped by indexes, must have the same index. Returns: - DataFrame with columns of X, Y and cosine_similarity + bigframes.dataframe.DataFrame: DataFrame with columns of X, Y and cosine_similarity """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)