From ca61da582ad61685c832a0feaa9a4aa90c877a8f Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Wed, 7 Feb 2024 22:13:35 +0000
Subject: [PATCH 1/4] feat: add ml.metrics.pairwise.cosine_similarity function

---
 bigframes/ml/core.py                          | 85 +++++++++++++------
 bigframes/ml/metrics/__init__.py              | 37 ++++++++
 .../ml/{metrics.py => metrics/_metrics.py}    |  0
 bigframes/ml/metrics/pairwise.py              | 34 ++++++++
 bigframes/ml/sql.py                           | 15 +++-
 docs/templates/toc.yml                        |  4 +
 .../system/small/ml/test_metrics_pairwise.py  | 35 ++++++++
 tests/unit/ml/test_sql.py                     | 55 +++++++-----
 .../sklearn/metrics/pairwise.py               | 27 ++++++
 9 files changed, 242 insertions(+), 50 deletions(-)
 create mode 100644 bigframes/ml/metrics/__init__.py
 rename bigframes/ml/{metrics.py => metrics/_metrics.py} (100%)
 create mode 100644 bigframes/ml/metrics/pairwise.py
 create mode 100644 tests/system/small/ml/test_metrics_pairwise.py
 create mode 100644 third_party/bigframes_vendored/sklearn/metrics/pairwise.py

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index 266ab1b058..c6621f9f1e 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -17,7 +17,7 @@
 from __future__ import annotations
 
 import datetime
-from typing import Callable, cast, Iterable, Mapping, Optional, Union
+from typing import Callable, cast, Iterable, Literal, Mapping, Optional, Union
 import uuid
 
 from google.cloud import bigquery
@@ -28,34 +28,12 @@
 import bigframes.pandas as bpd
 
 
-class BqmlModel:
-    """Represents an existing BQML model in BigQuery.
-
-    Wraps the BQML API and SQL interface to expose the functionality needed for
-    BigQuery DataFrames ML.
-    """
+class BaseBqml:
+    """Base class for BQML functionalities."""
 
-    def __init__(self, session: bigframes.Session, model: bigquery.Model):
+    def __init__(self, session: bigframes.Session):
         self._session = session
-        self._model = model
-        self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator(
-            self.model_name
-        )
-
-    @property
-    def session(self) -> bigframes.Session:
-        """Get the BigQuery DataFrames session that this BQML model wrapper is tied to"""
-        return self._session
-
-    @property
-    def model_name(self) -> str:
-        """Get the fully qualified name of the model, i.e. project_id.dataset_id.model_id"""
-        return f"{self._model.project}.{self._model.dataset_id}.{self._model.model_id}"
-
-    @property
-    def model(self) -> bigquery.Model:
-        """Get the BQML model associated with this wrapper"""
-        return self._model
+        self._base_sql_generator = ml_sql.BaseSqlGenerator()
 
     def _apply_sql(
         self,
@@ -84,6 +62,59 @@ def _apply_sql(
 
         return df
 
+    def distance(
+        self,
+        x: bpd.DataFrame,
+        y: bpd.DataFrame,
+        type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"],
+        name: str,
+    ) -> bpd.DataFrame:
+        assert len(x.columns) == 1 and len(y.columns) == 1
+
+        input_data = x._cached().join(y._cached(), how="outer")
+        x_column_id, y_column_id = x._block.value_columns[0], y._block.value_columns[0]
+
+        return self._apply_sql(
+            input_data,
+            lambda source_df: self._base_sql_generator.ml_distance(
+                x_column_id,
+                y_column_id,
+                type=type,
+                source_df=source_df,
+                name=name,
+            ),
+        )
+
+
+class BqmlModel(BaseBqml):
+    """Represents an existing BQML model in BigQuery.
+
+    Wraps the BQML API and SQL interface to expose the functionality needed for
+    BigQuery DataFrames ML.
+    """
+
+    def __init__(self, session: bigframes.Session, model: bigquery.Model):
+        super().__init__(session)  # sets _session and _base_sql_generator
+        self._model = model
+        self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator(
+            self.model_name
+        )
+
+    @property
+    def session(self) -> bigframes.Session:
+        """Get the BigQuery DataFrames session that this BQML model wrapper is tied to"""
+        return self._session
+
+    @property
+    def model_name(self) -> str:
+        """Get the fully qualified name of the model, i.e. project_id.dataset_id.model_id"""
+        return f"{self._model.project}.{self._model.dataset_id}.{self._model.model_id}"
+
+    @property
+    def model(self) -> bigquery.Model:
+        """Get the BQML model associated with this wrapper"""
+        return self._model
+
     def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame:
         # TODO: validate input data schema
         return self._apply_sql(
diff --git a/bigframes/ml/metrics/__init__.py b/bigframes/ml/metrics/__init__.py
new file mode 100644
index 0000000000..7629c1e948
--- /dev/null
+++ b/bigframes/ml/metrics/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from bigframes.ml.metrics import pairwise
+from bigframes.ml.metrics._metrics import (
+    accuracy_score,
+    auc,
+    confusion_matrix,
+    f1_score,
+    precision_score,
+    r2_score,
+    roc_auc_score,
+    roc_curve,
+)
+
+__all__ = [
+    "r2_score",
+    "accuracy_score",
+    "roc_curve",
+    "roc_auc_score",
+    "auc",
+    "confusion_matrix",
+    "precision_score",
+    "f1_score",
+    "pairwise",
+]
diff --git a/bigframes/ml/metrics.py b/bigframes/ml/metrics/_metrics.py
similarity index 100%
rename from bigframes/ml/metrics.py
rename to bigframes/ml/metrics/_metrics.py
diff --git a/bigframes/ml/metrics/pairwise.py b/bigframes/ml/metrics/pairwise.py
new file mode 100644
index 0000000000..04577c89d3
--- /dev/null
+++ b/bigframes/ml/metrics/pairwise.py
@@ -0,0 +1,34 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+from typing import Union
+
+from bigframes.ml import core, utils
+import bigframes.pandas as bpd
+import third_party.bigframes_vendored.sklearn.metrics.pairwise as vendored_metrics_pairwise
+
+
+def cosine_similarity(
+    X: Union[bpd.DataFrame, bpd.Series], Y: Union[bpd.DataFrame, bpd.Series]
+) -> bpd.DataFrame:
+    X, Y = utils.convert_to_dataframe(X, Y)
+    if len(X.columns) != 1 or len(Y.columns) != 1:
+        raise ValueError("Inputs X and Y can only contain 1 column.")
+
+    base_bqml = core.BaseBqml(session=X._session)
+    return base_bqml.distance(X, Y, type="COSINE", name="cosine_similarity")
+
+
+cosine_similarity.__doc__ = inspect.getdoc(vendored_metrics_pairwise.cosine_similarity)
diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py
index 152f881ec0..f45a5fec91 100644
--- a/bigframes/ml/sql.py
+++ b/bigframes/ml/sql.py
@@ -16,7 +16,7 @@
 Generates SQL queries needed for BigQuery DataFrames ML
 """
 
-from typing import Iterable, Mapping, Optional, Union
+from typing import Iterable, Literal, Mapping, Optional, Union
 
 import google.cloud.bigquery
 
@@ -133,6 +133,19 @@ def ml_label_encoder(
         https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params."""
         return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}"""
 
+    def ml_distance(
+        self,
+        col_x: str,
+        col_y: str,
+        type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"],
+        source_df: bpd.DataFrame,
+        name: str,
+    ) -> str:
+        """Encode ML.DISTANCE for BQML.
+        https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance"""
+        source_sql, _, _ = source_df._to_sql_query(include_index=True)
+        return f"""SELECT *, ML.DISTANCE({col_x}, {col_y}, '{type}') AS {name} FROM ({source_sql})"""
+
 
 class ModelCreationSqlGenerator(BaseSqlGenerator):
     """Sql generator for creating a model entity. Model id is the standalone id without project id and dataset id."""
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
index b680a5fc1a..a76994686c 100644
--- a/docs/templates/toc.yml
+++ b/docs/templates/toc.yml
@@ -112,6 +112,10 @@
       - name: metrics
         uid: bigframes.ml.metrics
       name: metrics
+    - items:
+      - name: metrics.pairwise
+        uid: bigframes.ml.metrics.pairwise
+      name: metrics.pairwise
     - items:
       - name: model_selection
         uid: bigframes.ml.model_selection
diff --git a/tests/system/small/ml/test_metrics_pairwise.py b/tests/system/small/ml/test_metrics_pairwise.py
new file mode 100644
index 0000000000..c02a36abbc
--- /dev/null
+++ b/tests/system/small/ml/test_metrics_pairwise.py
@@ -0,0 +1,35 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+
+from bigframes.ml import metrics
+import bigframes.pandas as bpd
+
+
+def test_cosine_similarity():
+    x_col = [np.array([4.1, 0.5, 1.0])]
+    y_col = [np.array([3.0, 0.0, 2.5])]
+    X = bpd.read_pandas(pd.DataFrame({"X": x_col}))
+    Y = bpd.read_pandas(pd.DataFrame({"Y": y_col}))
+
+    result = metrics.pairwise.cosine_similarity(X, Y)
+    expected_pd_df = pd.DataFrame(
+        {"X": x_col, "Y": y_col, "cosine_similarity": [0.108199]}
+    )
+
+    pd.testing.assert_frame_equal(
+        result.to_pandas(), expected_pd_df, check_dtype=False, check_index_type=False
+    )
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 37cc33d33e..34cf7f0b96 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -47,7 +47,7 @@ def mock_df():
     return mock_df
 
 
-def test_options_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator):
+def test_options_correct(base_sql_generator: ml_sql.BaseSqlGenerator):
     sql = base_sql_generator.options(
         model_type="lin_reg", input_label_cols=["col_a"], l1_reg=0.6
     )
@@ -60,7 +60,7 @@ def test_options_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerato
     )
 
 
-def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenerator):
+def test_transform_correct(base_sql_generator: ml_sql.BaseSqlGenerator):
     sql = base_sql_generator.transform(
         "ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a",
         "ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b",
@@ -75,35 +75,35 @@ def test_transform_produces_correct_sql(base_sql_generator: ml_sql.BaseSqlGenera
     )
 
 
-def test_standard_scaler_produces_correct_sql(
+def test_standard_scaler_correct(
     base_sql_generator: ml_sql.BaseSqlGenerator,
 ):
     sql = base_sql_generator.ml_standard_scaler("col_a", "scaled_col_a")
     assert sql == "ML.STANDARD_SCALER(col_a) OVER() AS scaled_col_a"
 
 
-def test_max_abs_scaler_produces_correct_sql(
+def test_max_abs_scaler_correct(
     base_sql_generator: ml_sql.BaseSqlGenerator,
 ):
     sql = base_sql_generator.ml_max_abs_scaler("col_a", "scaled_col_a")
     assert sql == "ML.MAX_ABS_SCALER(col_a) OVER() AS scaled_col_a"
 
 
-def test_min_max_scaler_produces_correct_sql(
+def test_min_max_scaler_correct(
     base_sql_generator: ml_sql.BaseSqlGenerator,
 ):
     sql = base_sql_generator.ml_min_max_scaler("col_a", "scaled_col_a")
     assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a"
 
 
-def test_k_bins_discretizer_produces_correct_sql(
+def test_k_bins_discretizer_correct(
     base_sql_generator: ml_sql.BaseSqlGenerator,
 ):
     sql = base_sql_generator.ml_bucketize("col_a", [1, 2, 3, 4], "scaled_col_a")
     assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a"
 
 
-def test_one_hot_encoder_produces_correct_sql(
+def test_one_hot_encoder_correct(
     base_sql_generator: ml_sql.BaseSqlGenerator,
 ):
     sql = base_sql_generator.ml_one_hot_encoder(
@@ -114,14 +114,25 @@ def test_one_hot_encoder_produces_correct_sql(
     )
 
 
-def test_label_encoder_produces_correct_sql(
+def test_label_encoder_correct(
     base_sql_generator: ml_sql.BaseSqlGenerator,
 ):
     sql = base_sql_generator.ml_label_encoder("col_a", 1000000, 0, "encoded_col_a")
     assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a"
 
 
-def test_create_model_produces_correct_sql(
+def test_distance_correct(
+    base_sql_generator: ml_sql.BaseSqlGenerator,
+    mock_df: bpd.DataFrame,
+):
+    sql = base_sql_generator.ml_distance("col_a", "col_b", "COSINE", mock_df, "cosine")
+    assert (
+        sql
+        == "SELECT col_a, col_b, ML.DISTANCE(col_a, col_b, 'COSINE') AS cosine FROM input_X_y_sql"
+    )
+
+
+def test_create_model_correct(
     model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
     mock_df: bpd.DataFrame,
 ):
@@ -142,7 +153,7 @@ def test_create_model_produces_correct_sql(
     )
 
 
-def test_create_model_transform_produces_correct_sql(
+def test_create_model_transform_correct(
     model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
     mock_df: bpd.DataFrame,
 ):
@@ -170,7 +181,7 @@ def test_create_model_transform_produces_correct_sql(
     )
 
 
-def test_create_remote_model_produces_correct_sql(
+def test_create_remote_model_correct(
     model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
 ):
     sql = model_creation_sql_generator.create_remote_model(
@@ -190,7 +201,7 @@ def test_create_remote_model_produces_correct_sql(
     )
 
 
-def test_create_remote_model_with_params_produces_correct_sql(
+def test_create_remote_model_with_params_correct(
     model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
 ):
     sql = model_creation_sql_generator.create_remote_model(
@@ -216,7 +227,7 @@ def test_create_remote_model_with_params_produces_correct_sql(
     )
 
 
-def test_create_imported_model_produces_correct_sql(
+def test_create_imported_model_correct(
     model_creation_sql_generator: ml_sql.ModelCreationSqlGenerator,
 ):
     sql = model_creation_sql_generator.create_imported_model(
@@ -249,7 +260,7 @@ def test_alter_model_correct_sql(
     )
 
 
-def test_ml_predict_produces_correct_sql(
+def test_ml_predict_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,
 ):
@@ -261,7 +272,7 @@ def test_ml_predict_produces_correct_sql(
     )
 
 
-def test_ml_evaluate_produces_correct_sql(
+def test_ml_evaluate_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,
 ):
@@ -273,7 +284,7 @@ def test_ml_evaluate_produces_correct_sql(
     )
 
 
-def test_ml_arima_evaluate_produces_correct_sql(
+def test_ml_arima_evaluate_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
 ):
     sql = model_manipulation_sql_generator.ml_arima_evaluate(
@@ -286,7 +297,7 @@ def test_ml_arima_evaluate_produces_correct_sql(
     )
 
 
-def test_ml_evaluate_no_source_produces_correct_sql(
+def test_ml_evaluate_no_source_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
 ):
     sql = model_manipulation_sql_generator.ml_evaluate()
@@ -296,7 +307,7 @@ def test_ml_evaluate_no_source_produces_correct_sql(
     )
 
 
-def test_ml_centroids_produces_correct_sql(
+def test_ml_centroids_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
 ):
     sql = model_manipulation_sql_generator.ml_centroids()
@@ -322,7 +333,7 @@ def test_forecast_correct_sql(
     )
 
 
-def test_ml_generate_text_produces_correct_sql(
+def test_ml_generate_text_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,
 ):
@@ -339,7 +350,7 @@ def test_ml_generate_text_produces_correct_sql(
     )
 
 
-def test_ml_generate_text_embedding_produces_correct_sql(
+def test_ml_generate_text_embedding_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
     mock_df: bpd.DataFrame,
 ):
@@ -356,7 +367,7 @@ def test_ml_generate_text_embedding_produces_correct_sql(
     )
 
 
-def test_ml_principal_components_produces_correct_sql(
+def test_ml_principal_components_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
 ):
     sql = model_manipulation_sql_generator.ml_principal_components()
@@ -366,7 +377,7 @@ def test_ml_principal_components_produces_correct_sql(
     )
 
 
-def test_ml_principal_component_info_produces_correct_sql(
+def test_ml_principal_component_info_correct(
     model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator,
 ):
     sql = model_manipulation_sql_generator.ml_principal_component_info()
diff --git a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py
new file mode 100644
index 0000000000..d320dce863
--- /dev/null
+++ b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py
@@ -0,0 +1,27 @@
+# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#          Mathieu Blondel <mathieu@mblondel.org>
+#          Robert Layton <robertlayton@gmail.com>
+#          Andreas Mueller <amueller@ais.uni-bonn.de>
+#          Philippe Gervais <philippe.gervais@inria.fr>
+#          Lars Buitinck
+#          Joel Nothman <joel.nothman@gmail.com>
+# License: BSD 3 clause
+
+
+def cosine_similarity(X, Y):
+    """Compute cosine similarity between samples in X and Y.
+
+    Cosine similarity, or the cosine kernel, computes similarity as the
+    normalized dot product of X and Y:
+
+        K(X, Y) = <X, Y> / (||X||*||Y||)
+
+    Args:
+        X (Series or single column DataFrame of array of numeric type):
+            Input data.
+        Y (Series or single column DataFrame of array of numeric type):
+            Input data. X and Y are mapped by indexes, must have the same index.
+
+    Returns:
+        DataFrame with columns of X, Y and cosine_similarity
+    """

From 8984158d526b6c75c613e348fe7f9811a5e135f3 Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Wed, 7 Feb 2024 22:21:53 +0000
Subject: [PATCH 2/4] fix tests

---
 tests/unit/ml/test_sql.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index 34cf7f0b96..d68c248738 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -128,7 +128,7 @@ def test_distance_correct(
     sql = base_sql_generator.ml_distance("col_a", "col_b", "COSINE", mock_df, "cosine")
     assert (
         sql
-        == "SELECT col_a, col_b, ML.DISTANCE(col_a, col_b, 'COSINE') AS cosine FROM input_X_y_sql"
+        == "SELECT *, ML.DISTANCE(col_a, col_b, 'COSINE') AS cosine FROM (input_X_sql)"
     )
 
 

From 17883a9648aa946aa46fb187d67b6039d5237fec Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Thu, 8 Feb 2024 01:37:35 +0000
Subject: [PATCH 3/4] fix tests

---
 bigframes/ml/metrics/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bigframes/ml/metrics/__init__.py b/bigframes/ml/metrics/__init__.py
index 7629c1e948..6b0a243426 100644
--- a/bigframes/ml/metrics/__init__.py
+++ b/bigframes/ml/metrics/__init__.py
@@ -20,12 +20,14 @@
     f1_score,
     precision_score,
     r2_score,
+    recall_score,
     roc_auc_score,
     roc_curve,
 )
 
 __all__ = [
     "r2_score",
+    "recall_score",
     "accuracy_score",
     "roc_curve",
     "roc_auc_score",

From 9a1b5e6b7ded767d4874b0dd7ae10a411ec028c7 Mon Sep 17 00:00:00 2001
From: Garrett Wu <garrettwu@google.com>
Date: Tue, 13 Feb 2024 00:32:31 +0000
Subject: [PATCH 4/4] add docs

---
 bigframes/ml/core.py                                 | 12 ++++++++++++
 .../bigframes_vendored/sklearn/metrics/pairwise.py   |  8 ++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index c6621f9f1e..2df96383ef 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -69,6 +69,18 @@ def distance(
         type: Literal["EUCLIDEAN", "MANHATTAN", "COSINE"],
         name: str,
     ) -> bpd.DataFrame:
+        """Calculate ML.DISTANCE from DataFrame inputs.
+
+        Args:
+            x:
+                input DataFrame
+            y:
+                input DataFrame
+            type:
+                Distance type. Accepted values are "EUCLIDEAN", "MANHATTAN", "COSINE".
+            name:
+                name of the output result column
+        """
         assert len(x.columns) == 1 and len(y.columns) == 1
 
         input_data = x._cached().join(y._cached(), how="outer")
diff --git a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py
index d320dce863..3ef5431178 100644
--- a/third_party/bigframes_vendored/sklearn/metrics/pairwise.py
+++ b/third_party/bigframes_vendored/sklearn/metrics/pairwise.py
@@ -7,8 +7,11 @@
 #          Joel Nothman <joel.nothman@gmail.com>
 # License: BSD 3 clause
 
+from bigframes import constants
+import bigframes.pandas as bpd
 
-def cosine_similarity(X, Y):
+
+def cosine_similarity(X, Y) -> bpd.DataFrame:
     """Compute cosine similarity between samples in X and Y.
 
     Cosine similarity, or the cosine kernel, computes similarity as the
@@ -23,5 +26,6 @@ def cosine_similarity(X, Y):
             Input data. X and Y are mapped by indexes, must have the same index.
 
     Returns:
-        DataFrame with columns of X, Y and cosine_similarity
+        bigframes.dataframe.DataFrame: DataFrame with columns of X, Y and cosine_similarity. NOTE: the value comes from ML.DISTANCE(..., 'COSINE'), i.e. the cosine distance 1 - K(X, Y).
     """
+    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)