From 7001f96c04104c1937e4d8cc380feb42c8d5db6c Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 16 Jan 2025 23:08:58 +0000 Subject: [PATCH 01/20] chore: remove unused dependencies. --- bigframes/ml/metrics/_metrics.py | 14 +++++++++++--- setup.py | 11 ++++++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 90df6f9539..575a5a58cc 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -25,7 +25,6 @@ import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd -import sklearn.metrics as sklearn_metrics # type: ignore from bigframes.ml import utils import bigframes.pandas as bpd @@ -177,8 +176,17 @@ def auc( x_series, y_series = utils.batch_convert_to_series(x, y) # TODO(b/286410053) Support ML exceptions and error handling. - auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas()) - return auc + x_pandas = x_series.to_pandas() + y_pandas = y_series.to_pandas() + + if x_pandas.is_monotonic_decreasing: + d = -1 + elif x_pandas.is_monotonic_increasing: + d = 1 + else: + raise ValueError(f"x is neither increasing nor decreasing : {x_pandas}.") + + return d * np.trapz(y_pandas, x_pandas) auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) diff --git a/setup.py b/setup.py index 74a0d5475c..6b99e96b9c 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ "pyarrow >=10.0.1", "pydata-google-auth >=1.8.2", "requests >=2.27.1", - "scikit-learn >=1.2.2", - "sqlalchemy >=1.4,<3.0dev", "sqlglot >=23.6.3", "tabulate >=0.9", "ipywidgets >=7.7.1", @@ -77,7 +75,14 @@ # used for local engine, which is only needed for unit tests at present. "polars": ["polars >= 1.7.0"], # Packages required for basic development flow. - "dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"], + "dev": [ + "pytest", + "pytest-mock", + "pre-commit", + "nox", + "google-cloud-testutils", + "scikit-learn", + ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) From fc52332ab12e24860fbb8a77e0ed7c1d9625f5d8 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 16 Jan 2025 23:10:34 +0000 Subject: [PATCH 02/20] remove dependency from test constrains. --- testing/constraints-3.9.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 015153cb01..ca9314dd88 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,7 +20,6 @@ pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 scikit-learn==1.2.2 -sqlalchemy==1.4 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 From 8d0a8f4dfc44003b530f620514190a5863cffb52 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 16 Jan 2025 23:24:21 +0000 Subject: [PATCH 03/20] update test dependencies. --- noxfile.py | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/noxfile.py b/noxfile.py index 863c7b26d3..940266f9b0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -78,6 +78,7 @@ "google-cloud-testutils", "tabulate", "xarray", + "scikit-learn >=1.2.2", ] SYSTEM_TEST_EXTERNAL_DEPENDENCIES = [ "google-cloud-bigquery", @@ -172,7 +173,7 @@ def install_unittest_dependencies(session, install_test_extra, *constraints): if UNIT_TEST_EXTERNAL_DEPENDENCIES: msg = ( "'unit_test_external_dependencies' is deprecated. Instead, please " - "use 'unit_test_dependencies' or 'unit_test_local_dependencies'.", + "use 'unit_test_dependencies' or 'unit_test_local_dependencies'." ) warnings.warn(msg, DeprecationWarning) session.install(*UNIT_TEST_EXTERNAL_DEPENDENCIES, *constraints) diff --git a/setup.py b/setup.py index 6b99e96b9c..987ea59ad7 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,7 @@ "pre-commit", "nox", "google-cloud-testutils", - "scikit-learn", + "scikit-learn >=1.2.2", ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) From a5bd03ddc955a18635938219c049fa54618c9c4c Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 16 Jan 2025 23:43:20 +0000 Subject: [PATCH 04/20] update nox dependencies --- noxfile.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 940266f9b0..7325f74fb4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -78,7 +78,6 @@ "google-cloud-testutils", "tabulate", "xarray", - "scikit-learn >=1.2.2", ] SYSTEM_TEST_EXTERNAL_DEPENDENCIES = [ "google-cloud-bigquery", @@ -224,6 +223,9 @@ def run_unit(session, install_test_extra): @nox.session(python=UNIT_TEST_PYTHON_VERSIONS) def unit(session): + session.install( + "scikit-learn >=1.2.2", + ) run_unit(session, install_test_extra=True) @@ -480,6 +482,7 @@ def docs(session): SPHINX_VERSION, "alabaster", "recommonmark", + "scikit-learn >=1.2.2", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) From 605aca0eaedc0d0c4d82b99959ade4304b42b8b1 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 17 Jan 2025 18:45:18 +0000 Subject: [PATCH 05/20] update --- noxfile.py | 5 +++-- setup.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index 7325f74fb4..9a81769067 100644 --- a/noxfile.py +++ b/noxfile.py @@ -224,7 +224,7 @@ def run_unit(session, install_test_extra): @nox.session(python=UNIT_TEST_PYTHON_VERSIONS) def unit(session): session.install( - "scikit-learn >=1.2.2", + "scikit-learn>=1.2.2", ) run_unit(session, install_test_extra=True) @@ -482,7 +482,7 @@ def docs(session): SPHINX_VERSION, "alabaster", "recommonmark", - "scikit-learn >=1.2.2", + "scikit-learn>=1.2.2", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) @@ -525,6 +525,7 @@ def docfx(session): "alabaster", "recommonmark", "gcp-sphinx-docfx-yaml==3.0.1", + "scikit-learn>=1.2.2", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) diff --git a/setup.py b/setup.py index 987ea59ad7..38037a36c7 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,7 @@ "pre-commit", "nox", "google-cloud-testutils", - "scikit-learn >=1.2.2", + "scikit-learn>=1.2.2", ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) From 7c487b4149fd6cd281e53c227a63fa2101bbba97 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 17 Jan 2025 19:07:43 +0000 Subject: [PATCH 06/20] update trapz --- bigframes/ml/metrics/_metrics.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 575a5a58cc..d4784a4b21 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -186,7 +186,11 @@ def auc( else: raise ValueError(f"x is neither increasing nor decreasing : {x_pandas}.") - return d * np.trapz(y_pandas, x_pandas) + if hasattr(np, "trapezoid"): + # new in numpy 2.0 + return d * np.trapezoid(y_pandas, x_pandas) + # np.trapz has been deprecated in 2.0 + return d * np.trapz(y_pandas, x_pandas) # type: ignore auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) From 3e816887b66aefeb7239017acecc5a3d22e6b20b Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 17 Jan 2025 20:53:22 +0000 Subject: [PATCH 07/20] update value error --- bigframes/ml/metrics/_metrics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index d4784a4b21..e684028047 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -179,6 +179,11 @@ def auc( x_pandas = x_series.to_pandas() y_pandas = y_series.to_pandas() + if len(x_pandas) < 2: + raise ValueError( + f"At least 2 points are needed to compute area under curve, but x.shape = {len(x_pandas)}" + ) + if x_pandas.is_monotonic_decreasing: d = -1 elif x_pandas.is_monotonic_increasing: From c36fb8ec975dceb13a255ae16fd6727aafd61ea2 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 17 Jan 2025 21:01:25 +0000 Subject: [PATCH 08/20] remove todo --- bigframes/ml/metrics/_metrics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index e684028047..7558529201 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -175,7 +175,6 @@ def auc( ) -> float: x_series, y_series = utils.batch_convert_to_series(x, y) - # TODO(b/286410053) Support ML exceptions and error handling. x_pandas = x_series.to_pandas() y_pandas = y_series.to_pandas() From ab18fe07b001f3b57bf7e29de7e787b735af59ba Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 17 Jan 2025 22:08:38 +0000 Subject: [PATCH 09/20] update dep --- noxfile.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/noxfile.py b/noxfile.py index 9a81769067..7a277fe933 100644 --- a/noxfile.py +++ b/noxfile.py @@ -231,6 +231,9 @@ def unit(session): @nox.session(python=UNIT_TEST_PYTHON_VERSIONS[-1]) def unit_noextras(session): + session.install( + "scikit-learn>=1.2.2", + ) run_unit(session, install_test_extra=False) From ae17f8b2e08cf8e0beeed4ed6f40f0de49b7e00a Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 17 Jan 2025 22:59:48 +0000 Subject: [PATCH 10/20] update test env --- testing/constraints-3.9.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index ca9314dd88..921f6c1a15 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -19,6 +19,7 @@ pandas-gbq==0.26.0 pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 +scipy==1.6.0 scikit-learn==1.2.2 sqlglot==23.6.3 tabulate==0.9 From 207a6e3d6074c705a3edb60516a990a012ec9fdc Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Wed, 19 Feb 2025 23:30:46 +0000 Subject: [PATCH 11/20] code update --- noxfile.py | 23 ++++++++++------------- scripts/test_publish_api_coverage.py | 2 ++ setup.py | 2 +- testing/constraints-3.9.txt | 2 +- tests/unit/ml/test_api_primitives.py | 5 ++--- tests/unit/ml/test_compose.py | 4 ++-- tests/unit/ml/test_pipeline.py | 9 ++++----- 7 files changed, 22 insertions(+), 25 deletions(-) diff --git a/noxfile.py b/noxfile.py index 34396d602c..fd772a073d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -72,7 +72,13 @@ UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] -UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]} +UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.9": ["scikit-learn"], + "3.10": ["scikit-learn"], + "3.11": ["scikit-learn"], + "3.12": ["polars", "scikit-learn"], + "3.13": ["scikit-learn"], +} # 3.10 is needed for Windows tests as it is the only version installed in the # bigframes-windows container image. For more information, search @@ -96,7 +102,7 @@ ] SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] SYSTEM_TEST_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_EXTRAS: List[str] = ["tests"] +SYSTEM_TEST_EXTRAS: List[str] = ["tests", "scikit-learn"] SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" @@ -227,17 +233,11 @@ def run_unit(session, install_test_extra): @nox.session(python=UNIT_TEST_PYTHON_VERSIONS) def unit(session): - session.install( - "scikit-learn>=1.2.2", - ) run_unit(session, install_test_extra=True) @nox.session(python=UNIT_TEST_PYTHON_VERSIONS[-1]) def unit_noextras(session): - session.install( - "scikit-learn>=1.2.2", - ) run_unit(session, install_test_extra=False) @@ -474,8 +474,7 @@ def cover(session): @nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs for this library.""" - - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -489,7 +488,6 @@ def docs(session): SPHINX_VERSION, "alabaster", "recommonmark", - "scikit-learn>=1.2.2", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) @@ -517,7 +515,7 @@ def docs(session): def docfx(session): """Build the docfx yaml files for this library.""" - session.install("-e", ".") + session.install("-e", ".[scikit-learn]") session.install( # We need to pin to specific versions of the `sphinxcontrib-*` packages # which still support sphinx 4.x. @@ -532,7 +530,6 @@ def docfx(session): "alabaster", "recommonmark", "gcp-sphinx-docfx-yaml==3.0.1", - "scikit-learn>=1.2.2", ) shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 034a266177..6dea10b608 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -19,6 +19,8 @@ from . import publish_api_coverage +pytest.importorskip("sklearn") + @pytest.fixture def api_coverage_df(): diff --git a/setup.py b/setup.py index ed4ec8bb30..1f6114b634 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ "tests": [], # used for local engine, which is only needed for unit tests at present. "polars": ["polars >= 1.7.0"], + "scikit-learn": ["scikit-learn>=1.2.2"], # Packages required for basic development flow. "dev": [ "pytest", @@ -82,7 +83,6 @@ "pre-commit", "nox", "google-cloud-testutils", - "scikit-learn>=1.2.2", ], } extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values())))) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index d70f7f41be..9545db8e5c 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,7 +20,7 @@ pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 scipy==1.6.0 -scikit-learn==1.2.2 +scikit-learn>=1.2.2 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 diff --git a/tests/unit/ml/test_api_primitives.py b/tests/unit/ml/test_api_primitives.py index 00a51ccfe9..dd2ceff143 100644 --- a/tests/unit/ml/test_api_primitives.py +++ b/tests/unit/ml/test_api_primitives.py @@ -13,8 +13,6 @@ # limitations under the License. import pytest -import sklearn.decomposition as sklearn_decomposition # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore import bigframes.ml.decomposition import bigframes.ml.linear_model @@ -35,8 +33,9 @@ def test_base_estimator_repr(): assert pca_estimator.__repr__() == "PCA(n_components=7)" -@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn") def test_base_estimator_repr_matches_sklearn(): + sklearn_decomposition = pytest.importorskip("sklearn.decomposition") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") estimator = bigframes.ml.linear_model.LinearRegression() sklearn_estimator = sklearn_linear_model.LinearRegression() assert estimator.__repr__() == sklearn_estimator.__repr__() diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 395296f3e4..450ce8d6ee 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -15,8 +15,6 @@ from google.cloud import bigquery import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, preprocessing from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer @@ -119,6 +117,8 @@ def test_columntransformer_repr(): def test_columntransformer_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_column_transformer = compose.ColumnTransformer( [ ( diff --git a/tests/unit/ml/test_pipeline.py b/tests/unit/ml/test_pipeline.py index ed5c621b1d..beebb9f282 100644 --- a/tests/unit/ml/test_pipeline.py +++ b/tests/unit/ml/test_pipeline.py @@ -13,10 +13,6 @@ # limitations under the License. import pytest -import sklearn.compose as sklearn_compose # type: ignore -import sklearn.linear_model as sklearn_linear_model # type: ignore -import sklearn.pipeline as sklearn_pipeline # type: ignore -import sklearn.preprocessing as sklearn_preprocessing # type: ignore from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing @@ -57,8 +53,11 @@ def test_pipeline_repr(): ) -@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn") def test_pipeline_repr_matches_sklearn(): + sklearn_compose = pytest.importorskip("sklearn.compose") + sklearn_linear_model = pytest.importorskip("sklearn.linear_model") + sklearn_pipeline = pytest.importorskip("sklearn.pipeline") + sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing") bf_pl = pipeline.Pipeline( [ ( From 5a221ecabb966928a0a183c862df64b8e15677e9 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 20 Feb 2025 19:14:26 +0000 Subject: [PATCH 12/20] remove scipy in test --- testing/constraints-3.9.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 9545db8e5c..cf94017a5a 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -19,7 +19,6 @@ pandas-gbq==0.26.0 pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 -scipy==1.6.0 scikit-learn>=1.2.2 sqlglot==23.6.3 tabulate==0.9 From ab352dbdbd6124c7e0a7538d42657a63bd974cb7 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 20 Feb 2025 20:55:54 +0000 Subject: [PATCH 13/20] add scipt=1.7.1 to test constraints --- testing/constraints-3.9.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index cf94017a5a..62bada7b9b 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -19,6 +19,7 @@ pandas-gbq==0.26.0 pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 +scipy==1.7.1 scikit-learn>=1.2.2 sqlglot==23.6.3 tabulate==0.9 From 10e6e31f3d719c61d0a906e06c2670aa57d689eb Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 20 Feb 2025 21:26:13 +0000 Subject: [PATCH 14/20] update scipy version --- testing/constraints-3.9.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 62bada7b9b..139e69a416 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -19,7 +19,7 @@ pandas-gbq==0.26.0 pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 -scipy==1.7.1 +scipy==1.9.0 scikit-learn>=1.2.2 sqlglot==23.6.3 tabulate==0.9 From 5eaeb5b91b2b6e95c41efdb9631f166566e8b0b5 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 20 Feb 2025 22:22:44 +0000 Subject: [PATCH 15/20] update constraint --- noxfile.py | 2 ++ testing/constraints-3.9.txt | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index fd772a073d..d0d814e502 100644 --- a/noxfile.py +++ b/noxfile.py @@ -657,6 +657,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( if match.group(1) not in already_installed ] + print(already_installed) + # We use --no-deps to ensure that pre-release versions aren't overwritten # by the version ranges in setup.py. session.install(*deps) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 139e69a416..a5bfb7ba53 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,7 +20,7 @@ pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 scipy==1.9.0 -scikit-learn>=1.2.2 +scikit-learn==1.2.2 sqlglot==23.6.3 tabulate==0.9 ipywidgets==7.7.1 From 571ec91bb263b5937ad292a6f9e63c7c3b88d5f9 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Thu, 20 Feb 2025 23:20:16 +0000 Subject: [PATCH 16/20] update scipy test --- tests/system/small/test_series.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 00f47c754e..2daa7dd825 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict): ), ) def test_series_interpolate(method): + pytest.importorskip("scipy") + values = [None, 1, 2, None, None, 16, None] index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8] pd_series = pd.Series(values, index) From c0b2df3f269122241576a6ee98e542530044c552 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 21 Feb 2025 03:17:52 +0000 Subject: [PATCH 17/20] skip some tests when sklearn doesn't exists --- tests/system/small/ml/test_metrics.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/system/small/ml/test_metrics.py b/tests/system/small/ml/test_metrics.py index 81e1b2f77f..b80202bdbe 100644 --- a/tests/system/small/ml/test_metrics.py +++ b/tests/system/small/ml/test_metrics.py @@ -17,7 +17,6 @@ import numpy as np import pandas as pd import pytest -import sklearn.metrics as sklearn_metrics # type: ignore import bigframes from bigframes.ml import metrics @@ -66,6 +65,7 @@ def test_r2_score_force_finite(session): def test_r2_score_ok_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session): def test_accuracy_score_fit_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]}) df = session.read_pandas(pd_df) @@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session): def test_roc_curve_binary_classification_prediction_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session): def test_roc_curve_binary_classification_decision_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") # Instead of operating on probabilities, assume a 70% decision threshold # has been applied, and operate on the final output y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45] @@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session): def test_roc_auc_score_returns_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1], @@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session): def test_confusion_matrix_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 3, 3, 3, 4, 1], @@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session): def test_confusion_matrix_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -603,6 +609,7 @@ def test_recall_score(session): def test_recall_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session): def test_recall_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -673,6 +681,7 @@ def test_precision_score(session): def test_precision_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session): def test_precision_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], @@ -752,6 +762,7 @@ def test_f1_score(session): def test_f1_score_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": [2, 0, 2, 2, 0, 1], @@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session): def test_f1_score_str_matches_sklearn(session): + sklearn_metrics = pytest.importorskip("sklearn.metrics") pd_df = pd.DataFrame( { "y_true": ["cat", "ant", "cat", "cat", "ant", "bird"], From 3ec09cb98ce3eb36e8232bd983a4a871f31b9655 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Fri, 21 Feb 2025 10:06:50 -0800 Subject: [PATCH 18/20] Remove sklearn for unit except 3.12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- noxfile.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/noxfile.py b/noxfile.py index d0d814e502..2a03c9b89c 100644 --- a/noxfile.py +++ b/noxfile.py @@ -73,11 +73,7 @@ UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { - "3.9": ["scikit-learn"], - "3.10": ["scikit-learn"], - "3.11": ["scikit-learn"], "3.12": ["polars", "scikit-learn"], - "3.13": ["scikit-learn"], } # 3.10 is needed for Windows tests as it is the only version installed in the From 565cda5317e558782af969c70a4089afb76b3b86 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 21 Feb 2025 18:24:53 +0000 Subject: [PATCH 19/20] update auc --- bigframes/ml/metrics/_metrics.py | 19 +----------------- noxfile.py | 9 +++++++-- .../sklearn/metrics/_ranking.py | 20 ++++++++++++++++++- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 7558529201..658818b261 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -177,24 +177,7 @@ def auc( x_pandas = x_series.to_pandas() y_pandas = y_series.to_pandas() - - if len(x_pandas) < 2: - raise ValueError( - f"At least 2 points are needed to compute area under curve, but x.shape = {len(x_pandas)}" - ) - - if x_pandas.is_monotonic_decreasing: - d = -1 - elif x_pandas.is_monotonic_increasing: - d = 1 - else: - raise ValueError(f"x is neither increasing nor decreasing : {x_pandas}.") - - if hasattr(np, "trapezoid"): - # new in numpy 2.0 - return d * np.trapezoid(y_pandas, x_pandas) - # np.trapz has been deprecated in 2.0 - return d * np.trapz(y_pandas, x_pandas) # type: ignore + return vendored_metrics_ranking.auc(x_pandas, y_pandas) auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc) diff --git a/noxfile.py b/noxfile.py index 2a03c9b89c..bffb6ebaa0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -98,8 +98,13 @@ ] SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = [] SYSTEM_TEST_DEPENDENCIES: List[str] = [] -SYSTEM_TEST_EXTRAS: List[str] = ["tests", "scikit-learn"] -SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {} +SYSTEM_TEST_EXTRAS: List[str] = [] +SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { + "3.9": ["tests"], + "3.10": ["tests"], + "3.12": ["tests", "scikit-learn"], + "3.13": ["tests"], +} LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME" diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 7b97526de2..9262ffbd3d 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -16,6 +16,8 @@ # Michal Karbownik # License: BSD 3 clause +import numpy as np + from bigframes import constants @@ -60,7 +62,23 @@ def auc(x, y) -> float: Returns: float: Area Under the Curve. """ - raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + if len(x) < 2: + raise ValueError( + f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}" + ) + + if x.is_monotonic_decreasing: + d = -1 + elif x.is_monotonic_increasing: + d = 1 + else: + raise ValueError(f"x is neither increasing nor decreasing : {x}.") + + if hasattr(np, "trapezoid"): + # new in numpy 2.0 + return d * np.trapezoid(y, x) + # np.trapz has been deprecated in 2.0 + return d * np.trapz(y, x) # type: ignore def roc_auc_score(y_true, y_score) -> float: From 2b612236a6e16a0722c80fd8a4357b70e5996d63 Mon Sep 17 00:00:00 2001 From: Huan Chen Date: Fri, 21 Feb 2025 18:40:05 +0000 Subject: [PATCH 20/20] remove scipy again --- testing/constraints-3.9.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index a5bfb7ba53..30d5c1c3a7 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -19,7 +19,6 @@ pandas-gbq==0.26.0 pyarrow==10.0.1 pydata-google-auth==1.8.2 requests==2.27.1 -scipy==1.9.0 scikit-learn==1.2.2 sqlglot==23.6.3 tabulate==0.9