deps: remove scikit-learn and sqlalchemy as required dependencies by Genesis929 · Pull Request #1296 · googleapis/python-bigquery-dataframes · GitHub

Merged
Changes from all commits
25 commits
c66593a
Merge branch 'main' into dep_removal_huanc
Genesis929 Jan 17, 2025
c36fb8e
remove todo
Genesis929 Jan 17, 2025
ab18fe0
update dep
Genesis929 Jan 17, 2025
ae17f8b
update test env
Genesis929 Jan 17, 2025
f0cbe9a
Merge branch 'main' into dep_removal_huanc
Genesis929 Feb 19, 2025
207a6e3
code update
Genesis929 Feb 19, 2025
5a221ec
remove scipy in test
Genesis929 Feb 20, 2025
8f67cf7
Merge branch 'main' into dep_removal_huanc
Genesis929 Feb 20, 2025
ab352db
add scipy=1.7.1 to test constraints
Genesis929 Feb 20, 2025
10e6e31
update scipy version
Genesis929 Feb 20, 2025
5eaeb5b
update constraint
Genesis929 Feb 20, 2025
571ec91
update scipy test
Genesis929 Feb 20, 2025
c0b2df3
skip some tests when sklearn doesn't exist
Genesis929 Feb 21, 2025
2aa51b5
Merge branch 'main' into dep_removal_huanc
Genesis929 Feb 21, 2025
3ec09cb
Remove sklearn for unit except 3.12
Genesis929 Feb 21, 2025
565cda5
update auc
Genesis929 Feb 21, 2025
2b61223
remove scipy again
Genesis929 Feb 21, 2025
d41369a
Merge branch 'main' into dep_removal_huanc
Genesis929 Feb 21, 2025
7 changes: 3 additions & 4 deletions bigframes/ml/metrics/_metrics.py
@@ -25,7 +25,6 @@
import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression
import numpy as np
import pandas as pd
import sklearn.metrics as sklearn_metrics # type: ignore

from bigframes.ml import utils
import bigframes.pandas as bpd
@@ -176,9 +175,9 @@ def auc(
) -> float:
x_series, y_series = utils.batch_convert_to_series(x, y)

# TODO(b/286410053) Support ML exceptions and error handling.
auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas())
return auc
x_pandas = x_series.to_pandas()
y_pandas = y_series.to_pandas()
return vendored_metrics_ranking.auc(x_pandas, y_pandas)


auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc)
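With the scikit-learn dependency removed, auc now downloads the two Series and computes the area client-side via the vendored trapezoidal-rule helper (shown in _ranking.py further down). A hedged usage sketch, assuming an active BigQuery session:

    import bigframes.pandas as bpd
    from bigframes.ml import metrics

    x = bpd.Series([0.0, 0.5, 1.0])  # must be monotonically increasing or decreasing
    y = bpd.Series([0.0, 0.8, 1.0])
    print(metrics.auc(x, y))  # 0.65, computed locally with NumPy's trapezoidal rule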
20 changes: 14 additions & 6 deletions noxfile.py
@@ -72,7 +72,9 @@
UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = []
UNIT_TEST_DEPENDENCIES: List[str] = []
UNIT_TEST_EXTRAS: List[str] = []
UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]}
UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {
"3.12": ["polars", "scikit-learn"],
}

# 3.10 is needed for Windows tests as it is the only version installed in the
# bigframes-windows container image. For more information, search
@@ -96,8 +98,13 @@
]
SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = []
SYSTEM_TEST_DEPENDENCIES: List[str] = []
SYSTEM_TEST_EXTRAS: List[str] = ["tests"]
SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {}
SYSTEM_TEST_EXTRAS: List[str] = []
SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {
"3.9": ["tests"],
"3.10": ["tests"],
"3.12": ["tests", "scikit-learn"],
"3.13": ["tests"],
}
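These per-version dictionaries decide which extras get installed for a given interpreter. The sketch below shows the typical lookup-with-fallback; it is an assumption about how this noxfile consumes the dictionaries, not a copy of its helper code:

    # Assumed selection logic: an exact-version entry wins, otherwise fall back
    # to the (now empty) SYSTEM_TEST_EXTRAS list.
    def _select_extras(session_python: str) -> list:
        return SYSTEM_TEST_EXTRAS_BY_PYTHON.get(session_python, SYSTEM_TEST_EXTRAS)

    # _select_extras("3.12") -> ["tests", "scikit-learn"]
    # any version not listed above falls back to SYSTEM_TEST_EXTRAS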

LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"

@@ -468,8 +475,7 @@ def cover(session):
@nox.session(python=DEFAULT_PYTHON_VERSION)
def docs(session):
"""Build the docs for this library."""

session.install("-e", ".")
session.install("-e", ".[scikit-learn]")
Collaborator (review comment):
I'm very concerned if we need to install scikit-learn just to build the docs. Please remove.

Suggested change:
-    session.install("-e", ".[scikit-learn]")
+    session.install("-e", ".")

Collaborator Author (reply):
Not sure if it's possible to remove this, as we are calling publish_api_coverage.py, and one of the things it does is go through the sklearn APIs and generate a coverage report.

Collaborator (reply):
Good point. Thanks for clarifying.
session.install(
# We need to pin to specific versions of the `sphinxcontrib-*` packages
# which still support sphinx 4.x.
@@ -510,7 +516,7 @@ def docs(session):
def docfx(session):
"""Build the docfx yaml files for this library."""

session.install("-e", ".")
session.install("-e", ".[scikit-learn]")

session.install(
# We need to pin to specific versions of the `sphinxcontrib-*` packages
# which still support sphinx 4.x.
@@ -652,6 +658,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=(
if match.group(1) not in already_installed
]

print(already_installed)

# We use --no-deps to ensure that pre-release versions aren't overwritten
# by the version ranges in setup.py.
session.install(*deps)
2 changes: 2 additions & 0 deletions scripts/test_publish_api_coverage.py
@@ -19,6 +19,8 @@

from . import publish_api_coverage

pytest.importorskip("sklearn")


@pytest.fixture
def api_coverage_df():
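pytest.importorskip is the mechanism this PR leans on to make scikit-learn and scipy optional in tests: it imports and returns the named module, and skips the calling test (or, when used at module level as above, the whole file) if the import fails. A small self-contained illustration:

    import pytest


    def test_uses_optional_dependency():
        # Skipped cleanly when sklearn is absent; otherwise the module is returned.
        sklearn_metrics = pytest.importorskip("sklearn.metrics")
        assert sklearn_metrics.accuracy_score([1, 0], [1, 0]) == 1.0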
11 changes: 8 additions & 3 deletions setup.py
@@ -55,8 +55,6 @@
"pyarrow >=10.0.1",
"pydata-google-auth >=1.8.2",
"requests >=2.27.1",
"scikit-learn >=1.2.2",
"sqlalchemy >=1.4,<3.0dev",
"sqlglot >=23.6.3",
"tabulate >=0.9",
"ipywidgets >=7.7.1",
@@ -77,8 +75,15 @@
"tests": [],
# used for local engine, which is only needed for unit tests at present.
"polars": ["polars >= 1.7.0"],
"scikit-learn": ["scikit-learn>=1.2.2"],
# Packages required for basic development flow.
"dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"],
"dev": [
"pytest",
"pytest-mock",
"pre-commit",
"nox",
"google-cloud-testutils",
],
}
extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values()))))

1 change: 0 additions & 1 deletion testing/constraints-3.9.txt
@@ -20,7 +20,6 @@ pyarrow==10.0.1
pydata-google-auth==1.8.2
requests==2.27.1
scikit-learn==1.2.2
sqlalchemy==1.4
sqlglot==23.6.3
tabulate==0.9
ipywidgets==7.7.1
14 changes: 13 additions & 1 deletion tests/system/small/ml/test_metrics.py
@@ -17,7 +17,6 @@
import numpy as np
import pandas as pd
import pytest
import sklearn.metrics as sklearn_metrics # type: ignore

import bigframes
from bigframes.ml import metrics
@@ -66,6 +65,7 @@ def test_r2_score_force_finite(session):


def test_r2_score_ok_fit_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]})

df = session.read_pandas(pd_df)
@@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session):


def test_accuracy_score_fit_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]})

df = session.read_pandas(pd_df)
@@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session):


def test_roc_curve_binary_classification_prediction_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1],
@@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session):


def test_roc_curve_binary_classification_decision_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
# Instead of operating on probabilities, assume a 70% decision threshold
# has been applied, and operate on the final output
y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]
@@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session):


def test_roc_auc_score_returns_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1],
@@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session):


def test_confusion_matrix_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 3, 3, 3, 4, 1],
@@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session):


def test_confusion_matrix_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -603,6 +609,7 @@ def test_recall_score(session):


def test_recall_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session):


def test_recall_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -673,6 +681,7 @@ def test_precision_score(session):


def test_precision_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session):


def test_precision_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -752,6 +762,7 @@ def test_f1_score(session):


def test_f1_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session):


def test_f1_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
2 changes: 2 additions & 0 deletions tests/system/small/test_series.py
@@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict):
),
)
def test_series_interpolate(method):
pytest.importorskip("scipy")

values = [None, 1, 2, None, None, 16, None]
index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8]
pd_series = pd.Series(values, index)
5 changes: 2 additions & 3 deletions tests/unit/ml/test_api_primitives.py
@@ -13,8 +13,6 @@
# limitations under the License.

import pytest
import sklearn.decomposition as sklearn_decomposition # type: ignore
import sklearn.linear_model as sklearn_linear_model # type: ignore

import bigframes.ml.decomposition
import bigframes.ml.linear_model
@@ -35,8 +33,9 @@ def test_base_estimator_repr():
assert pca_estimator.__repr__() == "PCA(n_components=7)"


@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn")
def test_base_estimator_repr_matches_sklearn():
sklearn_decomposition = pytest.importorskip("sklearn.decomposition")
sklearn_linear_model = pytest.importorskip("sklearn.linear_model")
estimator = bigframes.ml.linear_model.LinearRegression()
sklearn_estimator = sklearn_linear_model.LinearRegression()
assert estimator.__repr__() == sklearn_estimator.__repr__()
4 changes: 2 additions & 2 deletions tests/unit/ml/test_compose.py
@@ -15,8 +15,6 @@

from google.cloud import bigquery
import pytest
import sklearn.compose as sklearn_compose # type: ignore
import sklearn.preprocessing as sklearn_preprocessing # type: ignore

from bigframes.ml import compose, preprocessing
from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer
@@ -119,6 +117,8 @@ def test_columntransformer_repr():


def test_columntransformer_repr_matches_sklearn():
sklearn_compose = pytest.importorskip("sklearn.compose")
sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing")
bf_column_transformer = compose.ColumnTransformer(
[
(
9 changes: 4 additions & 5 deletions tests/unit/ml/test_pipeline.py
@@ -13,10 +13,6 @@
# limitations under the License.

import pytest
import sklearn.compose as sklearn_compose # type: ignore
import sklearn.linear_model as sklearn_linear_model # type: ignore
import sklearn.pipeline as sklearn_pipeline # type: ignore
import sklearn.preprocessing as sklearn_preprocessing # type: ignore

from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing

@@ -57,8 +53,11 @@ def test_pipeline_repr():
)


@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn")
def test_pipeline_repr_matches_sklearn():
sklearn_compose = pytest.importorskip("sklearn.compose")
sklearn_linear_model = pytest.importorskip("sklearn.linear_model")
sklearn_pipeline = pytest.importorskip("sklearn.pipeline")
sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing")
bf_pl = pipeline.Pipeline(
[
(
20 changes: 19 additions & 1 deletion third_party/bigframes_vendored/sklearn/metrics/_ranking.py
@@ -16,6 +16,8 @@
# Michal Karbownik <michakarbownik@gmail.com>
# License: BSD 3 clause

import numpy as np

from bigframes import constants


@@ -60,7 +62,23 @@ def auc(x, y) -> float:
Returns:
float: Area Under the Curve.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
if len(x) < 2:
raise ValueError(
f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}"
)

if x.is_monotonic_decreasing:
d = -1
elif x.is_monotonic_increasing:
d = 1
else:
raise ValueError(f"x is neither increasing nor decreasing : {x}.")

if hasattr(np, "trapezoid"):
# new in numpy 2.0
return d * np.trapezoid(y, x)
# np.trapz has been deprecated in 2.0
return d * np.trapz(y, x) # type: ignore


def roc_auc_score(y_true, y_score) -> float:
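A short worked example of the vendored auc above, assuming x and y behave like pandas Series (which expose is_monotonic_increasing/is_monotonic_decreasing); the helper below mirrors the diff for illustration only:

    import numpy as np
    import pandas as pd


    def vendored_auc(x: pd.Series, y: pd.Series) -> float:
        # Mirrors the vendored implementation above.
        if len(x) < 2:
            raise ValueError("At least 2 points are needed to compute area under curve")
        if x.is_monotonic_decreasing:
            d = -1
        elif x.is_monotonic_increasing:
            d = 1
        else:
            raise ValueError("x is neither increasing nor decreasing")
        trapezoid = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
        return d * trapezoid(y, x)


    increasing_x = pd.Series([0.0, 0.5, 1.0])
    y = pd.Series([0.0, 0.8, 1.0])
    print(vendored_auc(increasing_x, y))              # 0.65
    print(vendored_auc(increasing_x[::-1], y[::-1]))  # 0.65 too: the d = -1 branch flips the sign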