From 1f87562f82e889a310d1aa4b31b92912741e5840 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Mon, 2 Oct 2023 23:00:14 +0000 Subject: [PATCH 1/6] feat: add ml.preprocessing.KBinsDiscretizer --- bigframes/ml/compose.py | 9 +- bigframes/ml/pipeline.py | 12 +- bigframes/ml/preprocessing.py | 127 ++++++++++++++- bigframes/ml/sql.py | 11 +- tests/system/large/ml/test_pipeline.py | 38 +++++ tests/system/small/ml/test_preprocessing.py | 150 +++++++++++++++++- tests/unit/ml/test_compose.py | 25 +++ tests/unit/ml/test_sql.py | 9 ++ .../sklearn/preprocessing/_discretization.py | 44 +++++ 9 files changed, 411 insertions(+), 14 deletions(-) create mode 100644 third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 9effbf1968..553a22b005 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -31,6 +31,7 @@ preprocessing.StandardScaler, preprocessing.MaxAbsScaler, preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, ] @@ -91,7 +92,9 @@ def transformers_( return result - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql( + self, columns: List[str], X: Union[bpd.DataFrame, bpd.Series] + ) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -102,7 +105,7 @@ def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: Returns: a list of tuples of (sql_expression, output_name)""" return [ - transformer._compile_to_sql([column])[0] + transformer._compile_to_sql([column], X=X)[0] for column in columns for _, transformer, target_column in self.transformers_ if column == target_column @@ -115,7 +118,7 @@ def fit( ) -> ColumnTransformer: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._compile_to_sql(X.columns.tolist()) + compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] self._bqml_model = self._bqml_model_factory.create_model( diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index ac02c39112..ad0b3fae11 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -52,6 +52,7 @@ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): preprocessing.OneHotEncoder, preprocessing.MaxAbsScaler, preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, ), ): @@ -93,7 +94,7 @@ def fit( ) -> Pipeline: (X,) = utils.convert_to_dataframe(X) - compiled_transforms = self._transform._compile_to_sql(X.columns.tolist()) + compiled_transforms = self._transform._compile_to_sql(X.columns.tolist(), X=X) transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] if y is not None: @@ -151,6 +152,7 @@ def _extract_as_column_transformer( preprocessing.StandardScaler, preprocessing.MaxAbsScaler, preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, ], Union[str, List[str]], @@ -190,6 +192,13 @@ def _extract_as_column_transformer( *preprocessing.MinMaxScaler._parse_from_sql(transform_sql), ) ) + elif transform_sql.startswith("ML.BUCKETIZE"): + transformers.append( + ( + "k_bins_discretizer", + *preprocessing.KBinsDiscretizer._parse_from_sql(transform_sql), + ) + ) elif transform_sql.startswith("ML.LABEL_ENCODER"): transformers.append( ( @@ -213,6 +222,7 @@ def _merge_column_transformer( preprocessing.OneHotEncoder, preprocessing.MaxAbsScaler, preprocessing.MinMaxScaler, + preprocessing.KBinsDiscretizer, preprocessing.LabelEncoder, ]: """Try to merge the column transformer to a simple transformer.""" diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index caf4657a63..cbba4bd2c2 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -23,6 +23,7 @@ from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.preprocessing._data +import third_party.bigframes_vendored.sklearn.preprocessing._discretization import third_party.bigframes_vendored.sklearn.preprocessing._encoder import third_party.bigframes_vendored.sklearn.preprocessing._label @@ -44,12 +45,13 @@ def __init__(self): def __eq__(self, other: Any) -> bool: return type(other) is StandardScaler and self._bqml_model == other._bqml_model - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: a list of column names to transform + columns: + a list of column names to transform Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -124,12 +126,13 @@ def __init__(self): def __eq__(self, other: Any) -> bool: return type(other) is MaxAbsScaler and self._bqml_model == other._bqml_model - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: a list of column names to transform + columns: + a list of column names to transform Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -204,12 +207,13 @@ def __init__(self): def __eq__(self, other: Any) -> bool: return type(other) is MinMaxScaler and self._bqml_model == other._bqml_model - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause Args: - columns: a list of column names to transform + columns: + a list of column names to transform Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -267,6 +271,113 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +class KBinsDiscretizer( + base.Transformer, + third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer, +): + __doc__ = ( + third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer.__doc__ + ) + + def __init__( + self, + n_bins: int = 5, + ): + self.n_bins = n_bins + self._bqml_model: Optional[core.BqmlModel] = None + self._bqml_model_factory = globals.bqml_model_factory() + self._base_sql_generator = globals.base_sql_generator() + + # TODO(garrettwu): implement __hash__ + def __eq__(self, other: Any) -> bool: + return ( + type(other) is KBinsDiscretizer + and self.n_bins == other.n_bins + and self._bqml_model == other._bqml_model + ) + + def _compile_to_sql( + self, + columns: List[str], + X: Union[bpd.DataFrame, bpd.Series], + ) -> List[Tuple[str, str]]: + """Compile this transformer to a list of SQL expressions that can be included in + a BQML TRANSFORM clause + + Args: + columns: + a list of column names to transform + X: + The Dataframe or Series with training data. + + Returns: a list of tuples of (sql_expression, output_name)""" + array_split_points = {} + for column in columns: + min_value = X[column].min() + max_value = X[column].max() + bin_size = (max_value - min_value) / self.n_bins + array_split_points[column] = [ + min_value + i * bin_size for i in range(self.n_bins) + ] + + return [ + ( + self._base_sql_generator.ml_k_bind_discretizer( + column, array_split_points[column], f"kbinsdiscretizer_{column}" + ), + f"kbinsdiscretizer_{column}", + ) + for column in columns + ] + + @classmethod + def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]: + """Parse SQL to tuple(KBinsDiscretizer, column_label). + + Args: + sql: SQL string of format "ML.BUCKETIZE({col_label}, array_split_points, FALSE) OVER()" + + Returns: + tuple(KBinsDiscretizer, column_label)""" + s = sql[sql.find("(") + 1 : sql.find(")")] + array_split_points = s[s.find("[") + 1 : s.find("]")] + col_label = s[: s.find(",")] + n_bins = array_split_points.count(",") + 1 + return cls(n_bins), col_label + + def fit( + self, + X: Union[bpd.DataFrame, bpd.Series], + y=None, # ignored + ) -> KBinsDiscretizer: + (X,) = utils.convert_to_dataframe(X) + + compiled_transforms = self._compile_to_sql(X.columns.tolist(), X) + transform_sqls = [transform_sql for transform_sql, _ in compiled_transforms] + + self._bqml_model = self._bqml_model_factory.create_model( + X, + options={"model_type": "transform_only"}, + transforms=transform_sqls, + ) + + # The schema of TRANSFORM output is not available in the model API, so save it during fitting + self._output_names = [name for _, name in compiled_transforms] + return self + + def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("Must be fitted before transform") + + (X,) = utils.convert_to_dataframe(X) + + df = self._bqml_model.transform(X) + return typing.cast( + bpd.DataFrame, + df[self._output_names], + ) + + class OneHotEncoder( base.Transformer, third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, @@ -308,7 +419,7 @@ def __eq__(self, other: Any) -> bool: and self.max_categories == other.max_categories ) - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -432,7 +543,7 @@ def __eq__(self, other: Any) -> bool: and self.max_categories == other.max_categories ) - def _compile_to_sql(self, columns: List[str]) -> List[Tuple[str, str]]: + def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 57c8ba672a..89f147e5f5 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -16,7 +16,7 @@ Generates SQL queries needed for BigQuery DataFrames ML """ -from typing import Iterable, Mapping, Optional, Union +from typing import Iterable, List, Mapping, Optional, Union import bigframes.constants as constants import bigframes.pandas as bpd @@ -85,6 +85,15 @@ def ml_min_max_scaler(self, numeric_expr_sql: str, name: str) -> str: """Encode ML.MIN_MAX_SCALER for BQML""" return f"""ML.MIN_MAX_SCALER({numeric_expr_sql}) OVER() AS {name}""" + def ml_k_bind_discretizer( + self, + numeric_expr_sql: str, + array_split_points: List[str], + name: str, + ) -> str: + """Encode ML.MIN_MAX_SCALER for BQML""" + return f"""ML.BUCKETIZE({numeric_expr_sql}, {array_split_points}, FALSE) AS {name}""" + def ml_one_hot_encoder( self, numeric_expr_sql: str, diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 34a2ca0101..67e7f3ed6b 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -580,6 +580,11 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -657,6 +662,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(), + ["culmen_length_mm", "flipper_length_mm"], + ), ( "label", preprocessing.LabelEncoder(), @@ -696,9 +706,11 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), ("max_abs_scaler", preprocessing.MaxAbsScaler(), "culmen_length_mm"), ("min_max_scaler", preprocessing.MinMaxScaler(), "culmen_length_mm"), + ("k_bins_discretizer", preprocessing.KBinsDiscretizer(), "culmen_length_mm"), ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), ("max_abs_scaler", preprocessing.MaxAbsScaler(), "flipper_length_mm"), ("min_max_scaler", preprocessing.MinMaxScaler(), "flipper_length_mm"), + ("k_bins_discretizer", preprocessing.KBinsDiscretizer(), "flipper_length_mm"), ] assert transformers == expected @@ -791,6 +803,32 @@ def test_pipeline_min_max_scaler_to_gbq(penguins_df_default_index, dataset_id): assert pl_loaded._estimator.fit_intercept is False +def test_pipeline_k_bins_discretizer_to_gbq(penguins_df_default_index, dataset_id): + pl = pipeline.Pipeline( + [ + ("transform", preprocessing.KBinsDiscretizer()), + ("estimator", linear_model.LinearRegression(fit_intercept=False)), + ] + ) + + df = penguins_df_default_index.dropna() + X_train = df[ + [ + "culmen_length_mm", + ] + ] + y_train = df[["body_mass_g"]] + pl.fit(X_train, y_train) + + pl_loaded = pl.to_gbq( + f"{dataset_id}.test_penguins_pipeline_k_bins_discretizer", replace=True + ) + assert isinstance(pl_loaded._transform, preprocessing.KBinsDiscretizer) + + assert isinstance(pl_loaded._estimator, linear_model.LinearRegression) + assert pl_loaded._estimator.fit_intercept is False + + def test_pipeline_one_hot_encoder_to_gbq(penguins_df_default_index, dataset_id): pl = pipeline.Pipeline( [ diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index fc8f3251bd..ec3ca470e5 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -211,7 +211,7 @@ def test_max_abs_scaler_series_normalizes(penguins_df_default_index, new_penguin pd.testing.assert_frame_equal(result, expected, rtol=1e-3) -def test_min_max_scaler_normalizeds_fit_transform(new_penguins_df): +def test_min_max_scaler_normalized_fit_transform(new_penguins_df): scaler = bigframes.ml.preprocessing.MinMaxScaler() result = scaler.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] @@ -304,6 +304,154 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df): + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer() + result = discretizer.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_4"], + "kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_3", "bin_2"], + "kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_k_bins_discretizer_series_normalizes( + penguins_df_default_index, new_penguins_df +): + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer() + discretizer.fit(penguins_df_default_index["culmen_length_mm"]) + + result = discretizer.transform( + penguins_df_default_index["culmen_length_mm"] + ).to_pandas() + result = discretizer.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer() + discretizer.fit( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ) + + result = discretizer.transform( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ).to_pandas() + + result = discretizer.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_4", "bin_4"], + "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_k_bins_discretizer_normalizes_different_params( + penguins_df_default_index, new_penguins_df +): + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(n_bins=6) + discretizer.fit( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ) + + result = discretizer.transform( + penguins_df_default_index[ + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] + ] + ).to_pandas() + + result = discretizer.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_4", "bin_5"], + "kbinsdiscretizer_culmen_length_mm": ["bin_3", "bin_3", "bin_3"], + "kbinsdiscretizer_flipper_length_mm": ["bin_4", "bin_2", "bin_3"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_k_bins_discretizer_different_params(new_penguins_df): + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(n_bins=7) + result = discretizer.fit( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ) + result = discretizer.transform(new_penguins_df).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "kbinsdiscretizer_culmen_depth_mm": ["bin_8", "bin_2", "bin_5"], + "kbinsdiscretizer_culmen_length_mm": ["bin_8", "bin_4", "bin_2"], + "kbinsdiscretizer_flipper_length_mm": ["bin_8", "bin_2", "bin_5"], + }, + dtype="string[pyarrow]", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + def test_one_hot_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 8c8fbd6ab5..3d0809b345 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -23,6 +23,7 @@ def test_columntransformer_init_expectedtransforms(): standard_scaler_transformer = preprocessing.StandardScaler() max_abs_scaler_transformer = preprocessing.MaxAbsScaler() min_max_scaler_transformer = preprocessing.MinMaxScaler() + k_bins_discretizer_transformer = preprocessing.KBinsDiscretizer() label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ @@ -42,6 +43,11 @@ def test_columntransformer_init_expectedtransforms(): min_max_scaler_transformer, ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + k_bins_discretizer_transformer, + ["culmen_length_mm", "flipper_length_mm"], + ), ("label", label_transformer, "species"), ] ) @@ -54,6 +60,8 @@ def test_columntransformer_init_expectedtransforms(): ("max_abs_scale", max_abs_scaler_transformer, "flipper_length_mm"), ("min_max_scale", min_max_scaler_transformer, "culmen_length_mm"), ("min_max_scale", min_max_scaler_transformer, "flipper_length_mm"), + ("k_bins_discretizer", k_bins_discretizer_transformer, "culmen_length_mm"), + ("k_bins_discretizer", k_bins_discretizer_transformer, "flipper_length_mm"), ("label", label_transformer, "species"), ] @@ -81,6 +89,11 @@ def test_columntransformer_repr(): preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) @@ -92,6 +105,8 @@ def test_columntransformer_repr(): ('max_abs_scale', MaxAbsScaler(), ['culmen_length_mm', 'flipper_length_mm']), ('min_max_scale', MinMaxScaler(), + ['culmen_length_mm', 'flipper_length_mm']), + ('k_bins_discretizer', KBinsDiscretizer(), ['culmen_length_mm', 'flipper_length_mm'])])""" ) @@ -119,6 +134,11 @@ def test_columntransformer_repr_matches_sklearn(): preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) sk_column_transformer = sklearn_compose.ColumnTransformer( @@ -143,6 +163,11 @@ def test_columntransformer_repr_matches_sklearn(): sklearn_preprocessing.MinMaxScaler(), ["culmen_length_mm", "flipper_length_mm"], ), + ( + "k_bins_discretizer", + sklearn_preprocessing.KBinsDiscretizer(), + ["culmen_length_mm", "flipper_length_mm"], + ), ] ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index a3338e762d..56a3478d2b 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -95,6 +95,15 @@ def test_min_max_scaler_produces_correct_sql( assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a" +def test__k_bind_discretizer_produces_correct_sql( + base_sql_generator: ml_sql.BaseSqlGenerator, +): + sql = base_sql_generator.ml_k_bind_discretizer( + "col_a", [1, 2, 3, 4], "scaled_col_a" # type:ignore + ) + assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a" + + def test_one_hot_encoder_produces_correct_sql( base_sql_generator: ml_sql.BaseSqlGenerator, ): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py new file mode 100644 index 0000000000..acf5775d17 --- /dev/null +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -0,0 +1,44 @@ +# Author: Henry Lin +# Tom Dupré la Tour + +# License: BSD + +from bigframes import constants +from third_party.bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin + + +class KBinsDiscretizer(TransformerMixin, BaseEstimator): + """ + Bin continuous data into intervals. + + + Args: + n_bins (int, default 5): + The number of bins to produce. Raises ValueError if ``n_bins < 2``. + """ + + def fit(self, X, y=None): + """Fit the estimator. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The Dataframe or Series with training data. + + y (default None): + Ignored. + + Returns: + KBinsDiscretizer: Fitted scaler. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def transform(self, X): + """Discretize the data. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + The DataFrame or Series to be transformed. + + Returns: + bigframes.dataframe.DataFrame: Transformed result.""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From eae378f34e41d8f1e87b3e047c9bba7349bb4ff5 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 3 Oct 2023 03:09:35 +0000 Subject: [PATCH 2/6] fix: address all the comments --- bigframes/ml/compose.py | 6 ++- bigframes/ml/preprocessing.py | 53 +++++++++++++------ bigframes/ml/sql.py | 6 +-- tests/system/large/ml/test_pipeline.py | 18 +++++-- tests/system/small/ml/test_preprocessing.py | 43 ++++----------- tests/unit/ml/test_compose.py | 11 ++-- tests/unit/ml/test_sql.py | 6 +-- .../sklearn/preprocessing/_discretization.py | 7 ++- 8 files changed, 82 insertions(+), 68 deletions(-) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 553a22b005..bf046ff691 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -93,7 +93,9 @@ def transformers_( return result def _compile_to_sql( - self, columns: List[str], X: Union[bpd.DataFrame, bpd.Series] + self, + columns: List[str], + X: bpd.DataFrame, ) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -101,6 +103,8 @@ def _compile_to_sql( Args: columns (List[str]): a list of column names to transform + X (bpd.DataFrame): + The Dataframe with training data. Returns: a list of tuples of (sql_expression, output_name)""" diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index cbba4bd2c2..4d20477cf4 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -51,7 +51,9 @@ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: Args: columns: - a list of column names to transform + a list of column names to transform. + X (default None): + Ignored. Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -132,7 +134,9 @@ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: Args: columns: - a list of column names to transform + a list of column names to transform. + X (default None): + Ignored. Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -213,7 +217,9 @@ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: Args: columns: - a list of column names to transform + a list of column names to transform. + X (default None): + Ignored. Returns: a list of tuples of (sql_expression, output_name)""" return [ @@ -282,8 +288,18 @@ class KBinsDiscretizer( def __init__( self, n_bins: int = 5, + strategy: Literal["uniform", "quantile", "kmeans"] = "quantile", ): + if strategy != "uniform": + raise NotImplementedError( + f"Only strategy = 'uniform' is supported now, input is {strategy}." + ) + if n_bins < 2: + raise ValueError( + f"n_bins has to be larger than or equal to 2, input is {n_bins}." + ) self.n_bins = n_bins + self.strategy = strategy self._bqml_model: Optional[core.BqmlModel] = None self._bqml_model_factory = globals.bqml_model_factory() self._base_sql_generator = globals.base_sql_generator() @@ -299,7 +315,7 @@ def __eq__(self, other: Any) -> bool: def _compile_to_sql( self, columns: List[str], - X: Union[bpd.DataFrame, bpd.Series], + X: bpd.DataFrame, ) -> List[Tuple[str, str]]: """Compile this transformer to a list of SQL expressions that can be included in a BQML TRANSFORM clause @@ -308,21 +324,22 @@ def _compile_to_sql( columns: a list of column names to transform X: - The Dataframe or Series with training data. + The Dataframe with training data. Returns: a list of tuples of (sql_expression, output_name)""" array_split_points = {} - for column in columns: - min_value = X[column].min() - max_value = X[column].max() - bin_size = (max_value - min_value) / self.n_bins - array_split_points[column] = [ - min_value + i * bin_size for i in range(self.n_bins) - ] + if self.strategy == "uniform": + for column in columns: + min_value = X[column].min() + max_value = X[column].max() + bin_size = (max_value - min_value) / self.n_bins + array_split_points[column] = [ + min_value + i * bin_size for i in range(self.n_bins) + ] return [ ( - self._base_sql_generator.ml_k_bind_discretizer( + self._base_sql_generator.ml_bucketize( column, array_split_points[column], f"kbinsdiscretizer_{column}" ), f"kbinsdiscretizer_{column}", @@ -343,7 +360,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]: array_split_points = s[s.find("[") + 1 : s.find("]")] col_label = s[: s.find(",")] n_bins = array_split_points.count(",") + 1 - return cls(n_bins), col_label + return cls(n_bins, "uniform"), col_label def fit( self, @@ -425,7 +442,9 @@ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: Args: columns: - a list of column names to transform + a list of column names to transform. + X (default None): + Ignored. Returns: a list of tuples of (sql_expression, output_name)""" @@ -549,7 +568,9 @@ def _compile_to_sql(self, columns: List[str], X=None) -> List[Tuple[str, str]]: Args: columns: - a list of column names to transform + a list of column names to transform. + X (default None): + Ignored. Returns: a list of tuples of (sql_expression, output_name)""" diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 89f147e5f5..601b271099 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -16,7 +16,7 @@ Generates SQL queries needed for BigQuery DataFrames ML """ -from typing import Iterable, List, Mapping, Optional, Union +from typing import Iterable, Mapping, Optional, Union import bigframes.constants as constants import bigframes.pandas as bpd @@ -85,10 +85,10 @@ def ml_min_max_scaler(self, numeric_expr_sql: str, name: str) -> str: """Encode ML.MIN_MAX_SCALER for BQML""" return f"""ML.MIN_MAX_SCALER({numeric_expr_sql}) OVER() AS {name}""" - def ml_k_bind_discretizer( + def ml_bucketize( self, numeric_expr_sql: str, - array_split_points: List[str], + array_split_points: Iterable[Union[int, float]], name: str, ) -> str: """Encode ML.MIN_MAX_SCALER for BQML""" diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 67e7f3ed6b..9294740dd6 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -582,7 +582,7 @@ def test_pipeline_columntransformer_fit_predict(session, penguins_df_default_ind ), ( "k_bins_discretizer", - preprocessing.KBinsDiscretizer(), + preprocessing.KBinsDiscretizer(strategy="uniform"), ["culmen_length_mm", "flipper_length_mm"], ), ( @@ -664,7 +664,7 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id ), ( "k_bins_discretizer", - preprocessing.KBinsDiscretizer(), + preprocessing.KBinsDiscretizer(strategy="uniform"), ["culmen_length_mm", "flipper_length_mm"], ), ( @@ -706,11 +706,19 @@ def test_pipeline_columntransformer_to_gbq(penguins_df_default_index, dataset_id ("standard_scaler", preprocessing.StandardScaler(), "culmen_length_mm"), ("max_abs_scaler", preprocessing.MaxAbsScaler(), "culmen_length_mm"), ("min_max_scaler", preprocessing.MinMaxScaler(), "culmen_length_mm"), - ("k_bins_discretizer", preprocessing.KBinsDiscretizer(), "culmen_length_mm"), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(strategy="uniform"), + "culmen_length_mm", + ), ("standard_scaler", preprocessing.StandardScaler(), "flipper_length_mm"), ("max_abs_scaler", preprocessing.MaxAbsScaler(), "flipper_length_mm"), ("min_max_scaler", preprocessing.MinMaxScaler(), "flipper_length_mm"), - ("k_bins_discretizer", preprocessing.KBinsDiscretizer(), "flipper_length_mm"), + ( + "k_bins_discretizer", + preprocessing.KBinsDiscretizer(strategy="uniform"), + "flipper_length_mm", + ), ] assert transformers == expected @@ -806,7 +814,7 @@ def test_pipeline_min_max_scaler_to_gbq(penguins_df_default_index, dataset_id): def test_pipeline_k_bins_discretizer_to_gbq(penguins_df_default_index, dataset_id): pl = pipeline.Pipeline( [ - ("transform", preprocessing.KBinsDiscretizer()), + ("transform", preprocessing.KBinsDiscretizer(strategy="uniform")), ("estimator", linear_model.LinearRegression(fit_intercept=False)), ] ) diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index ec3ca470e5..a28a0f080a 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -121,7 +121,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui def test_max_abs_scaler_normalizes(penguins_df_default_index, new_penguins_df): - # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MaxAbsScaler, when BQML's change is in prod. scaler = bigframes.ml.preprocessing.MaxAbsScaler() scaler.fit( penguins_df_default_index[ @@ -265,7 +265,7 @@ def test_min_max_scaler_series_normalizes(penguins_df_default_index, new_penguin def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): - # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.MinMaxScaler, when BQML's change is in prod. scaler = bigframes.ml.preprocessing.MinMaxScaler() scaler.fit( penguins_df_default_index[ @@ -305,7 +305,7 @@ def test_min_max_scaler_normalizes(penguins_df_default_index, new_penguins_df): def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins_df): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer() + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") result = discretizer.fit_transform( new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] ).to_pandas() @@ -331,7 +331,7 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins def test_k_bins_discretizer_series_normalizes( penguins_df_default_index, new_penguins_df ): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer() + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") discretizer.fit(penguins_df_default_index["culmen_length_mm"]) result = discretizer.transform( @@ -356,8 +356,8 @@ def test_k_bins_discretizer_series_normalizes( def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_df): - # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer() + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(strategy="uniform") discretizer.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -393,8 +393,10 @@ def test_k_bins_discretizer_normalizes(penguins_df_default_index, new_penguins_d def test_k_bins_discretizer_normalizes_different_params( penguins_df_default_index, new_penguins_df ): - # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(n_bins=6) + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.KBinsDiscretizer, when BQML's change is in prod. + discretizer = bigframes.ml.preprocessing.KBinsDiscretizer( + n_bins=6, strategy="uniform" + ) discretizer.fit( penguins_df_default_index[ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] @@ -427,31 +429,6 @@ def test_k_bins_discretizer_normalizes_different_params( pd.testing.assert_frame_equal(result, expected, rtol=1e-3) -def test_k_bins_discretizer_different_params(new_penguins_df): - discretizer = bigframes.ml.preprocessing.KBinsDiscretizer(n_bins=7) - result = discretizer.fit( - new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] - ) - result = discretizer.transform(new_penguins_df).to_pandas() - - # TODO: bug? feature columns seem to be in nondeterministic random order - # workaround: sort columns by name. Can't repro it in pantheon, so could - # be a bigframes issue... - result = result.reindex(sorted(result.columns), axis=1) - - expected = pd.DataFrame( - { - "kbinsdiscretizer_culmen_depth_mm": ["bin_8", "bin_2", "bin_5"], - "kbinsdiscretizer_culmen_length_mm": ["bin_8", "bin_4", "bin_2"], - "kbinsdiscretizer_flipper_length_mm": ["bin_8", "bin_2", "bin_5"], - }, - dtype="string[pyarrow]", - index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), - ) - - pd.testing.assert_frame_equal(result, expected, rtol=1e-3) - - def test_one_hot_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() encoder.fit(new_penguins_df[["species", "sex"]]) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 3d0809b345..60dcc75b63 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -23,7 +23,7 @@ def test_columntransformer_init_expectedtransforms(): standard_scaler_transformer = preprocessing.StandardScaler() max_abs_scaler_transformer = preprocessing.MaxAbsScaler() min_max_scaler_transformer = preprocessing.MinMaxScaler() - k_bins_discretizer_transformer = preprocessing.KBinsDiscretizer() + k_bins_discretizer_transformer = preprocessing.KBinsDiscretizer(strategy="uniform") label_transformer = preprocessing.LabelEncoder() column_transformer = compose.ColumnTransformer( [ @@ -91,7 +91,7 @@ def test_columntransformer_repr(): ), ( "k_bins_discretizer", - preprocessing.KBinsDiscretizer(), + preprocessing.KBinsDiscretizer(strategy="uniform"), ["culmen_length_mm", "flipper_length_mm"], ), ] @@ -106,7 +106,8 @@ def test_columntransformer_repr(): ['culmen_length_mm', 'flipper_length_mm']), ('min_max_scale', MinMaxScaler(), ['culmen_length_mm', 'flipper_length_mm']), - ('k_bins_discretizer', KBinsDiscretizer(), + ('k_bins_discretizer', + KBinsDiscretizer(strategy='uniform'), ['culmen_length_mm', 'flipper_length_mm'])])""" ) @@ -136,7 +137,7 @@ def test_columntransformer_repr_matches_sklearn(): ), ( "k_bins_discretizer", - preprocessing.KBinsDiscretizer(), + preprocessing.KBinsDiscretizer(strategy="uniform"), ["culmen_length_mm", "flipper_length_mm"], ), ] @@ -165,7 +166,7 @@ def test_columntransformer_repr_matches_sklearn(): ), ( "k_bins_discretizer", - sklearn_preprocessing.KBinsDiscretizer(), + sklearn_preprocessing.KBinsDiscretizer(strategy="uniform"), ["culmen_length_mm", "flipper_length_mm"], ), ] diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 56a3478d2b..34a02edd42 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -95,12 +95,10 @@ def test_min_max_scaler_produces_correct_sql( assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a" -def test__k_bind_discretizer_produces_correct_sql( +def test_k_bins_discretizer_produces_correct_sql( base_sql_generator: ml_sql.BaseSqlGenerator, ): - sql = base_sql_generator.ml_k_bind_discretizer( - "col_a", [1, 2, 3, 4], "scaled_col_a" # type:ignore - ) + sql = base_sql_generator.ml_bucketize("col_a", [1, 2, 3, 4], "scaled_col_a") assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a" diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index acf5775d17..85422748c2 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -11,10 +11,15 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): """ Bin continuous data into intervals. - Args: n_bins (int, default 5): The number of bins to produce. Raises ValueError if ``n_bins < 2``. + strategy ({'uniform', 'quantile', 'kmeans'}, default='quantile'): + Strategy used to define the widths of the bins. 'uniform': All bins + in each feature have identical widths. 'quantile': All bins in each + feature have the same number of points. 'kmeans': Values in each bin + have the same nearest center of a 1D k-means cluster. Only + `uniform` is supported now. """ def fit(self, X, y=None): From fbab7439598b6ec7fb0b9c83397f989e8d0fd8a9 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 3 Oct 2023 18:20:43 +0000 Subject: [PATCH 3/6] fix: address additional comments --- bigframes/ml/preprocessing.py | 6 +++--- .../sklearn/preprocessing/_discretization.py | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 4d20477cf4..5f44d40218 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -288,7 +288,7 @@ class KBinsDiscretizer( def __init__( self, n_bins: int = 5, - strategy: Literal["uniform", "quantile", "kmeans"] = "quantile", + strategy: Literal["uniform", "quantile"] = "quantile", ): if strategy != "uniform": raise NotImplementedError( @@ -334,7 +334,7 @@ def _compile_to_sql( max_value = X[column].max() bin_size = (max_value - min_value) / self.n_bins array_split_points[column] = [ - min_value + i * bin_size for i in range(self.n_bins) + min_value + i * bin_size for i in range(self.n_bins - 1) ] return [ @@ -359,7 +359,7 @@ def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]: s = sql[sql.find("(") + 1 : sql.find(")")] array_split_points = s[s.find("[") + 1 : s.find("]")] col_label = s[: s.find(",")] - n_bins = array_split_points.count(",") + 1 + n_bins = array_split_points.count(",") + 2 return cls(n_bins, "uniform"), col_label def fit( diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index 85422748c2..0236558dd4 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -14,12 +14,10 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator): Args: n_bins (int, default 5): The number of bins to produce. Raises ValueError if ``n_bins < 2``. - strategy ({'uniform', 'quantile', 'kmeans'}, default='quantile'): + strategy ({'uniform', 'quantile'}, default='quantile'): Strategy used to define the widths of the bins. 'uniform': All bins in each feature have identical widths. 'quantile': All bins in each - feature have the same number of points. 'kmeans': Values in each bin - have the same nearest center of a 1D k-means cluster. Only - `uniform` is supported now. + feature have the same number of points. Only `uniform` is supported now. """ def fit(self, X, y=None): From 56395e0c97c8a9dd8ada41f44d51df614cbc0180 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 3 Oct 2023 19:10:16 +0000 Subject: [PATCH 4/6] fix: fix the failed test --- tests/system/small/ml/test_preprocessing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index a28a0f080a..45548acca3 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -317,9 +317,9 @@ def test_k_bins_discretizer_normalized_fit_transform_default_params(new_penguins expected = pd.DataFrame( { - "kbinsdiscretizer_culmen_depth_mm": ["bin_6", "bin_2", "bin_4"], - "kbinsdiscretizer_culmen_length_mm": ["bin_6", "bin_3", "bin_2"], - "kbinsdiscretizer_flipper_length_mm": ["bin_6", "bin_2", "bin_4"], + "kbinsdiscretizer_culmen_depth_mm": ["bin_5", "bin_2", "bin_4"], + "kbinsdiscretizer_culmen_length_mm": ["bin_5", "bin_3", "bin_2"], + "kbinsdiscretizer_flipper_length_mm": ["bin_5", "bin_2", "bin_4"], }, dtype="string[pyarrow]", index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), From 56a5049c2ebaa2df45933c56caee8853e1b7f82c Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 3 Oct 2023 22:30:05 +0000 Subject: [PATCH 5/6] Empty commit From ef35bdef1898a99e3caa99839678a6b53d1fe723 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Wed, 4 Oct 2023 02:01:26 +0000 Subject: [PATCH 6/6] Trigger Kokoro