diff --git a/.circleci/config.yml b/.circleci/config.yml index 447b3dc209c35..b9ec23c5da56d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -24,9 +24,6 @@ jobs: - OPENBLAS_NUM_THREADS: 2 - CONDA_ENV_NAME: testenv - LOCK_FILE: build_tools/circle/doc_min_dependencies_linux-64_conda.lock - # Sphinx race condition in doc-min-dependencies is causing job to stall - # Here we run the job serially - - SPHINX_NUMJOBS: 1 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh @@ -61,8 +58,6 @@ jobs: - OPENBLAS_NUM_THREADS: 2 - CONDA_ENV_NAME: testenv - LOCK_FILE: build_tools/circle/doc_linux-64_conda.lock - # Disable sphinx parallelism to avoid EOFError or job stalling in CircleCI - - SPHINX_NUMJOBS: 1 steps: - checkout - run: ./build_tools/circle/checkout_merge_commit.sh diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml index cca5c3f6adf73..826aa0ed8a4b1 100644 --- a/.github/workflows/publish_pypi.yml +++ b/.github/workflows/publish_pypi.yml @@ -13,6 +13,9 @@ on: jobs: publish: runs-on: ubuntu-latest + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 @@ -35,15 +38,10 @@ jobs: run: | python build_tools/github/check_wheels.py - name: Publish package to TestPyPI - uses: pypa/gh-action-pypi-publish@v1.4.1 + uses: pypa/gh-action-pypi-publish@v1.8.5 with: - user: __token__ - password: ${{ secrets.TEST_PYPI_TOKEN }} repository_url: https://test.pypi.org/legacy/ if: ${{ github.event.inputs.pypi_repo == 'testpypi' }} - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@v1.4.1 - with: - user: __token__ - password: ${{ secrets.PYPI_TOKEN }} + uses: pypa/gh-action-pypi-publish@v1.8.5 if: ${{ github.event.inputs.pypi_repo == 'pypi' }} diff --git a/doc/Makefile b/doc/Makefile index fa9921007f8eb..5b67892251ecf 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -7,18 +7,25 @@ SPHINXBUILD ?= 
sphinx-build PAPER = BUILDDIR = _build -# Run sequential by default, unless SPHINX_NUMJOBS is set. -SPHINX_NUMJOBS ?= 1 - ifneq ($(EXAMPLES_PATTERN),) EXAMPLES_PATTERN_OPTS := -D sphinx_gallery_conf.filename_pattern="$(EXAMPLES_PATTERN)" endif +ifeq ($(CI), true) + # On CircleCI using -j2 does not seem to speed up the html-noplot build + SPHINX_NUMJOBS_NOPLOT_DEFAULT=1 +else ifeq ($(shell uname), Darwin) + # Avoid stalling issues on MacOS + SPHINX_NUMJOBS_NOPLOT_DEFAULT=1 +else + SPHINX_NUMJOBS_NOPLOT_DEFAULT=auto +endif + # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -T -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)\ - -j$(SPHINX_NUMJOBS) $(EXAMPLES_PATTERN_OPTS) . + $(EXAMPLES_PATTERN_OPTS) . .PHONY: help clean html dirhtml ziphtml pickle json latex latexpdf changes linkcheck doctest optipng @@ -44,19 +51,27 @@ clean: -rm -rf generated/* -rm -rf modules/generated/ +# Default to SPHINX_NUMJOBS=1 for full documentation build. Using +# SPHINX_NUMJOBS!=1 may actually slow down the build, or cause weird issues in +# the CI (job stalling or EOFError), see +# https://github.com/scikit-learn/scikit-learn/pull/25836 or +# https://github.com/scikit-learn/scikit-learn/pull/25809 +html: SPHINX_NUMJOBS ?= 1 html: # These two lines make the build a bit more lengthy, and the # the embedding of images more robust rm -rf $(BUILDDIR)/html/_images #rm -rf _build/doctrees/ - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) -j$(SPHINX_NUMJOBS) $(BUILDDIR)/html/stable @echo @echo "Build finished.
The HTML pages are in $(BUILDDIR)/html/stable" -# rm $(BUILDDIR)/html/stable/index.html -# mv $(BUILDDIR)/html/stable/fork_index.html $(BUILDDIR)/html/stable/index.html +# Default to SPHINX_NUMJOBS=auto (except on MacOS and CI) since this makes +# html-noplot build faster +html-noplot: SPHINX_NUMJOBS ?= $(SPHINX_NUMJOBS_NOPLOT_DEFAULT) html-noplot: - $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) -j$(SPHINX_NUMJOBS) \ + $(BUILDDIR)/html/stable @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable." diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index f59a0d89fe6fd..9de2dcec080ff 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -226,8 +226,8 @@ neighbors of samples with missing values:: Missing value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001 Pages 520-525. -Keeping the number of features constants -======================================== +Keeping the number of features constant +======================================= By default, the scikit-learn imputers will drop fully empty features, i.e. columns containing only missing values. For instance:: diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 56781fd8685e6..70c9c8b2b83f9 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -115,18 +115,12 @@ Usage examples: >>> clf = svm.SVC(random_state=0) >>> cross_val_score(clf, X, y, cv=5, scoring='recall_macro') array([0.96..., 0.96..., 0.96..., 0.93..., 1. ]) - >>> model = svm.SVC() - >>> cross_val_score(model, X, y, cv=5, scoring='wrong_choice') - Traceback (most recent call last): - ValueError: 'wrong_choice' is not a valid scoring value. Use - sklearn_fork.metrics.get_scorer_names() to get valid options. .. 
note:: - The values listed by the ``ValueError`` exception correspond to the - functions measuring prediction accuracy described in the following - sections. You can retrieve the names of all available scorers by calling - :func:`~sklearn_fork.metrics.get_scorer_names`. + If a wrong scoring name is passed, an ``InvalidParameterError`` is raised. + You can retrieve the names of all available scorers by calling + :func:`~sklearn_fork.metrics.get_scorer_names`. .. currentmodule:: sklearn_fork.metrics diff --git a/examples/model_selection/plot_permutation_tests_for_classification.py b/examples/model_selection/plot_permutation_tests_for_classification.py index 574f3912b4ae6..32c53573a4dfa 100644 --- a/examples/model_selection/plot_permutation_tests_for_classification.py +++ b/examples/model_selection/plot_permutation_tests_for_classification.py @@ -95,7 +95,7 @@ score_label = f"Score on original\ndata: {score_iris:.2f}\n(p-value: {pvalue_iris:.3f})" ax.text(0.7, 10, score_label, fontsize=12) ax.set_xlabel("Accuracy score") -_ = ax.set_ylabel("Probability") +_ = ax.set_ylabel("Probability density") # %% # Random data @@ -116,7 +116,7 @@ score_label = f"Score on original\ndata: {score_rand:.2f}\n(p-value: {pvalue_rand:.3f})" ax.text(0.14, 7.5, score_label, fontsize=12) ax.set_xlabel("Accuracy score") -ax.set_ylabel("Probability") +ax.set_ylabel("Probability density") plt.show() # %% diff --git a/pyproject.toml b/pyproject.toml index 5aae32d5b14bb..4d95bc6e09770 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,8 @@ ignore = [ # Exclude files are generated from tempita templates exclude= ''' ( - sklearn_fork/_loss/_loss.pyx + asv_benchmarks/ + | sklearn_fork/_loss/_loss.pyx | sklearn_fork/linear_model/_sag_fast.pyx | sklearn_fork/linear_model/_sgd_fast.pyx | sklearn_fork/utils/_seq_dataset.pyx diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py new file mode 100644 index 0000000000000..e62df15037b7d --- /dev/null +++
b/sklearn/tests/test_public_functions.py @@ -0,0 +1,340 @@ +from importlib import import_module +from inspect import signature +from numbers import Integral, Real + +import pytest + +from sklearn.utils._param_validation import generate_invalid_param_val +from sklearn.utils._param_validation import generate_valid_param +from sklearn.utils._param_validation import make_constraint +from sklearn.utils._param_validation import InvalidParameterError +from sklearn.utils._param_validation import Interval + + +def _get_func_info(func_module): + module_name, func_name = func_module.rsplit(".", 1) + module = import_module(module_name) + func = getattr(module, func_name) + + func_sig = signature(func) + func_params = [ + p.name + for p in func_sig.parameters.values() + if p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD) + ] + + # The parameters `*args` and `**kwargs` are ignored since we cannot generate + # constraints. + required_params = [ + p.name + for p in func_sig.parameters.values() + if p.default is p.empty and p.kind not in (p.VAR_POSITIONAL, p.VAR_KEYWORD) + ] + + return func, func_name, func_params, required_params + + +def _check_function_param_validation( + func, func_name, func_params, required_params, parameter_constraints +): + """Check that an informative error is raised when the value of a parameter does not + have an appropriate type or value. 
+ """ + # generate valid values for the required parameters + valid_required_params = {} + for param_name in required_params: + if parameter_constraints[param_name] == "no_validation": + valid_required_params[param_name] = 1 + else: + valid_required_params[param_name] = generate_valid_param( + make_constraint(parameter_constraints[param_name][0]) + ) + + # check that there is a constraint for each parameter + if func_params: + validation_params = parameter_constraints.keys() + unexpected_params = set(validation_params) - set(func_params) + missing_params = set(func_params) - set(validation_params) + err_msg = ( + "Mismatch between _parameter_constraints and the parameters of" + f" {func_name}.\nConsider the unexpected parameters {unexpected_params} and" + f" expected but missing parameters {missing_params}\n" + ) + assert set(validation_params) == set(func_params), err_msg + + # this object does not have a valid type for sure for all params + param_with_bad_type = type("BadType", (), {})() + + for param_name in func_params: + constraints = parameter_constraints[param_name] + + if constraints == "no_validation": + # This parameter is not validated + continue + + # Mixing an interval of reals and an interval of integers must be avoided. + if any( + isinstance(constraint, Interval) and constraint.type == Integral + for constraint in constraints + ) and any( + isinstance(constraint, Interval) and constraint.type == Real + for constraint in constraints + ): + raise ValueError( + f"The constraint for parameter {param_name} of {func_name} can't have a" + " mix of intervals of Integral and Real types. Use the type" + " RealNotInt instead of Real." + ) + + match = ( + rf"The '{param_name}' parameter of {func_name} must be .* Got .* instead." + ) + + # First, check that the error is raised if param doesn't match any valid type. 
+ with pytest.raises(InvalidParameterError, match=match): + func(**{**valid_required_params, param_name: param_with_bad_type}) + + # Then, for constraints that are more than a type constraint, check that the + # error is raised if param does match a valid type but does not match any valid + # value for this type. + constraints = [make_constraint(constraint) for constraint in constraints] + + for constraint in constraints: + try: + bad_value = generate_invalid_param_val(constraint) + except NotImplementedError: + continue + + with pytest.raises(InvalidParameterError, match=match): + func(**{**valid_required_params, param_name: bad_value}) + + +PARAM_VALIDATION_FUNCTION_LIST = [ + "sklearn.calibration.calibration_curve", + "sklearn.cluster.cluster_optics_dbscan", + "sklearn.cluster.compute_optics_graph", + "sklearn.cluster.estimate_bandwidth", + "sklearn.cluster.kmeans_plusplus", + "sklearn.cluster.cluster_optics_xi", + "sklearn.cluster.ward_tree", + "sklearn.covariance.empirical_covariance", + "sklearn.covariance.ledoit_wolf_shrinkage", + "sklearn.covariance.shrunk_covariance", + "sklearn.datasets.clear_data_home", + "sklearn.datasets.dump_svmlight_file", + "sklearn.datasets.fetch_20newsgroups", + "sklearn.datasets.fetch_20newsgroups_vectorized", + "sklearn.datasets.fetch_california_housing", + "sklearn.datasets.fetch_covtype", + "sklearn.datasets.fetch_kddcup99", + "sklearn.datasets.fetch_lfw_pairs", + "sklearn.datasets.fetch_lfw_people", + "sklearn.datasets.fetch_olivetti_faces", + "sklearn.datasets.fetch_rcv1", + "sklearn.datasets.fetch_species_distributions", + "sklearn.datasets.get_data_home", + "sklearn.datasets.load_breast_cancer", + "sklearn.datasets.load_diabetes", + "sklearn.datasets.load_digits", + "sklearn.datasets.load_files", + "sklearn.datasets.load_iris", + "sklearn.datasets.load_linnerud", + "sklearn.datasets.load_sample_image", + "sklearn.datasets.load_svmlight_file", + "sklearn.datasets.load_svmlight_files", + "sklearn.datasets.load_wine", + 
"sklearn.datasets.make_biclusters", + "sklearn.datasets.make_blobs", + "sklearn.datasets.make_checkerboard", + "sklearn.datasets.make_circles", + "sklearn.datasets.make_classification", + "sklearn.datasets.make_friedman1", + "sklearn.datasets.make_friedman2", + "sklearn.datasets.make_friedman3", + "sklearn.datasets.make_gaussian_quantiles", + "sklearn.datasets.make_hastie_10_2", + "sklearn.datasets.make_low_rank_matrix", + "sklearn.datasets.make_moons", + "sklearn.datasets.make_multilabel_classification", + "sklearn.datasets.make_regression", + "sklearn.datasets.make_s_curve", + "sklearn.datasets.make_sparse_coded_signal", + "sklearn.datasets.make_sparse_spd_matrix", + "sklearn.datasets.make_sparse_uncorrelated", + "sklearn.datasets.make_spd_matrix", + "sklearn.datasets.make_swiss_roll", + "sklearn.decomposition.sparse_encode", + "sklearn.feature_extraction.grid_to_graph", + "sklearn.feature_extraction.img_to_graph", + "sklearn.feature_extraction.image.extract_patches_2d", + "sklearn.feature_extraction.image.reconstruct_from_patches_2d", + "sklearn.feature_selection.chi2", + "sklearn.feature_selection.f_classif", + "sklearn.feature_selection.f_regression", + "sklearn.feature_selection.mutual_info_classif", + "sklearn.feature_selection.mutual_info_regression", + "sklearn.feature_selection.r_regression", + "sklearn.inspection.partial_dependence", + "sklearn.inspection.permutation_importance", + "sklearn.linear_model.orthogonal_mp", + "sklearn.metrics.accuracy_score", + "sklearn.metrics.auc", + "sklearn.metrics.average_precision_score", + "sklearn.metrics.balanced_accuracy_score", + "sklearn.metrics.brier_score_loss", + "sklearn.metrics.calinski_harabasz_score", + "sklearn.metrics.check_scoring", + "sklearn.metrics.completeness_score", + "sklearn.metrics.class_likelihood_ratios", + "sklearn.metrics.classification_report", + "sklearn.metrics.cluster.adjusted_mutual_info_score", + "sklearn.metrics.cluster.contingency_matrix", + "sklearn.metrics.cluster.entropy", + 
"sklearn.metrics.cluster.fowlkes_mallows_score", + "sklearn.metrics.cluster.homogeneity_completeness_v_measure", + "sklearn.metrics.cluster.normalized_mutual_info_score", + "sklearn.metrics.cluster.silhouette_samples", + "sklearn.metrics.cluster.silhouette_score", + "sklearn.metrics.cohen_kappa_score", + "sklearn.metrics.confusion_matrix", + "sklearn.metrics.coverage_error", + "sklearn.metrics.d2_absolute_error_score", + "sklearn.metrics.d2_pinball_score", + "sklearn.metrics.d2_tweedie_score", + "sklearn.metrics.davies_bouldin_score", + "sklearn.metrics.dcg_score", + "sklearn.metrics.det_curve", + "sklearn.metrics.explained_variance_score", + "sklearn.metrics.f1_score", + "sklearn.metrics.fbeta_score", + "sklearn.metrics.get_scorer", + "sklearn.metrics.hamming_loss", + "sklearn.metrics.hinge_loss", + "sklearn.metrics.homogeneity_score", + "sklearn.metrics.jaccard_score", + "sklearn.metrics.label_ranking_average_precision_score", + "sklearn.metrics.label_ranking_loss", + "sklearn.metrics.log_loss", + "sklearn.metrics.make_scorer", + "sklearn.metrics.matthews_corrcoef", + "sklearn.metrics.max_error", + "sklearn.metrics.mean_absolute_error", + "sklearn.metrics.mean_absolute_percentage_error", + "sklearn.metrics.mean_gamma_deviance", + "sklearn.metrics.mean_pinball_loss", + "sklearn.metrics.mean_poisson_deviance", + "sklearn.metrics.mean_squared_error", + "sklearn.metrics.mean_squared_log_error", + "sklearn.metrics.mean_tweedie_deviance", + "sklearn.metrics.median_absolute_error", + "sklearn.metrics.multilabel_confusion_matrix", + "sklearn.metrics.mutual_info_score", + "sklearn.metrics.ndcg_score", + "sklearn.metrics.pair_confusion_matrix", + "sklearn.metrics.adjusted_rand_score", + "sklearn.metrics.pairwise.additive_chi2_kernel", + "sklearn.metrics.pairwise.cosine_distances", + "sklearn.metrics.pairwise.cosine_similarity", + "sklearn.metrics.pairwise.haversine_distances", + "sklearn.metrics.pairwise.laplacian_kernel", + "sklearn.metrics.pairwise.linear_kernel", + 
"sklearn.metrics.pairwise.manhattan_distances", + "sklearn.metrics.pairwise.nan_euclidean_distances", + "sklearn.metrics.pairwise.paired_cosine_distances", + "sklearn.metrics.pairwise.paired_euclidean_distances", + "sklearn.metrics.pairwise.paired_manhattan_distances", + "sklearn.metrics.pairwise.polynomial_kernel", + "sklearn.metrics.pairwise.rbf_kernel", + "sklearn.metrics.pairwise.sigmoid_kernel", + "sklearn.metrics.precision_recall_curve", + "sklearn.metrics.precision_recall_fscore_support", + "sklearn.metrics.precision_score", + "sklearn.metrics.r2_score", + "sklearn.metrics.rand_score", + "sklearn.metrics.recall_score", + "sklearn.metrics.roc_auc_score", + "sklearn.metrics.roc_curve", + "sklearn.metrics.top_k_accuracy_score", + "sklearn.metrics.v_measure_score", + "sklearn.metrics.zero_one_loss", + "sklearn.model_selection.cross_validate", + "sklearn.model_selection.learning_curve", + "sklearn.model_selection.permutation_test_score", + "sklearn.model_selection.train_test_split", + "sklearn.model_selection.validation_curve", + "sklearn.neighbors.sort_graph_by_row_values", + "sklearn.preprocessing.add_dummy_feature", + "sklearn.preprocessing.binarize", + "sklearn.preprocessing.label_binarize", + "sklearn.preprocessing.maxabs_scale", + "sklearn.preprocessing.normalize", + "sklearn.preprocessing.scale", + "sklearn.random_projection.johnson_lindenstrauss_min_dim", + "sklearn.svm.l1_min_c", + "sklearn.tree.export_text", + "sklearn.tree.plot_tree", + "sklearn.utils.gen_batches", + "sklearn.utils.resample", +] + + +@pytest.mark.parametrize("func_module", PARAM_VALIDATION_FUNCTION_LIST) +def test_function_param_validation(func_module): + """Check param validation for public functions that are not wrappers around + estimators. 
+ """ + func, func_name, func_params, required_params = _get_func_info(func_module) + + parameter_constraints = getattr(func, "_skl_parameter_constraints") + + _check_function_param_validation( + func, func_name, func_params, required_params, parameter_constraints + ) + + +PARAM_VALIDATION_CLASS_WRAPPER_LIST = [ + ("sklearn.cluster.affinity_propagation", "sklearn.cluster.AffinityPropagation"), + ("sklearn.cluster.mean_shift", "sklearn.cluster.MeanShift"), + ("sklearn.cluster.spectral_clustering", "sklearn.cluster.SpectralClustering"), + ("sklearn.covariance.graphical_lasso", "sklearn.covariance.GraphicalLasso"), + ("sklearn.covariance.ledoit_wolf", "sklearn.covariance.LedoitWolf"), + ("sklearn.covariance.oas", "sklearn.covariance.OAS"), + ("sklearn.decomposition.dict_learning", "sklearn.decomposition.DictionaryLearning"), + ("sklearn.decomposition.fastica", "sklearn.decomposition.FastICA"), + ("sklearn.decomposition.non_negative_factorization", "sklearn.decomposition.NMF"), + ("sklearn.preprocessing.minmax_scale", "sklearn.preprocessing.MinMaxScaler"), + ("sklearn.preprocessing.power_transform", "sklearn.preprocessing.PowerTransformer"), + ( + "sklearn.preprocessing.quantile_transform", + "sklearn.preprocessing.QuantileTransformer", + ), + ("sklearn.preprocessing.robust_scale", "sklearn.preprocessing.RobustScaler"), +] + + +@pytest.mark.parametrize( + "func_module, class_module", PARAM_VALIDATION_CLASS_WRAPPER_LIST +) +def test_class_wrapper_param_validation(func_module, class_module): + """Check param validation for public functions that are wrappers around + estimators. 
+ """ + func, func_name, func_params, required_params = _get_func_info(func_module) + + module_name, class_name = class_module.rsplit(".", 1) + module = import_module(module_name) + klass = getattr(module, class_name) + + parameter_constraints_func = getattr(func, "_skl_parameter_constraints") + parameter_constraints_class = getattr(klass, "_parameter_constraints") + parameter_constraints = { + **parameter_constraints_class, + **parameter_constraints_func, + } + parameter_constraints = { + k: v for k, v in parameter_constraints.items() if k in func_params + } + + _check_function_param_validation( + func, func_name, func_params, required_params, parameter_constraints + ) diff --git a/sklearn_fork/calibration.py b/sklearn_fork/calibration.py index f541fa79bdd80..ec4b0c4504f15 100644 --- a/sklearn_fork/calibration.py +++ b/sklearn_fork/calibration.py @@ -7,7 +7,7 @@ # # License: BSD 3 clause -from numbers import Integral +from numbers import Integral, Real import warnings from inspect import signature from functools import partial @@ -35,7 +35,13 @@ from .utils.multiclass import check_classification_targets from .utils.parallel import delayed, Parallel -from .utils._param_validation import StrOptions, HasMethods, Hidden +from .utils._param_validation import ( + StrOptions, + HasMethods, + Hidden, + validate_params, + Interval, +) from .utils._plotting import _BinaryClassifierCurveDisplayMixin from .utils.validation import ( _check_fit_params, @@ -903,6 +909,15 @@ def predict(self, T): return expit(-(self.a_ * T + self.b_)) +@validate_params( + { + "y_true": ["array-like"], + "y_prob": ["array-like"], + "pos_label": [Real, str, "boolean", None], + "n_bins": [Interval(Integral, 1, None, closed="left")], + "strategy": [StrOptions({"uniform", "quantile"})], + } +) def calibration_curve( y_true, y_prob, @@ -928,7 +943,7 @@ def calibration_curve( y_prob : array-like of shape (n_samples,) Probabilities of the positive class. 
- pos_label : int or str, default=None + pos_label : int, float, bool or str, default=None The label of the positive class. .. versionadded:: 1.1 @@ -1042,7 +1057,7 @@ class CalibrationDisplay(_BinaryClassifierCurveDisplayMixin): estimator_name : str, default=None Name of estimator. If None, the estimator name is not shown. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The positive class when computing the calibration curve. By default, `estimators.classes_[1]` is considered as the positive class. @@ -1208,7 +1223,7 @@ def from_estimator( - `'quantile'`: The bins have the same number of samples and depend on predicted probabilities. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The positive class when computing the calibration curve. By default, `estimators.classes_[1]` is considered as the positive class. @@ -1326,7 +1341,7 @@ def from_predictions( - `'quantile'`: The bins have the same number of samples and depend on predicted probabilities. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The positive class when computing the calibration curve. By default, `estimators.classes_[1]` is considered as the positive class. 
diff --git a/sklearn_fork/datasets/_arff_parser.py b/sklearn_fork/datasets/_arff_parser.py index 2ec61f5817db1..d085e9d85f2a9 100644 --- a/sklearn_fork/datasets/_arff_parser.py +++ b/sklearn_fork/datasets/_arff_parser.py @@ -187,7 +187,7 @@ def _io_to_generator(gzip_file): # calculate chunksize first_row = next(arff_container["data"]) - first_df = pd.DataFrame([first_row], columns=columns_names) + first_df = pd.DataFrame([first_row], columns=columns_names, copy=False) row_bytes = first_df.memory_usage(deep=True).sum() chunksize = get_chunk_n_rows(row_bytes) @@ -196,7 +196,9 @@ def _io_to_generator(gzip_file): columns_to_keep = [col for col in columns_names if col in columns_to_select] dfs = [first_df[columns_to_keep]] for data in _chunk_generator(arff_container["data"], chunksize): - dfs.append(pd.DataFrame(data, columns=columns_names)[columns_to_keep]) + dfs.append( + pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep] + ) frame = pd.concat(dfs, ignore_index=True) del dfs, first_df diff --git a/sklearn_fork/datasets/_base.py b/sklearn_fork/datasets/_base.py index 84e35fed953c4..a08b9d53916b4 100644 --- a/sklearn_fork/datasets/_base.py +++ b/sklearn_fork/datasets/_base.py @@ -22,7 +22,7 @@ from ..utils import check_random_state from ..utils import check_pandas_support from ..utils.fixes import _open_binary, _open_text, _read_text, _contents -from ..utils._param_validation import validate_params, Interval +from ..utils._param_validation import validate_params, Interval, StrOptions import numpy as np @@ -35,6 +35,11 @@ RemoteFileMetadata = namedtuple("RemoteFileMetadata", ["filename", "url", "checksum"]) +@validate_params( + { + "data_home": [str, os.PathLike, None], + } +) def get_data_home(data_home=None) -> str: """Return the path of the scikit-learn data directory. @@ -58,7 +63,7 @@ def get_data_home(data_home=None) -> str: Returns ------- - data_home: str + data_home: str or path-like, default=None The path to scikit-learn data directory. 
""" if data_home is None: @@ -68,12 +73,17 @@ def get_data_home(data_home=None) -> str: return data_home +@validate_params( + { + "data_home": [str, os.PathLike, None], + } +) def clear_data_home(data_home=None): """Delete all the content of the data home cache. Parameters ---------- - data_home : str, default=None + data_home : str or path-like, default=None The path to scikit-learn data directory. If `None`, the default path is `~/sklearn_learn_data`. """ @@ -86,7 +96,7 @@ def _convert_data_dataframe( ): pd = check_pandas_support("{} with as_frame=True".format(caller_name)) if not sparse_data: - data_df = pd.DataFrame(data, columns=feature_names) + data_df = pd.DataFrame(data, columns=feature_names, copy=False) else: data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names) @@ -99,6 +109,19 @@ def _convert_data_dataframe( return combined_df, X, y +@validate_params( + { + "container_path": [str, os.PathLike], + "description": [str, None], + "categories": [list, None], + "load_content": ["boolean"], + "shuffle": ["boolean"], + "encoding": [str, None], + "decode_error": [StrOptions({"strict", "ignore", "replace"})], + "random_state": ["random_state"], + "allowed_extensions": [list, None], + } +) def load_files( container_path, *, @@ -1252,6 +1275,11 @@ def load_sample_images(): return Bunch(images=images, filenames=filenames, DESCR=descr) +@validate_params( + { + "image_name": [StrOptions({"china.jpg", "flower.jpg"})], + } +) def load_sample_image(image_name): """Load the numpy array of a single sample image. diff --git a/sklearn_fork/datasets/_openml.py b/sklearn_fork/datasets/_openml.py index 25f26f7c2d212..f88d68d103797 100644 --- a/sklearn_fork/datasets/_openml.py +++ b/sklearn_fork/datasets/_openml.py @@ -903,7 +903,7 @@ def fetch_openml( data_home = None else: data_home = get_data_home(data_home=data_home) - data_home = join(data_home, "openml") + data_home = join(str(data_home), "openml") # check valid function arguments. 
data_id XOR (name, version) should be # provided diff --git a/sklearn_fork/datasets/tests/test_base.py b/sklearn_fork/datasets/tests/test_base.py index 7b578dc1c00d1..0142a310cc6e0 100644 --- a/sklearn_fork/datasets/tests/test_base.py +++ b/sklearn_fork/datasets/tests/test_base.py @@ -98,10 +98,11 @@ def test_default_load_files(test_category_dir_1, test_category_dir_2, load_files def test_load_files_w_categories_desc_and_encoding( test_category_dir_1, test_category_dir_2, load_files_root ): - category = os.path.abspath(test_category_dir_1).split("/").pop() + category = os.path.abspath(test_category_dir_1).split(os.sep).pop() res = load_files( - load_files_root, description="test", categories=category, encoding="utf-8" + load_files_root, description="test", categories=[category], encoding="utf-8" ) + assert len(res.filenames) == 1 assert len(res.target_names) == 1 assert res.DESCR == "test" @@ -221,12 +222,6 @@ def test_load_sample_image(): warnings.warn("Could not load sample images, PIL is not available.") -def test_load_missing_sample_image_error(): - pytest.importorskip("PIL") - with pytest.raises(AttributeError): - load_sample_image("blop.jpg") - - def test_load_diabetes_raw(): """Test to check that we load a scaled version by default but that we can get an unscaled version when setting `scaled=False`.""" diff --git a/sklearn_fork/ensemble/_gb.py b/sklearn_fork/ensemble/_gb.py index aa78832550a50..ad76617470dd5 100644 --- a/sklearn_fork/ensemble/_gb.py +++ b/sklearn_fork/ensemble/_gb.py @@ -949,7 +949,7 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting): init : estimator or 'zero', default=None An estimator object that is used to compute the initial predictions. - ``init`` has to provide :meth:`fit` and :meth:`predict_proba`. If + ``init`` has to provide :term:`fit` and :term:`predict_proba`. If 'zero', the initial raw predictions are set to zero. By default, a ``DummyEstimator`` predicting the classes priors is used. 
diff --git a/sklearn_fork/feature_selection/_sequential.py b/sklearn_fork/feature_selection/_sequential.py index caa197ca61c3a..d4f5d89665de7 100644 --- a/sklearn_fork/feature_selection/_sequential.py +++ b/sklearn_fork/feature_selection/_sequential.py @@ -39,8 +39,8 @@ class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator n_features_to_select : "auto", int or float, default='warn' If `"auto"`, the behaviour depends on the `tol` parameter: - - if `tol` is not `None`, then features are selected until the score - improvement does not exceed `tol`. + - if `tol` is not `None`, then features are selected while the score + change does not exceed `tol`. - otherwise, half of the features are selected. If integer, the parameter is the absolute number of features to select. @@ -53,7 +53,7 @@ class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator The default changed from `None` to `"warn"` in 1.1 and will become `"auto"` in 1.3. `None` and `'warn'` will be removed in 1.3. To keep the same behaviour as `None`, set - `n_features_to_select="auto" and `tol=None`. + `n_features_to_select="auto"` and `tol=None`. 
tol : float, default=None If the score is not incremented by at least `tol` between two diff --git a/sklearn_fork/inspection/_partial_dependence.py b/sklearn_fork/inspection/_partial_dependence.py index 4f21ed86f6ad8..0b32a3a561e81 100644 --- a/sklearn_fork/inspection/_partial_dependence.py +++ b/sklearn_fork/inspection/_partial_dependence.py @@ -22,6 +22,13 @@ from ..utils import _get_column_indices from ..utils.validation import check_is_fitted from ..utils import Bunch +from ..utils._param_validation import ( + HasMethods, + Integral, + Interval, + StrOptions, + validate_params, +) from ..tree import DecisionTreeRegressor from ..ensemble import RandomForestRegressor from ..exceptions import NotFittedError @@ -223,6 +230,24 @@ def _partial_dependence_brute(est, grid, features, X, response_method): return averaged_predictions, predictions +@validate_params( + { + "estimator": [ + HasMethods(["fit", "predict"]), + HasMethods(["fit", "predict_proba"]), + HasMethods(["fit", "decision_function"]), + ], + "X": ["array-like", "sparse matrix"], + "features": ["array-like", Integral, str], + "categorical_features": ["array-like", None], + "feature_names": ["array-like", None], + "response_method": [StrOptions({"auto", "predict_proba", "decision_function"})], + "percentiles": [tuple], + "grid_resolution": [Interval(Integral, 1, None, closed="left")], + "method": [StrOptions({"auto", "recursion", "brute"})], + "kind": [StrOptions({"average", "individual", "both"})], + } +) def partial_dependence( estimator, X, @@ -268,13 +293,13 @@ def partial_dependence( :term:`predict_proba`, or :term:`decision_function`. Multioutput-multiclass classifiers are not supported. 
- X : {array-like or dataframe} of shape (n_samples, n_features) + X : {array-like, sparse matrix or dataframe} of shape (n_samples, n_features) ``X`` is used to generate a grid of values for the target ``features`` (where the partial dependence will be evaluated), and also to generate values for the complement features when the `method` is 'brute'. - features : array-like of {int, str} + features : array-like of {int, str, bool} or int or str The feature (e.g. `[0]`) or pair of interacting features (e.g. `[(0, 1)]`) for which the partial dependency should be computed. @@ -425,27 +450,12 @@ def partial_dependence( if not (hasattr(X, "__array__") or sparse.issparse(X)): X = check_array(X, force_all_finite="allow-nan", dtype=object) - accepted_responses = ("auto", "predict_proba", "decision_function") - if response_method not in accepted_responses: - raise ValueError( - "response_method {} is invalid. Accepted response_method names " - "are {}.".format(response_method, ", ".join(accepted_responses)) - ) - if is_regressor(estimator) and response_method != "auto": raise ValueError( "The response_method parameter is ignored for regressors and " "must be 'auto'." ) - accepted_methods = ("brute", "recursion", "auto") - if method not in accepted_methods: - raise ValueError( - "method {} is invalid. 
Accepted method names are {}.".format( - method, ", ".join(accepted_methods) - ) - ) - if kind != "average": if method == "recursion": raise ValueError( diff --git a/sklearn_fork/inspection/_plot/tests/test_plot_partial_dependence.py b/sklearn_fork/inspection/_plot/tests/test_plot_partial_dependence.py index af59706841987..4f86efdf45aec 100644 --- a/sklearn_fork/inspection/_plot/tests/test_plot_partial_dependence.py +++ b/sklearn_fork/inspection/_plot/tests/test_plot_partial_dependence.py @@ -611,16 +611,6 @@ def test_plot_partial_dependence_dataframe(pyplot, clf_diabetes, diabetes): {"features": [1], "categorical_features": [1], "kind": "individual"}, "It is not possible to display individual effects", ), - ( - dummy_classification_data, - {"features": [1], "kind": "foo"}, - "Values provided to `kind` must be one of", - ), - ( - dummy_classification_data, - {"features": [0, 1], "kind": ["foo", "individual"]}, - "Values provided to `kind` must be one of", - ), ], ) def test_plot_partial_dependence_error(pyplot, data, params, err_msg): diff --git a/sklearn_fork/inspection/tests/test_partial_dependence.py b/sklearn_fork/inspection/tests/test_partial_dependence.py index d899915d59a1c..0d27861eb74a3 100644 --- a/sklearn_fork/inspection/tests/test_partial_dependence.py +++ b/sklearn_fork/inspection/tests/test_partial_dependence.py @@ -510,31 +510,6 @@ def fit(self, X, y): {"features": [0], "response_method": "predict_proba", "method": "auto"}, "'recursion' method, the response_method must be 'decision_function'", ), - ( - GradientBoostingClassifier(random_state=0), - {"features": [0], "response_method": "blahblah"}, - "response_method blahblah is invalid. 
Accepted response_method", - ), - ( - NoPredictProbaNoDecisionFunction(), - {"features": [0], "response_method": "auto"}, - "The estimator has no predict_proba and no decision_function method", - ), - ( - NoPredictProbaNoDecisionFunction(), - {"features": [0], "response_method": "predict_proba"}, - "The estimator has no predict_proba method.", - ), - ( - NoPredictProbaNoDecisionFunction(), - {"features": [0], "response_method": "decision_function"}, - "The estimator has no decision_function method.", - ), - ( - LinearRegression(), - {"features": [0], "method": "blahblah"}, - "blahblah is invalid. Accepted method names are brute, recursion, auto", - ), ( LinearRegression(), {"features": [0], "method": "recursion", "kind": "individual"}, @@ -560,24 +535,6 @@ def test_partial_dependence_error(estimator, params, err_msg): partial_dependence(estimator, X, **params) -@pytest.mark.parametrize( - "with_dataframe, err_msg", - [ - (True, "Only array-like or scalar are supported"), - (False, "Only array-like or scalar are supported"), - ], -) -def test_partial_dependence_slice_error(with_dataframe, err_msg): - X, y = make_classification(random_state=0) - if with_dataframe: - pd = pytest.importorskip("pandas") - X = pd.DataFrame(X) - estimator = LogisticRegression().fit(X, y) - - with pytest.raises(TypeError, match=err_msg): - partial_dependence(estimator, X, features=slice(0, 2, 1)) - - @pytest.mark.parametrize( "estimator", [LinearRegression(), GradientBoostingClassifier(random_state=0)] ) diff --git a/sklearn_fork/metrics/_pairwise_distances_reduction/_base.pyx.tp b/sklearn_fork/metrics/_pairwise_distances_reduction/_base.pyx.tp index 3673d53990a6b..2819a1afa969f 100644 --- a/sklearn_fork/metrics/_pairwise_distances_reduction/_base.pyx.tp +++ b/sklearn_fork/metrics/_pairwise_distances_reduction/_base.pyx.tp @@ -5,7 +5,7 @@ from libcpp.vector cimport vector from ...utils._cython_blas cimport _dot from ...utils._openmp_helpers cimport omp_get_thread_num -from 
...utils._typedefs cimport intp_t, float32_t, float64_t +from ...utils._typedefs cimport intp_t, float32_t, float64_t, int32_t import numpy as np @@ -14,7 +14,6 @@ from numbers import Integral from sklearn_fork import get_config from sklearn_fork.utils import check_scalar from ...utils._openmp_helpers import _openmp_effective_n_threads -from ...utils.sparsefuncs_fast import _sqeuclidean_row_norms_sparse ##################### @@ -84,6 +83,23 @@ cdef float64_t[::1] _sqeuclidean_row_norms32_dense( return squared_row_norms +cdef float64_t[::1] _sqeuclidean_row_norms64_sparse( + const float64_t[:] X_data, + const int32_t[:] X_indptr, + intp_t num_threads, +): + cdef: + intp_t n = X_indptr.shape[0] - 1 + int32_t X_i_ptr, idx = 0 + float64_t[::1] squared_row_norms = np.zeros(n, dtype=np.float64) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + for X_i_ptr in range(X_indptr[idx], X_indptr[idx+1]): + squared_row_norms[idx] += X_data[X_i_ptr] * X_data[X_i_ptr] + + return squared_row_norms + + {{for name_suffix in ["64", "32"]}} from ._datasets_pair cimport DatasetsPair{{name_suffix}} @@ -98,7 +114,7 @@ cpdef float64_t[::1] _sqeuclidean_row_norms{{name_suffix}}( # by moving squared row norms computations in MiddleTermComputer. X_data = np.asarray(X.data, dtype=np.float64) X_indptr = np.asarray(X.indptr, dtype=np.int32) - return _sqeuclidean_row_norms_sparse(X_data, X_indptr, num_threads) + return _sqeuclidean_row_norms64_sparse(X_data, X_indptr, num_threads) else: return _sqeuclidean_row_norms{{name_suffix}}_dense(X, num_threads) diff --git a/sklearn_fork/metrics/_plot/det_curve.py b/sklearn_fork/metrics/_plot/det_curve.py index ff0ba2bae2ea9..a5a034363c7b2 100644 --- a/sklearn_fork/metrics/_plot/det_curve.py +++ b/sklearn_fork/metrics/_plot/det_curve.py @@ -26,7 +26,7 @@ class DetCurveDisplay(_BinaryClassifierCurveDisplayMixin): estimator_name : str, default=None Name of estimator. If None, the estimator name is not shown. 
- pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The label of the positive class. Attributes @@ -117,7 +117,7 @@ def from_estimator( to 'auto', :term:`predict_proba` is tried first and if it does not exist :term:`decision_function` is tried next. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The label of the positive class. When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an error will be raised. @@ -210,7 +210,7 @@ def from_predictions( sample_weight : array-like of shape (n_samples,), default=None Sample weights. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The label of the positive class. When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an error will be raised. diff --git a/sklearn_fork/metrics/_plot/precision_recall_curve.py b/sklearn_fork/metrics/_plot/precision_recall_curve.py index ea04557de4b45..6a81a13559f9f 100644 --- a/sklearn_fork/metrics/_plot/precision_recall_curve.py +++ b/sklearn_fork/metrics/_plot/precision_recall_curve.py @@ -28,7 +28,7 @@ class PrecisionRecallDisplay(_BinaryClassifierCurveDisplayMixin): estimator_name : str, default=None Name of estimator. If None, then the estimator name is not shown. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The class considered as the positive class. If None, the class will not be shown in the legend. @@ -194,7 +194,7 @@ def from_estimator( sample_weight : array-like of shape (n_samples,), default=None Sample weights. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The class considered as the positive class when computing the precision and recall metrics. By default, `estimators.classes_[1]` is considered as the positive class. 
@@ -306,7 +306,7 @@ def from_predictions( sample_weight : array-like of shape (n_samples,), default=None Sample weights. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The class considered as the positive class when computing the precision and recall metrics. diff --git a/sklearn_fork/metrics/_plot/roc_curve.py b/sklearn_fork/metrics/_plot/roc_curve.py index 08d304c9261a8..0d3d59de6d35e 100644 --- a/sklearn_fork/metrics/_plot/roc_curve.py +++ b/sklearn_fork/metrics/_plot/roc_curve.py @@ -28,7 +28,7 @@ class RocCurveDisplay(_BinaryClassifierCurveDisplayMixin): estimator_name : str, default=None Name of estimator. If None, the estimator name is not shown. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The class considered as the positive class when computing the roc auc metrics. By default, `estimators.classes_[1]` is considered as the positive class. @@ -213,7 +213,7 @@ def from_estimator( :term:`predict_proba` is tried first and if it does not exist :term:`decision_function` is tried next. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The class considered as the positive class when computing the roc auc metrics. By default, `estimators.classes_[1]` is considered as the positive class. @@ -328,7 +328,7 @@ def from_predictions( on a plotted ROC curve. This is useful in order to create lighter ROC curves. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The label of the positive class. When `pos_label=None`, if `y_true` is in {-1, 1} or {0, 1}, `pos_label` is set to 1, otherwise an error will be raised. 
diff --git a/sklearn_fork/metrics/_scorer.py b/sklearn_fork/metrics/_scorer.py index 9597c07e7d380..46b66063ce5f3 100644 --- a/sklearn_fork/metrics/_scorer.py +++ b/sklearn_fork/metrics/_scorer.py @@ -18,7 +18,6 @@ # Arnaud Joly # License: Simplified BSD -from collections.abc import Iterable from functools import partial from collections import Counter from traceback import format_exc @@ -65,7 +64,7 @@ from ..utils.multiclass import type_of_target from ..base import is_regressor -from ..utils._param_validation import validate_params +from ..utils._param_validation import HasMethods, StrOptions, validate_params def _cached_call(cache, estimator, method, *args, **kwargs): @@ -451,79 +450,6 @@ def _passthrough_scorer(estimator, *args, **kwargs): return estimator.score(*args, **kwargs) -def check_scoring(estimator, scoring=None, *, allow_none=False): - """Determine scorer from user options. - - A TypeError will be thrown if the estimator cannot be scored. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - scoring : str or callable, default=None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - If None, the provided estimator object's `score` method is used. - - allow_none : bool, default=False - If no scoring is specified and the estimator has no score function, we - can either return None or raise an exception. - - Returns - ------- - scoring : callable - A scorer callable object / function with signature - ``scorer(estimator, X, y)``. 
- """ - if not hasattr(estimator, "fit"): - raise TypeError( - "estimator should be an estimator implementing 'fit' method, %r was passed" - % estimator - ) - if isinstance(scoring, str): - return get_scorer(scoring) - elif callable(scoring): - # Heuristic to ensure user has not passed a metric - module = getattr(scoring, "__module__", None) - if ( - hasattr(module, "startswith") - and module.startswith("sklearn_fork.metrics.") - and not module.startswith("sklearn_fork.metrics._scorer") - and not module.startswith("sklearn_fork.metrics.tests.") - ): - raise ValueError( - "scoring value %r looks like it is a metric " - "function rather than a scorer. A scorer should " - "require an estimator as its first parameter. " - "Please use `make_scorer` to convert a metric " - "to a scorer." % scoring - ) - return get_scorer(scoring) - elif scoring is None: - if hasattr(estimator, "score"): - return _passthrough_scorer - elif allow_none: - return None - else: - raise TypeError( - "If no scoring is specified, the estimator passed should " - "have a 'score' method. The estimator %r does not." % estimator - ) - elif isinstance(scoring, Iterable): - raise ValueError( - "For evaluating multiple scores, use " - "sklearn_fork.model_selection.cross_validate instead. " - "{0} was passed.".format(scoring) - ) - else: - raise ValueError( - "scoring value should either be a callable, string or None. %r was passed" - % scoring - ) - - def _check_multimetric_scoring(estimator, scoring): """Check the scoring parameter in cases when multiple metrics are allowed. 
@@ -883,3 +809,67 @@ def get_scorer_names(): _SCORERS[qualified_name] = make_scorer(metric, pos_label=None, average=average) SCORERS = _DeprecatedScorers(_SCORERS) + + +@validate_params( + { + "estimator": [HasMethods("fit")], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "allow_none": ["boolean"], + } +) +def check_scoring(estimator, scoring=None, *, allow_none=False): + """Determine scorer from user options. + + A TypeError will be thrown if the estimator cannot be scored. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + scoring : str or callable, default=None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + If None, the provided estimator object's `score` method is used. + + allow_none : bool, default=False + If no scoring is specified and the estimator has no score function, we + can either return None or raise an exception. + + Returns + ------- + scoring : callable + A scorer callable object / function with signature + ``scorer(estimator, X, y)``. + """ + if isinstance(scoring, str): + return get_scorer(scoring) + if callable(scoring): + # Heuristic to ensure user has not passed a metric + module = getattr(scoring, "__module__", None) + if ( + hasattr(module, "startswith") + and module.startswith("sklearn.metrics.") + and not module.startswith("sklearn.metrics._scorer") + and not module.startswith("sklearn.metrics.tests.") + ): + raise ValueError( + "scoring value %r looks like it is a metric " + "function rather than a scorer. A scorer should " + "require an estimator as its first parameter. " + "Please use `make_scorer` to convert a metric " + "to a scorer." 
% scoring + ) + return get_scorer(scoring) + if scoring is None: + if hasattr(estimator, "score"): + return _passthrough_scorer + elif allow_none: + return None + else: + raise TypeError( + "If no scoring is specified, the estimator passed should " + "have a 'score' method. The estimator %r does not." % estimator + ) diff --git a/sklearn_fork/metrics/cluster/_supervised.py b/sklearn_fork/metrics/cluster/_supervised.py index 23470ca1c2856..ae85d42c5db4b 100644 --- a/sklearn_fork/metrics/cluster/_supervised.py +++ b/sklearn_fork/metrics/cluster/_supervised.py @@ -476,7 +476,7 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): Parameters ---------- - labels_true : int array, shape = [n_samples] + labels_true : array-like of shape (n_samples,) Ground truth class labels to be used as a reference. labels_pred : array-like of shape (n_samples,) @@ -532,6 +532,12 @@ def homogeneity_completeness_v_measure(labels_true, labels_pred, *, beta=1.0): return homogeneity, completeness, v_measure_score +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + } +) def homogeneity_score(labels_true, labels_pred): """Homogeneity metric of a cluster labeling given a ground truth. @@ -550,7 +556,7 @@ def homogeneity_score(labels_true, labels_pred): Parameters ---------- - labels_true : int array, shape = [n_samples] + labels_true : array-like of shape (n_samples,) Ground truth class labels to be used as a reference. labels_pred : array-like of shape (n_samples,) @@ -601,6 +607,12 @@ def homogeneity_score(labels_true, labels_pred): return homogeneity_completeness_v_measure(labels_true, labels_pred)[0] +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + } +) def completeness_score(labels_true, labels_pred): """Compute completeness metric of a cluster labeling given a ground truth. 
@@ -619,7 +631,7 @@ def completeness_score(labels_true, labels_pred): Parameters ---------- - labels_true : int array, shape = [n_samples] + labels_true : array-like of shape (n_samples,) Ground truth class labels to be used as a reference. labels_pred : array-like of shape (n_samples,) @@ -670,6 +682,13 @@ def completeness_score(labels_true, labels_pred): return homogeneity_completeness_v_measure(labels_true, labels_pred)[1] +@validate_params( + { + "labels_true": ["array-like"], + "labels_pred": ["array-like"], + "beta": [Interval(Real, 0, None, closed="left")], + } +) def v_measure_score(labels_true, labels_pred, *, beta=1.0): """V-measure cluster labeling given a ground truth. @@ -694,7 +713,7 @@ def v_measure_score(labels_true, labels_pred, *, beta=1.0): Parameters ---------- - labels_true : int array, shape = [n_samples] + labels_true : array-like of shape (n_samples,) Ground truth class labels to be used as a reference. labels_pred : array-like of shape (n_samples,) diff --git a/sklearn_fork/metrics/pairwise.py b/sklearn_fork/metrics/pairwise.py index f8b11351b29c8..93061d77654e1 100644 --- a/sklearn_fork/metrics/pairwise.py +++ b/sklearn_fork/metrics/pairwise.py @@ -35,6 +35,7 @@ Real, Hidden, MissingValues, + StrOptions, ) from ._pairwise_distances_reduction import ArgKmin @@ -904,6 +905,13 @@ def haversine_distances(X, Y=None): return DistanceMetric.get_metric("haversine").pairwise(X, Y) +@validate_params( + { + "X": ["array-like", "sparse matrix"], + "Y": ["array-like", "sparse matrix", None], + "sum_over_features": ["boolean", Hidden(StrOptions({"deprecated"}))], + } +) def manhattan_distances(X, Y=None, *, sum_over_features="deprecated"): """Compute the L1 distances between the vectors in X and Y. 
@@ -914,10 +922,10 @@ def manhattan_distances(X, Y=None, *, sum_over_features="deprecated"): Parameters ---------- - X : array-like of shape (n_samples_X, n_features) + X : {array-like, sparse matrix} of shape (n_samples_X, n_features) An array where each row is a sample and each column is a feature. - Y : array-like of shape (n_samples_Y, n_features), default=None + Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), default=None An array where each row is a sample and each column is a feature. If `None`, method uses `Y=X`. diff --git a/sklearn_fork/metrics/tests/test_score_objects.py b/sklearn_fork/metrics/tests/test_score_objects.py index f6b087e8c70ba..649d0a118e3d1 100644 --- a/sklearn_fork/metrics/tests/test_score_objects.py +++ b/sklearn_fork/metrics/tests/test_score_objects.py @@ -179,12 +179,6 @@ def teardown_module(): shutil.rmtree(TEMP_FOLDER) -class EstimatorWithoutFit: - """Dummy estimator to test scoring validators""" - - pass - - class EstimatorWithFit(BaseEstimator): """Dummy estimator to test scoring validators""" @@ -228,13 +222,6 @@ def test_all_scorers_repr(): def check_scoring_validator_for_single_metric_usecases(scoring_validator): # Test all branches of single metric usecases - estimator = EstimatorWithoutFit() - pattern = ( - r"estimator should be an estimator implementing 'fit' method," r" .* was passed" - ) - with pytest.raises(TypeError, match=pattern): - scoring_validator(estimator) - estimator = EstimatorWithFitAndScore() estimator.fit([[1]], [1]) scorer = scoring_validator(estimator) diff --git a/sklearn_fork/model_selection/_validation.py b/sklearn_fork/model_selection/_validation.py index d82e2e42e0726..fba9f71ba04f6 100644 --- a/sklearn_fork/model_selection/_validation.py +++ b/sklearn_fork/model_selection/_validation.py @@ -32,6 +32,7 @@ from ..utils.metaestimators import _safe_split from ..utils._param_validation import ( HasMethods, + Interval, Integral, StrOptions, validate_params, @@ -328,7 +329,7 @@ def 
cross_validate( _warn_or_raise_about_fit_failures(results, error_score) - # For callabe scoring, the return type is only know after calling. If the + # For callable scoring, the return type is only know after calling. If the # return type is a dictionary, the error scores can now be inserted with # the correct key. if callable(scoring): @@ -1235,6 +1236,21 @@ def _check_is_permutation(indices, n_samples): return True +@validate_params( + { + "estimator": [HasMethods("fit")], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "cv": ["cv_object"], + "n_permutations": [Interval(Integral, 1, None, closed="left")], + "n_jobs": [Integral, None], + "random_state": ["random_state"], + "verbose": ["verbose"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "fit_params": [dict, None], + } +) def permutation_test_score( estimator, X, @@ -1416,6 +1432,26 @@ def _shuffle(y, groups, random_state): return _safe_indexing(y, indices) +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "groups": ["array-like", None], + "train_sizes": ["array-like"], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "exploit_incremental_learning": ["boolean"], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "shuffle": ["boolean"], + "random_state": ["random_state"], + "error_score": [StrOptions({"raise"}), Real], + "return_times": ["boolean"], + "fit_params": [dict, None], + } +) def learning_curve( estimator, X, @@ -1450,18 +1486,20 @@ def learning_curve( Parameters ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. 
It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. - X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. - y : array-like of shape (n_samples,) or (n_samples, n_outputs) + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None Target relative to X for classification or regression; None for unsupervised learning. - groups : array-like of shape (n_samples,), default=None + groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`). @@ -1802,6 +1840,23 @@ def _incremental_fit_estimator( return np.array(ret).T +@validate_params( + { + "estimator": [HasMethods(["fit"])], + "X": ["array-like", "sparse matrix"], + "y": ["array-like", None], + "param_name": [str], + "param_range": ["array-like"], + "groups": ["array-like", None], + "cv": ["cv_object"], + "scoring": [StrOptions(set(get_scorer_names())), callable, None], + "n_jobs": [Integral, None], + "pre_dispatch": [Integral, str], + "verbose": ["verbose"], + "error_score": [StrOptions({"raise"}), Real], + "fit_params": [dict, None], + } +) def validation_curve( estimator, X, @@ -1831,10 +1886,12 @@ def validation_curve( Parameters ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. + estimator : object type that implements the "fit" method + An object of that type which is cloned for each validation. It must + also implement "predict" unless `scoring` is a callable that doesn't + rely on "predict" to compute a score. 
- X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. diff --git a/sklearn_fork/model_selection/tests/test_validation.py b/sklearn_fork/model_selection/tests/test_validation.py index 54364b855e114..6c3ce6964a812 100644 --- a/sklearn_fork/model_selection/tests/test_validation.py +++ b/sklearn_fork/model_selection/tests/test_validation.py @@ -2094,7 +2094,6 @@ def test_fit_and_score_failing(): failing_clf = FailingClassifier(FailingClassifier.FAILING_PARAMETER) # dummy X data X = np.arange(1, 10) - y = np.ones(9) fit_and_score_args = [failing_clf, X, None, dict(), None, None, 0, None, None] # passing error score to trigger the warning message fit_and_score_kwargs = {"error_score": "raise"} @@ -2103,11 +2102,6 @@ def test_fit_and_score_failing(): _fit_and_score(*fit_and_score_args, **fit_and_score_kwargs) # check that functions upstream pass error_score param to _fit_and_score - error_message = re.escape( - "error_score must be the string 'raise' or a numeric value. (Hint: if " - "using 'raise', please make sure that it has been spelled correctly.)" - ) - error_message_cross_validate = ( "The 'error_score' parameter of cross_validate must be .*. Got .* instead." 
) @@ -2115,20 +2109,6 @@ def test_fit_and_score_failing(): with pytest.raises(ValueError, match=error_message_cross_validate): cross_val_score(failing_clf, X, cv=3, error_score="unvalid-string") - with pytest.raises(ValueError, match=error_message): - learning_curve(failing_clf, X, y, cv=3, error_score="unvalid-string") - - with pytest.raises(ValueError, match=error_message): - validation_curve( - failing_clf, - X, - y, - param_name="parameter", - param_range=[FailingClassifier.FAILING_PARAMETER], - cv=3, - error_score="unvalid-string", - ) - assert failing_clf.score() == 0.0 # FailingClassifier coverage diff --git a/sklearn_fork/tests/test_public_functions.py b/sklearn_fork/tests/test_public_functions.py index e44c4b24c3c71..8a7dd595d9def 100644 --- a/sklearn_fork/tests/test_public_functions.py +++ b/sklearn_fork/tests/test_public_functions.py @@ -110,6 +110,7 @@ def _check_function_param_validation( PARAM_VALIDATION_FUNCTION_LIST = [ + "sklearn_fork.calibration.calibration_curve", "sklearn_fork.cluster.cluster_optics_dbscan", "sklearn_fork.cluster.compute_optics_graph", "sklearn_fork.cluster.estimate_bandwidth", @@ -119,6 +120,7 @@ def _check_function_param_validation( "sklearn_fork.covariance.empirical_covariance", "sklearn_fork.covariance.ledoit_wolf_shrinkage", "sklearn_fork.covariance.shrunk_covariance", + "sklearn_fork.datasets.clear_data_home", "sklearn_fork.datasets.dump_svmlight_file", "sklearn_fork.datasets.fetch_20newsgroups", "sklearn_fork.datasets.fetch_20newsgroups_vectorized", @@ -130,11 +132,14 @@ def _check_function_param_validation( "sklearn_fork.datasets.fetch_olivetti_faces", "sklearn_fork.datasets.fetch_rcv1", "sklearn_fork.datasets.fetch_species_distributions", + "sklearn_fork.datasets.get_data_home", "sklearn_fork.datasets.load_breast_cancer", "sklearn_fork.datasets.load_diabetes", "sklearn_fork.datasets.load_digits", + "sklearn_fork.datasets.load_files", "sklearn_fork.datasets.load_iris", "sklearn_fork.datasets.load_linnerud", + 
"sklearn_fork.datasets.load_sample_image", "sklearn_fork.datasets.load_svmlight_file", "sklearn_fork.datasets.load_svmlight_files", "sklearn_fork.datasets.load_wine", @@ -169,6 +174,7 @@ def _check_function_param_validation( "sklearn_fork.feature_selection.mutual_info_classif", "sklearn_fork.feature_selection.mutual_info_regression", "sklearn_fork.feature_selection.r_regression", + "sklearn_fork.inspection.partial_dependence", "sklearn_fork.inspection.permutation_importance", "sklearn_fork.linear_model.orthogonal_mp", "sklearn_fork.metrics.accuracy_score", @@ -177,6 +183,8 @@ def _check_function_param_validation( "sklearn_fork.metrics.balanced_accuracy_score", "sklearn_fork.metrics.brier_score_loss", "sklearn_fork.metrics.calinski_harabasz_score", + "sklearn_fork.metrics.check_scoring", + "sklearn_fork.metrics.completeness_score", "sklearn_fork.metrics.class_likelihood_ratios", "sklearn_fork.metrics.classification_report", "sklearn_fork.metrics.cluster.adjusted_mutual_info_score", @@ -202,6 +210,7 @@ def _check_function_param_validation( "sklearn_fork.metrics.get_scorer", "sklearn_fork.metrics.hamming_loss", "sklearn_fork.metrics.hinge_loss", + "sklearn_fork.metrics.homogeneity_score", "sklearn_fork.metrics.jaccard_score", "sklearn_fork.metrics.label_ranking_average_precision_score", "sklearn_fork.metrics.label_ranking_loss", @@ -229,6 +238,7 @@ def _check_function_param_validation( "sklearn_fork.metrics.pairwise.haversine_distances", "sklearn_fork.metrics.pairwise.laplacian_kernel", "sklearn_fork.metrics.pairwise.linear_kernel", + "sklearn_fork.metrics.pairwise.manhattan_distances", "sklearn_fork.metrics.pairwise.nan_euclidean_distances", "sklearn_fork.metrics.pairwise.paired_cosine_distances", "sklearn_fork.metrics.pairwise.paired_euclidean_distances", @@ -245,9 +255,13 @@ def _check_function_param_validation( "sklearn_fork.metrics.roc_auc_score", "sklearn_fork.metrics.roc_curve", "sklearn_fork.metrics.top_k_accuracy_score", + 
"sklearn_fork.metrics.v_measure_score", "sklearn_fork.metrics.zero_one_loss", "sklearn_fork.model_selection.cross_validate", + "sklearn_fork.model_selection.learning_curve", + "sklearn_fork.model_selection.permutation_test_score", "sklearn_fork.model_selection.train_test_split", + "sklearn_fork.model_selection.validation_curve", "sklearn_fork.neighbors.sort_graph_by_row_values", "sklearn_fork.preprocessing.add_dummy_feature", "sklearn_fork.preprocessing.binarize", @@ -279,46 +293,22 @@ def test_function_param_validation(func_module): PARAM_VALIDATION_CLASS_WRAPPER_LIST = [ - ( - "sklearn_fork.cluster.affinity_propagation", - "sklearn_fork.cluster.AffinityPropagation", - ), + ("sklearn_fork.cluster.affinity_propagation", "sklearn_fork.cluster.AffinityPropagation"), ("sklearn_fork.cluster.mean_shift", "sklearn_fork.cluster.MeanShift"), - ( - "sklearn_fork.cluster.spectral_clustering", - "sklearn_fork.cluster.SpectralClustering", - ), - ( - "sklearn_fork.covariance.graphical_lasso", - "sklearn_fork.covariance.GraphicalLasso", - ), + ("sklearn_fork.cluster.spectral_clustering", "sklearn_fork.cluster.SpectralClustering"), + ("sklearn_fork.covariance.graphical_lasso", "sklearn_fork.covariance.GraphicalLasso"), ("sklearn_fork.covariance.ledoit_wolf", "sklearn_fork.covariance.LedoitWolf"), ("sklearn_fork.covariance.oas", "sklearn_fork.covariance.OAS"), - ( - "sklearn_fork.decomposition.dict_learning", - "sklearn_fork.decomposition.DictionaryLearning", - ), + ("sklearn_fork.decomposition.dict_learning", "sklearn_fork.decomposition.DictionaryLearning"), ("sklearn_fork.decomposition.fastica", "sklearn_fork.decomposition.FastICA"), - ( - "sklearn_fork.decomposition.non_negative_factorization", - "sklearn_fork.decomposition.NMF", - ), - ( - "sklearn_fork.preprocessing.minmax_scale", - "sklearn_fork.preprocessing.MinMaxScaler", - ), - ( - "sklearn_fork.preprocessing.power_transform", - "sklearn_fork.preprocessing.PowerTransformer", - ), + 
("sklearn_fork.decomposition.non_negative_factorization", "sklearn_fork.decomposition.NMF"), + ("sklearn_fork.preprocessing.minmax_scale", "sklearn_fork.preprocessing.MinMaxScaler"), + ("sklearn_fork.preprocessing.power_transform", "sklearn_fork.preprocessing.PowerTransformer"), ( "sklearn_fork.preprocessing.quantile_transform", "sklearn_fork.preprocessing.QuantileTransformer", ), - ( - "sklearn_fork.preprocessing.robust_scale", - "sklearn_fork.preprocessing.RobustScaler", - ), + ("sklearn_fork.preprocessing.robust_scale", "sklearn_fork.preprocessing.RobustScaler"), ] diff --git a/sklearn_fork/utils/_response.py b/sklearn_fork/utils/_response.py index 0e597fb6eb995..16735ba6d1c78 100644 --- a/sklearn_fork/utils/_response.py +++ b/sklearn_fork/utils/_response.py @@ -47,7 +47,7 @@ def _get_response_values( preference. The method returned corresponds to the first method in the list and which is implemented by `estimator`. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The class considered as the positive class when computing the metrics. By default, `estimators.classes_[1]` is considered as the positive class. @@ -58,7 +58,7 @@ def _get_response_values( Target scores calculated from the provided response_method and `pos_label`. - pos_label : str, int or None + pos_label : int, float, bool, str or None The class considered as the positive class when computing the metrics. Returns `None` if `estimator` is a regressor. @@ -133,24 +133,24 @@ def _get_response_values_binary(estimator, X, response_method, pos_label=None): X : {array-like, sparse matrix} of shape (n_samples, n_features) Input values. - response_method: {'auto', 'predict_proba', 'decision_function'} + response_method : {'auto', 'predict_proba', 'decision_function'} Specifies whether to use :term:`predict_proba` or :term:`decision_function` as the target response. 
If set to 'auto', :term:`predict_proba` is tried first and if it does not exist :term:`decision_function` is tried next. - pos_label : str or int, default=None + pos_label : int, float, bool or str, default=None The class considered as the positive class when computing the metrics. By default, `estimators.classes_[1]` is considered as the positive class. Returns ------- - y_pred: ndarray of shape (n_samples,) + y_pred : ndarray of shape (n_samples,) Target scores calculated from the provided response_method and pos_label. - pos_label: str or int + pos_label : int, float, bool or str The class considered as the positive class when computing the metrics. """ diff --git a/sklearn_fork/utils/_set_output.py b/sklearn_fork/utils/_set_output.py index ab4f558e1c2e3..8071544091fca 100644 --- a/sklearn_fork/utils/_set_output.py +++ b/sklearn_fork/utils/_set_output.py @@ -57,7 +57,7 @@ def _wrap_in_pandas_container( data_to_wrap.columns = columns return data_to_wrap - return pd.DataFrame(data_to_wrap, index=index, columns=columns) + return pd.DataFrame(data_to_wrap, index=index, columns=columns, copy=False) def _get_output_config(method, estimator=None): diff --git a/sklearn_fork/utils/_testing.py b/sklearn_fork/utils/_testing.py index 3a67cc2b5f329..3842879c2bac2 100644 --- a/sklearn_fork/utils/_testing.py +++ b/sklearn_fork/utils/_testing.py @@ -843,7 +843,7 @@ def _convert_container(container, constructor_name, columns_name=None, dtype=Non return sp.sparse.csr_matrix(container, dtype=dtype) elif constructor_name == "dataframe": pd = pytest.importorskip("pandas") - return pd.DataFrame(container, columns=columns_name, dtype=dtype) + return pd.DataFrame(container, columns=columns_name, dtype=dtype, copy=False) elif constructor_name == "series": pd = pytest.importorskip("pandas") return pd.Series(container, dtype=dtype) diff --git a/sklearn_fork/utils/estimator_checks.py b/sklearn_fork/utils/estimator_checks.py index 8ec3420f924ec..8e2832ea0a6ab 100644 --- 
a/sklearn_fork/utils/estimator_checks.py +++ b/sklearn_fork/utils/estimator_checks.py @@ -925,11 +925,11 @@ def check_sample_weights_pandas_series(name, estimator_orig): [3, 4], ] ) - X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X)) + X = pd.DataFrame(_enforce_estimator_tags_X(estimator_orig, X), copy=False) y = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2]) weights = pd.Series([1] * 12) if _safe_tags(estimator, key="multioutput_only"): - y = pd.DataFrame(y) + y = pd.DataFrame(y, copy=False) try: estimator.fit(X, y, sample_weight=weights) except ValueError: @@ -3218,10 +3218,10 @@ def check_estimators_data_not_an_array(name, estimator_orig, X, y, obj_type): y_ = np.asarray(y) if y_.ndim == 1: - y_ = pd.Series(y_) + y_ = pd.Series(y_, copy=False) else: - y_ = pd.DataFrame(y_) - X_ = pd.DataFrame(np.asarray(X)) + y_ = pd.DataFrame(y_, copy=False) + X_ = pd.DataFrame(np.asarray(X), copy=False) except ImportError: raise SkipTest( @@ -3897,7 +3897,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): n_samples, n_features = X_orig.shape names = np.array([f"col_{i}" for i in range(n_features)]) - X = pd.DataFrame(X_orig, columns=names) + X = pd.DataFrame(X_orig, columns=names, copy=False) if is_regressor(estimator): y = rng.normal(size=n_samples) @@ -3985,7 +3985,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): early_stopping_enabled = any(value is True for value in params.values()) for invalid_name, additional_message in invalid_names: - X_bad = pd.DataFrame(X, columns=invalid_name) + X_bad = pd.DataFrame(X, columns=invalid_name, copy=False) expected_msg = re.escape( "The feature names should match those that were passed during fit.\n" @@ -4094,7 +4094,7 @@ def check_transformer_get_feature_names_out_pandas(name, transformer_orig): y_[::2, 1] *= 2 feature_names_in = [f"col{i}" for i in range(n_features)] - df = pd.DataFrame(X, columns=feature_names_in) + df = pd.DataFrame(X, columns=feature_names_in, copy=False) 
X_transform = transformer.fit_transform(df, y=y_) # error is raised when `input_features` do not match feature_names_in @@ -4324,7 +4324,7 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): # We always rely on the output of `get_feature_names_out` of the # transformer used to generate the dataframe as a ground-truth of the # columns. - expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas) + expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False) try: pd.testing.assert_frame_equal(df_trans, expected_dataframe) @@ -4359,7 +4359,7 @@ def check_set_output_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in) + df = pd.DataFrame(X, columns=feature_names_in, copy=False) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4401,7 +4401,7 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in) + df = pd.DataFrame(X, columns=feature_names_in, copy=False) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) diff --git a/sklearn_fork/utils/sparsefuncs_fast.pyx b/sklearn_fork/utils/sparsefuncs_fast.pyx index 64b8ee2412fb1..f4e3ff20ab73b 100644 --- a/sklearn_fork/utils/sparsefuncs_fast.pyx +++ b/sklearn_fork/utils/sparsefuncs_fast.pyx @@ -11,9 +11,6 @@ from libc.math cimport fabs, sqrt, isnan cimport numpy as cnp import numpy as np from cython cimport floating -from cython.parallel cimport prange - -from sklearn_fork.utils._openmp_helpers import _openmp_effective_n_threads cnp.import_array() @@ -28,14 +25,12 @@ def 
csr_row_norms(X): """Squared L2 norm of each row in CSR matrix X.""" if X.dtype not in [np.float32, np.float64]: X = X.astype(np.float64) - n_threads = _openmp_effective_n_threads() - return _sqeuclidean_row_norms_sparse(X.data, X.indptr, n_threads) + return _sqeuclidean_row_norms_sparse(X.data, X.indptr) def _sqeuclidean_row_norms_sparse( const floating[::1] X_data, const integral[::1] X_indptr, - int n_threads, ): cdef: integral n_samples = X_indptr.shape[0] - 1 @@ -45,9 +40,10 @@ def _sqeuclidean_row_norms_sparse( cdef floating[::1] squared_row_norms = np.zeros(n_samples, dtype=dtype) - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - for j in range(X_indptr[i], X_indptr[i + 1]): - squared_row_norms[i] += X_data[j] * X_data[j] + with nogil: + for i in range(n_samples): + for j in range(X_indptr[i], X_indptr[i + 1]): + squared_row_norms[i] += X_data[j] * X_data[j] return np.asarray(squared_row_norms) diff --git a/sklearn_fork/utils/validation.py b/sklearn_fork/utils/validation.py index d71ed1dd1d671..a591df754c87a 100644 --- a/sklearn_fork/utils/validation.py +++ b/sklearn_fork/utils/validation.py @@ -595,17 +595,17 @@ def _pandas_dtype_needs_early_conversion(pd_dtype): # Check these early for pandas versions without extension dtypes from pandas.api.types import ( is_bool_dtype, - is_sparse, is_float_dtype, is_integer_dtype, ) + from pandas import SparseDtype if is_bool_dtype(pd_dtype): # bool and extension booleans need early converstion because __array__ # converts mixed dtype dataframes into object dtypes return True - if is_sparse(pd_dtype): + if isinstance(pd_dtype, SparseDtype): # Sparse arrays will be converted later in `check_array` return False @@ -614,7 +614,7 @@ def _pandas_dtype_needs_early_conversion(pd_dtype): except ImportError: return False - if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype): + if isinstance(pd_dtype, SparseDtype) or not is_extension_array_dtype(pd_dtype): # Sparse arrays will be 
converted later in `check_array` # Only handle extension arrays for integer and floats return False @@ -769,7 +769,10 @@ def check_array( # throw warning if columns are sparse. If all columns are sparse, then # array.sparse exists and sparsity will be preserved (later). with suppress(ImportError): - from pandas.api.types import is_sparse + from pandas import SparseDtype + + def is_sparse(dtype): + return isinstance(dtype, SparseDtype) if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any(): warnings.warn( @@ -847,7 +850,10 @@ def check_array( # When all dataframe columns are sparse, convert to a sparse array if hasattr(array, "sparse") and array.ndim > 1: with suppress(ImportError): - from pandas.api.types import is_sparse + from pandas import SparseDtype # noqa: F811 + + def is_sparse(dtype): + return isinstance(dtype, SparseDtype) if array.dtypes.apply(is_sparse).all(): # DataFrame.sparse only supports `to_coo` @@ -2169,14 +2175,14 @@ def _check_pos_label_consistency(pos_label, y_true): Parameters ---------- - pos_label : int, str or None + pos_label : int, float, bool, str or None The positive label. y_true : ndarray of shape (n_samples,) The target vector. Returns ------- - pos_label : int + pos_label : int, float, bool or str If `pos_label` can be inferred, it will be returned. Raises