From e209cb9e81ebf1d87befa97d101b92e2481f5920 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 29 Mar 2022 17:54:45 +0200 Subject: [PATCH 01/43] wip engines --- sklearn/_engine.py | 65 +++++++++++++++++++++++++++++++++++ sklearn/tests/test_engines.py | 19 ++++++++++ 2 files changed, 84 insertions(+) create mode 100644 sklearn/_engine.py create mode 100644 sklearn/tests/test_engines.py diff --git a/sklearn/_engine.py b/sklearn/_engine.py new file mode 100644 index 0000000000000..f9d24a75931c8 --- /dev/null +++ b/sklearn/_engine.py @@ -0,0 +1,65 @@ +from importlib.metadata import entry_points +from importlib import import_module +from contextlib import contextmanager +from functools import lru_cache +import warnings + + +SKLEARN_ENGINES_ENTRY_POINT = "skearn_engines" + + +@contextmanager +def computational_engine(provider_name): + engines = load_engines(provider_name=provider_name) + if not engines: + raise ImportError( + "Could not find entry point in group 'sklearn_engines' for" + f" '{provider_name}'" + ) + # TODO: implement me + yield + + +def _parse_entry_points(provider_name=None): + for entry_point in entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT): + try: + module_name, engine_qualname = entry_point["value"].split(":") + this_provider_name = next(iter(module_name.split(".", 1))) + if provider_name is not None and this_provider_name != provider_name: + # Skip entry points that do not match the requested provider name. 
+ continue + except Exception as e: + warnings.warn( + f"Invalid sklearn_engine entry point: {entry_point['name']}: {e}" + ) + + +@lru_caches +def list_engine_provider_names(): + + for entry_point in entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT): + try: + module_name, engine_qualname = entry_point["value"].split(":") + this_provider_name = next(iter(module_name.split(".", 1))) + + return [ + + ] + + +@lru_cache +def load_engines(provider_name=None): + engines = [] + + engine = import_module(module_name) + for attr in engine_qualname.split("."): + engine = getattr(engine, attr) + engines.append( + { + "name": entry_point["name"], + "provider_name": this_provider_name, + "engine": engine, + } + ) + + return engines diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py new file mode 100644 index 0000000000000..1f749800a1fb5 --- /dev/null +++ b/sklearn/tests/test_engines.py @@ -0,0 +1,19 @@ +from sklearn._engines import list_engine_provider_names +from sklearn._engines import load_engines +import pytest + + +def test_list_engine_provider_names(): + provider_names = list_engine_provider_names() + for provider_name in provider_names: + assert isinstance(provider_name, str) + + +def test_load_engines(): + all_engines = load_engines() + # TODO: write me + +def test_load_engines_invalid_provider(): + provider_name = "some_invalid_test_provider_name" + assert provider_name not in list_engine_provider_names() + assert load_engines(provider_name=provider_name) == [] From 1647eed5b339caf7657d699fe8a82dfc15264ba5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 1 Apr 2022 18:22:31 +0200 Subject: [PATCH 02/43] wip --- sklearn/_engine.py | 90 +++++++++++++++++++---------------- sklearn/tests/test_engines.py | 44 ++++++++++++----- 2 files changed, 82 insertions(+), 52 deletions(-) diff --git a/sklearn/_engine.py b/sklearn/_engine.py index f9d24a75931c8..4501ccc80a744 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -1,3 +1,4 @@ +from 
dataclasses import dataclass from importlib.metadata import entry_points from importlib import import_module from contextlib import contextmanager @@ -8,58 +9,65 @@ SKLEARN_ENGINES_ENTRY_POINT = "skearn_engines" +class EngineSpec: + + __slots__ = ["name", "provider_name", "engine_qualname"] + + def __init__(self, name, provider_name, engine_qualname): + self.name = name + self.provider_name = provider_name + self.engine_qualname = engine_qualname + + def get_engine_class(self): + engine = import_module(self.module_name) + for attr in self.engine_qualname.split("."): + engine = getattr(engine, attr) + return engine + + @contextmanager -def computational_engine(provider_name): - engines = load_engines(provider_name=provider_name) - if not engines: - raise ImportError( - "Could not find entry point in group 'sklearn_engines' for" - f" '{provider_name}'" - ) - # TODO: implement me +def computational_engine(provider_names): + if isinstance(provider_names, str): + provider_names = [provider_names] + # TODO: implement me and complain if no entry point can be found with the given provider name + parsed_entry_points = _parse_entry_points(provider_names=provider_names) + if len(parsed_entry_points) == 0: + raise RuntimeError() yield -def _parse_entry_points(provider_name=None): +def _parse_entry_point(entry_point): + module_name, engine_qualname = entry_point["value"].split(":") + provider_name = next(iter(module_name.split(".", 1))) + return EngineSpec( + entry_point["name"], provider_name, engine_qualname + ) + + +@lru_cache +def _parse_entry_points(provider_names=None): + specs = [] for entry_point in entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT): try: - module_name, engine_qualname = entry_point["value"].split(":") - this_provider_name = next(iter(module_name.split(".", 1))) - if provider_name is not None and this_provider_name != provider_name: - # Skip entry points that do not match the requested provider name. 
+ spec = _parse_entry_point(entry_point) + if provider_names is not None and spec.provider_name in provider_names: + # Skip entry points that do not match the requested provider names. continue + specs.append(spec) except Exception as e: + # Do not raise an exception in case an invalid package has been + # installed in the same Python env as scikit-learn: just warn and + # skip. warnings.warn( - f"Invalid sklearn_engine entry point: {entry_point['name']}: {e}" + f"Invalid sklearn_engine entry point {entry_point['name']} " + f"with value {entry_point['value']}: {e}" ) + return specs -@lru_caches def list_engine_provider_names(): + """Find the list of sklearn_engine provider names - for entry_point in entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT): - try: - module_name, engine_qualname = entry_point["value"].split(":") - this_provider_name = next(iter(module_name.split(".", 1))) - - return [ - - ] - - -@lru_cache -def load_engines(provider_name=None): - engines = [] - - engine = import_module(module_name) - for attr in engine_qualname.split("."): - engine = getattr(engine, attr) - engines.append( - { - "name": entry_point["name"], - "provider_name": this_provider_name, - "engine": engine, - } - ) - - return engines + This function only inspects the metadata and should trigger any module import. 
+ """ + return sorted({spec.provider_name for spec in _parse_entry_points()}) diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index 1f749800a1fb5..d7fbe3a0c3821 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -1,6 +1,36 @@ -from sklearn._engines import list_engine_provider_names -from sklearn._engines import load_engines -import pytest +from sklearn._engine import list_engine_provider_names +from sklearn._engine import _parse_entry_point + + +class FakeEngine: + pass + + +class FakeEngineHolder: + class NestedFakeEngine: + pass + + +def test_get_engine_class(): + fake_entry_point = { + "name": "fake_engine", + "value": "sklearn.tests.test_engines:FakeEngine" + } + spec = _parse_entry_point(fake_entry_point) + assert spec.name == "fake_engine" + assert spec.provider_name == "sklearn" # or should it be scikit-learn? + assert spec.get_engine_class() is FakeEngine + + +def test_get_nested_engine_class(): + fake_entry_point = { + "name": "nested_fake_engine", + "value": "sklearn.tests.test_engines:FakeEngineHolder.NestedFakeEngine" + } + spec = _parse_entry_point(fake_entry_point) + assert spec.name == "fake_engine" + assert spec.provider_name == "sklearn" # or should it be scikit-learn? 
+ assert spec.get_engine_class() is FakeEngineHolder.NestedFakeEngine def test_list_engine_provider_names(): @@ -9,11 +39,3 @@ def test_list_engine_provider_names(): assert isinstance(provider_name, str) -def test_load_engines(): - all_engines = load_engines() - # TODO: write me - -def test_load_engines_invalid_provider(): - provider_name = "some_invalid_test_provider_name" - assert provider_name not in list_engine_provider_names() - assert load_engines(provider_name=provider_name) == [] From 87654c1846b1e6532833e85a94edc62e0f6a8a1e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 4 Apr 2022 16:52:30 +0200 Subject: [PATCH 03/43] fixes --- sklearn/_engine.py | 11 +++++------ sklearn/tests/test_engines.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/_engine.py b/sklearn/_engine.py index 4501ccc80a744..57cac492b1ef8 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -11,11 +11,12 @@ class EngineSpec: - __slots__ = ["name", "provider_name", "engine_qualname"] + __slots__ = ["name", "provider_name", "module_name", "engine_qualname"] - def __init__(self, name, provider_name, engine_qualname): + def __init__(self, name, provider_name, module_name, engine_qualname): self.name = name self.provider_name = provider_name + self.module_name = module_name self.engine_qualname = engine_qualname def get_engine_class(self): @@ -39,15 +40,13 @@ def computational_engine(provider_names): def _parse_entry_point(entry_point): module_name, engine_qualname = entry_point["value"].split(":") provider_name = next(iter(module_name.split(".", 1))) - return EngineSpec( - entry_point["name"], provider_name, engine_qualname - ) + return EngineSpec(entry_point["name"], provider_name, module_name, engine_qualname) @lru_cache def _parse_entry_points(provider_names=None): specs = [] - for entry_point in entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT): + for entry_point in entry_points().select(group=SKLEARN_ENGINES_ENTRY_POINT): try: spec = 
_parse_entry_point(entry_point) if provider_names is not None and spec.provider_name in provider_names: diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index d7fbe3a0c3821..c070f41fe2cb6 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -28,7 +28,7 @@ def test_get_nested_engine_class(): "value": "sklearn.tests.test_engines:FakeEngineHolder.NestedFakeEngine" } spec = _parse_entry_point(fake_entry_point) - assert spec.name == "fake_engine" + assert spec.name == "nested_fake_engine" assert spec.provider_name == "sklearn" # or should it be scikit-learn? assert spec.get_engine_class() is FakeEngineHolder.NestedFakeEngine From b05d4ab38f55623039d3ae4eaf16d48d430a7945 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 4 Apr 2022 19:00:31 +0200 Subject: [PATCH 04/43] wip --- sklearn/_config.py | 6 +++ sklearn/_engine.py | 59 +++++++++++++++++++++------- sklearn/tests/test_engines.py | 73 ++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 16 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index c865b879dbea3..b797c932f9d0e 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -13,6 +13,7 @@ os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256) ), "enable_cython_pairwise_dist": True, + "engine_provider": (), } _threadlocal = threading.local() @@ -50,6 +51,7 @@ def set_config( display=None, pairwise_dist_chunk_size=None, enable_cython_pairwise_dist=None, + engine_provider=None, ): """Set global scikit-learn configuration @@ -128,6 +130,8 @@ def set_config( local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size if enable_cython_pairwise_dist is not None: local_config["enable_cython_pairwise_dist"] = enable_cython_pairwise_dist + if engine_provider is not None: + local_config["engine_provider"] = engine_provider @contextmanager @@ -139,6 +143,7 @@ def config_context( display=None, pairwise_dist_chunk_size=None, enable_cython_pairwise_dist=None, + 
engine_provider=None, ): """Context manager for global scikit-learn configuration. @@ -232,6 +237,7 @@ def config_context( display=display, pairwise_dist_chunk_size=pairwise_dist_chunk_size, enable_cython_pairwise_dist=enable_cython_pairwise_dist, + engine_provider=engine_provider, ) try: diff --git a/sklearn/_engine.py b/sklearn/_engine.py index 57cac492b1ef8..8b957e4b62c68 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -1,10 +1,10 @@ -from dataclasses import dataclass from importlib.metadata import entry_points from importlib import import_module from contextlib import contextmanager from functools import lru_cache import warnings +from sklearn._config import get_config SKLEARN_ENGINES_ENTRY_POINT = "skearn_engines" @@ -26,17 +26,6 @@ def get_engine_class(self): return engine -@contextmanager -def computational_engine(provider_names): - if isinstance(provider_names, str): - provider_names = [provider_names] - # TODO: implement me and complain if no entry point can be found with the given provider name - parsed_entry_points = _parse_entry_points(provider_names=provider_names) - if len(parsed_entry_points) == 0: - raise RuntimeError() - yield - - def _parse_entry_point(entry_point): module_name, engine_qualname = entry_point["value"].split(":") provider_name = next(iter(module_name.split(".", 1))) @@ -58,8 +47,16 @@ def _parse_entry_points(provider_names=None): # installed in the same Python env as scikit-learn: just warn and # skip. 
warnings.warn( - f"Invalid sklearn_engine entry point {entry_point['name']} " - f"with value {entry_point['value']}: {e}" + f"Invalid {SKLEARN_ENGINES_ENTRY_POINT} entry point" + f" {entry_point['name']} with value {entry_point['value']}: {e}" + ) + if provider_names is not None: + observed_provider_names = {spec.provider_name for spec in specs} + missing_providers = set(provider_names) - observed_provider_names + if missing_providers: + raise RuntimeError( + f"Could not find any provider for the {SKLEARN_ENGINES_ENTRY_POINT}" + f" entry point with name(s): {', '.join(sorted(missing_providers))}" ) return specs @@ -70,3 +67,37 @@ def list_engine_provider_names(): This function only inspects the metadata and should trigger any module import. """ return sorted({spec.provider_name for spec in _parse_entry_points()}) + + +def _get_engine_class(engine_name, provider_names, engine_specs): + specs_by_provider = {} + for spec in engine_specs: + if spec.name != engine_name: + continue + specs_by_provider.setdefault(spec.provider_name, spec) + + for provider_name in provider_names: + spec = specs_by_provider.get(provider_name) + if spec is not None: + # XXX: should we return an instance or the class itself? + return spec.get_engine_class() + + return None + + +def get_engine_class(engine_name): + provider_names = get_config()["engine_provider"] + if isinstance(provider_names, str): + provider_names = (provider_names,) + elif not isinstance(provider_names, tuple): + # Make sure the provider names are a tuple to make it possible for the + # lru cache to hash them. 
+ provider_names = tuple(provider_names) + if not provider_names: + return None + engine_specs = _parse_entry_points(provider_names=provider_names) + return _get_engine_class( + engine_name=engine_name, + provider_names=provider_names, + engine_specs=engine_specs, + ) diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index c070f41fe2cb6..122ec9ed8297e 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -1,5 +1,12 @@ +import re +import pytest + from sklearn._engine import list_engine_provider_names from sklearn._engine import _parse_entry_point +from sklearn._engine import get_engine_class +from sklearn._engine import _get_engine_class +from sklearn._engine import EngineSpec +from sklearn._config import config_context class FakeEngine: @@ -14,7 +21,7 @@ class NestedFakeEngine: def test_get_engine_class(): fake_entry_point = { "name": "fake_engine", - "value": "sklearn.tests.test_engines:FakeEngine" + "value": "sklearn.tests.test_engines:FakeEngine", } spec = _parse_entry_point(fake_entry_point) assert spec.name == "fake_engine" @@ -25,7 +32,7 @@ def test_get_engine_class(): def test_get_nested_engine_class(): fake_entry_point = { "name": "nested_fake_engine", - "value": "sklearn.tests.test_engines:FakeEngineHolder.NestedFakeEngine" + "value": "sklearn.tests.test_engines:FakeEngineHolder.NestedFakeEngine", } spec = _parse_entry_point(fake_entry_point) assert spec.name == "nested_fake_engine" @@ -39,3 +46,65 @@ def test_list_engine_provider_names(): assert isinstance(provider_name, str) +def test_get_engine_class_for_invalid_provider(): + expected_message = re.escape( + "Could not find any provider for the skearn_engines entry point with" + " name(s): invalid_provider_name" + ) + with pytest.raises(RuntimeError, match=expected_message): + with config_context(engine_provider="invalid_provider_name"): + get_engine_class("kmeans") + + expected_message = re.escape( + "Could not find any provider for the skearn_engines 
entry point with" + " name(s): invalid_provider_name_1, invalid_provider_name_2" + ) + with pytest.raises(RuntimeError, match=expected_message): + with config_context( + engine_provider=("invalid_provider_name_1", "invalid_provider_name_2") + ): + get_engine_class("kmeans") + + +def test_get_engine_class(): + engine_specs = ( + EngineSpec("kmeans", "provider1", "sklearn.provider1.module", "KMeansEngine"), + EngineSpec("other", "provider1", "sklearn.provider1.module", "OtherEngine"), + EngineSpec("kmeans", "provider2", "sklearn.provider2.module", "KMeansEngine"), + EngineSpec("kmeans", "provider3", "sklearn.tests.test_engines", "FakeEngine"), + EngineSpec( + "kmeans", + "provider4", + "sklearn.tests.test_engines", + "FakeEngineHolder.NestedFakeEngine", + ), + ) + + engine_class = _get_engine_class( + engine_name="missing", + provider_names=("provider1", "provider3"), + engine_specs=engine_specs, + ) + assert engine_class is None + + engine_class = _get_engine_class( + engine_name="kmeans", + provider_names=("provider3", "provider4", "provider1", "provider2"), + engine_specs=engine_specs, + ) + assert engine_class == FakeEngine + + engine_class = _get_engine_class( + engine_name="kmeans", + provider_names=("provider4", "provider3", "provider1", "provider2"), + engine_specs=engine_specs, + ) + assert engine_class == FakeEngineHolder.NestedFakeEngine + + with pytest.raises(ImportError): + # Invalid imports are delayed until they are actually needed. 
+ _get_engine_class( + engine_name="kmeans", + provider_names=("provider1", "provider3"), + engine_specs=engine_specs, + ) From 828f7971cfed0ec82fc9a6317a6ea36b7f73cf7d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 4 Apr 2022 19:37:43 +0200 Subject: [PATCH 05/43] more specific assertion --- sklearn/tests/test_engines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index 122ec9ed8297e..f784149722dd7 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -101,7 +101,7 @@ def test_get_engine_class(): ) assert engine_class == FakeEngineHolder.NestedFakeEngine - with pytest.raises(ImportError): + with pytest.raises(ImportError, match=re.escape("sklearn.provider1")): # Invalid imports are delayed until they are actually needed. _get_engine_class( engine_name="kmeans", From 5819524d9fcc476f1d7bbc14ec4985833d9c748f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 5 Apr 2022 12:08:11 +0200 Subject: [PATCH 06/43] Add docstring to the config context --- sklearn/_config.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sklearn/_config.py b/sklearn/_config.py index b797c932f9d0e..ba91acfd89ed4 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -111,6 +111,15 @@ def set_config( .. versionadded:: 1.1 + engine_provider : str or sequence of str, default=None + Enable computational engine implementation provided by third party + packages to leverage specific hardware platforms using frameworks or + libraries outside of the usual scikit-learn project dependencies. + + TODO: add link to doc + + .. versionadded:: 1.2 + See Also -------- config_context : Context manager for global scikit-learn configuration. @@ -202,6 +211,15 @@ def config_context( .. 
versionadded:: 1.1 + engine_provider : str or sequence of str, default=None + Enable computational engine implementation provided by third party + packages to leverage specific hardware platforms using frameworks or + libraries outside of the usual scikit-learn project dependencies. + + TODO: add link to doc + + .. versionadded:: 1.2 + Yields ------ None. From 5df598ce4731cefc471e417cbedf86f538992008 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 6 Apr 2022 09:39:34 +0200 Subject: [PATCH 07/43] add default kwarg --- sklearn/_engine.py | 9 +++++---- sklearn/cluster/_kmeans.py | 15 +++++++++++++++ sklearn/tests/test_engines.py | 8 ++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/sklearn/_engine.py b/sklearn/_engine.py index 8b957e4b62c68..fe36b26fc3020 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -69,7 +69,7 @@ def list_engine_provider_names(): return sorted({spec.provider_name for spec in _parse_entry_points()}) -def _get_engine_class(engine_name, provider_names, engine_specs): +def _get_engine_class(engine_name, provider_names, engine_specs, default=None): specs_by_provider = {} for spec in engine_specs: if spec.name != engine_name: @@ -82,10 +82,10 @@ def _get_engine_class(engine_name, provider_names, engine_specs): # XXX: should we return an instance or the class itself? return spec.get_engine_class() - return None + return default -def get_engine_class(engine_name): +def get_engine_class(engine_name, default=None): provider_names = get_config()["engine_provider"] if isinstance(provider_names, str): provider_names = (provider_names,) @@ -94,10 +94,11 @@ def get_engine_class(engine_name): # lru cache to hash them. 
provider_names = tuple(provider_names) if not provider_names: - return None + return default engine_specs = _parse_entry_points(provider_names=provider_names) return _get_engine_class( engine_name=engine_name, provider_names=provider_names, engine_specs=engine_specs, + default=default, ) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index ae55e4b58bfb1..779df80390f58 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -49,6 +49,7 @@ from ._k_means_elkan import init_bounds_sparse from ._k_means_elkan import elkan_iter_chunked_dense from ._k_means_elkan import elkan_iter_chunked_sparse +from .._engine import get_engine_class ############################################################################### @@ -260,6 +261,17 @@ def _tolerance(X, tol): return np.mean(variances) * tol +class KMeansCythonEngine: + """Cython-based implementation of the core k-means routines + + This implementation is meant to be swappable by alternative implementations + in third-party packages via the sklearn_engines entry-point and the + `engine_provider` kwarg of `sklearn.config_context`. + + TODO: see URL for more details. + """ + + def k_means( X, n_clusters, @@ -1347,6 +1359,9 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. 
""" + engine_class = get_engine_class("kmeans", default=KMeansCythonEngine) + engine = engine_class(self) + X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index f784149722dd7..44b0923f5eccc 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -46,6 +46,14 @@ def test_list_engine_provider_names(): assert isinstance(provider_name, str) +def test_get_engine_class_with_default(): + # Use config_context with an empty provider tuple to make sure that not provider + # are available for test_missing_engine_name + with config_context(engine_provider=()): + engine_class = get_engine_class("test_missing_engine_name", default=FakeEngine) + assert engine_class is FakeEngine + + def test_get_engine_class_for_invalid_provider(): expected_message = re.escape( "Could not find any provider for the skearn_engines entry point with" From 44cbd6ca52401f93a9328dea7c275d0149b2313c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 15 Apr 2022 12:02:28 +0200 Subject: [PATCH 08/43] Various fixes --- sklearn/_engine.py | 21 ++++++++++++++------- sklearn/tests/test_engines.py | 28 ++++++++++++++++------------ 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/sklearn/_engine.py b/sklearn/_engine.py index fe36b26fc3020..67335de835efd 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -2,11 +2,12 @@ from importlib import import_module from contextlib import contextmanager from functools import lru_cache +from ssl import ALERT_DESCRIPTION_BAD_CERTIFICATE_HASH_VALUE import warnings from sklearn._config import get_config -SKLEARN_ENGINES_ENTRY_POINT = "skearn_engines" +SKLEARN_ENGINES_ENTRY_POINT = "sklearn_engines" class EngineSpec: @@ -27,15 +28,20 @@ def get_engine_class(self): def _parse_entry_point(entry_point): - module_name, engine_qualname = entry_point["value"].split(":") + module_name, engine_qualname = entry_point.value.split(":") provider_name = 
next(iter(module_name.split(".", 1))) - return EngineSpec(entry_point["name"], provider_name, module_name, engine_qualname) + return EngineSpec(entry_point.name, provider_name, module_name, engine_qualname) @lru_cache def _parse_entry_points(provider_names=None): specs = [] - for entry_point in entry_points().select(group=SKLEARN_ENGINES_ENTRY_POINT): + all_entry_points = entry_points() + if hasattr(all_entry_points, "select"): + engine_entry_points = all_entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT) + else: + engine_entry_points = all_entry_points[SKLEARN_ENGINES_ENTRY_POINT] + for entry_point in engine_entry_points: try: spec = _parse_entry_point(entry_point) if provider_names is not None and spec.provider_name in provider_names: @@ -48,15 +54,16 @@ def _parse_entry_points(provider_names=None): # skip. warnings.warn( f"Invalid {SKLEARN_ENGINES_ENTRY_POINT} entry point" - f" {entry_point['name']} with value {entry_point['value']}: {e}" + f" {entry_point.name} with value {entry_point.value}: {e}" ) if provider_names is not None: observed_provider_names = {spec.provider_name for spec in specs} missing_providers = set(provider_names) - observed_provider_names if missing_providers: raise RuntimeError( - f"Could not find any provider for the {SKLEARN_ENGINES_ENTRY_POINT}" - f" entry point with name(s): {', '.join(sorted(missing_providers))}" + "Could not find any provider for the" + f" {SKLEARN_ENGINES_ENTRY_POINT} entry point with name(s):" + f" {', '.join(repr(p) for p in sorted(missing_providers))}" ) return specs diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index 44b0923f5eccc..dd1548b69f124 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -1,4 +1,5 @@ import re +from collections import namedtuple import pytest from sklearn._engine import list_engine_provider_names @@ -18,11 +19,14 @@ class NestedFakeEngine: pass +FakeEntryPoint = namedtuple('FakeEntryPoint', ['name', 'value']) + + def 
test_get_engine_class(): - fake_entry_point = { - "name": "fake_engine", - "value": "sklearn.tests.test_engines:FakeEngine", - } + fake_entry_point = FakeEntryPoint( + name="fake_engine", + value="sklearn.tests.test_engines:FakeEngine", + ) spec = _parse_entry_point(fake_entry_point) assert spec.name == "fake_engine" assert spec.provider_name == "sklearn" # or should it be scikit-learn? @@ -30,10 +34,10 @@ def test_get_engine_class(): def test_get_nested_engine_class(): - fake_entry_point = { - "name": "nested_fake_engine", - "value": "sklearn.tests.test_engines:FakeEngineHolder.NestedFakeEngine", - } + fake_entry_point = FakeEntryPoint( + name="nested_fake_engine", + value="sklearn.tests.test_engines:FakeEngineHolder.NestedFakeEngine", + ) spec = _parse_entry_point(fake_entry_point) assert spec.name == "nested_fake_engine" assert spec.provider_name == "sklearn" # or should it be scikit-learn? @@ -56,16 +60,16 @@ def test_get_engine_class_with_default(): def test_get_engine_class_for_invalid_provider(): expected_message = re.escape( - "Could not find any provider for the skearn_engines entry point with" - " name(s): invalid_provider_name" + "Could not find any provider for the sklearn_engines entry point with" + " name(s): 'invalid_provider_name'" ) with pytest.raises(RuntimeError, match=expected_message): with config_context(engine_provider="invalid_provider_name"): get_engine_class("kmeans") expected_message = re.escape( - "Could not find any provider for the skearn_engines entry point with" - " name(s): invalid_provider_name_1, invalid_provider_name_2" + "Could not find any provider for the sklearn_engines entry point with" + " name(s): 'invalid_provider_name_1', 'invalid_provider_name_2'" ) with pytest.raises(RuntimeError, match=expected_message): with config_context( From a693d2e0c9b9546a81ece135791153412aea3ecc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 19 Apr 2022 10:49:58 +0200 Subject: [PATCH 09/43] empty doc --- doc/modules/engine.rst | 11 
+++++++++++ 1 file changed, 11 insertions(+) create mode 100644 doc/modules/engine.rst diff --git a/doc/modules/engine.rst b/doc/modules/engine.rst new file mode 100644 index 0000000000000..401da467ab6a1 --- /dev/null +++ b/doc/modules/engine.rst @@ -0,0 +1,11 @@ +.. Places parent toc into the sidebar +:parenttoc: True + +.. _engine: + + +================================== +Computation Engines (experimental) +================================== + + From 1dbae5bf06a92503747e5b0ed7c5ce2e9851fc54 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 21 May 2022 14:42:26 +0200 Subject: [PATCH 10/43] WIP --- sklearn/cluster/_kmeans.py | 96 ++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index d3e64f27ebf53..1985a4ef069b6 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -269,7 +269,7 @@ def _tolerance(X, tol): class KMeansCythonEngine: """Cython-based implementation of the core k-means routines - + This implementation is meant to be swappable by alternative implementations in third-party packages via the sklearn_engines entry-point and the `engine_provider` kwarg of `sklearn.config_context`. @@ -277,6 +277,57 @@ class KMeansCythonEngine: TODO: see URL for more details. 
""" + def _prepare_fit(self, estimator, X, y=None, sample_weight=None): + engine_fit_context = {} + X = estimator._validate_data( + X, + accept_sparse="csr", + dtype=[np.float64, np.float32], + order="C", + copy=estimator.copy_x, + accept_large_sparse=False, + ) + # TODO: delegate rng and sample weight checks to engine + random_state = check_random_state(estimator.random_state) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + self._n_threads = _openmp_effective_n_threads() + + # Validate init array + init = estimator.init + init_is_array_like = _is_arraylike_not_scalar(init) + if init_is_array_like: + init = check_array(init, dtype=X.dtype, copy=True, order="C") + estimator._validate_center_shape(X, init) + + # subtract of mean of x for more accurate distance computations + if not sp.issparse(X): + X_mean = X.mean(axis=0) + # The copy was already done above + X -= X_mean + + if init_is_array_like: + init -= X_mean + + # precompute squared norms of data points + x_squared_norms = row_norms(X, squared=True) + + if estimator._algorithm == "elkan": + kmeans_single = _kmeans_single_elkan + else: + kmeans_single = _kmeans_single_lloyd + estimator._check_mkl_vcomp(X, X.shape[0]) + + engine_fit_context = { + "x_squared_norms": x_squared_norms, + "kmeans_single_func": kmeans_single, + "random_state": random_state, + } + return X, init, engine_fit_context + + def _init_centroids(self, estimator, *args, **kwargs): + # XXX: this implementation should be part of the engine. 
+ return estimator._init_centroids(*args, **kwargs) + def k_means( X, @@ -1376,53 +1427,20 @@ def fit(self, X, y=None, sample_weight=None): self._validate_params() engine_class = get_engine_class("kmeans", default=KMeansCythonEngine) engine = engine_class(self) - X = self._validate_data( + X, y, sample_weight, engine_fit_ctx = engine._prepare_fit( + self, X, - accept_sparse="csr", - dtype=[np.float64, np.float32], - order="C", - copy=self.copy_x, - accept_large_sparse=False, + y=y, + sample_weight=sample_weight, ) self._check_params_vs_input(X) - random_state = check_random_state(self.random_state) - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - self._n_threads = _openmp_effective_n_threads() - - # Validate init array - init = self.init - init_is_array_like = _is_arraylike_not_scalar(init) - if init_is_array_like: - init = check_array(init, dtype=X.dtype, copy=True, order="C") - self._validate_center_shape(X, init) - - # subtract of mean of x for more accurate distance computations - if not sp.issparse(X): - X_mean = X.mean(axis=0) - # The copy was already done above - X -= X_mean - - if init_is_array_like: - init -= X_mean - - # precompute squared norms of data points - x_squared_norms = row_norms(X, squared=True) - - if self._algorithm == "elkan": - kmeans_single = _kmeans_single_elkan - else: - kmeans_single = _kmeans_single_lloyd - self._check_mkl_vcomp(X, X.shape[0]) - best_inertia, best_labels = None, None for i in range(self._n_init): # Initialize centers - centers_init = self._init_centroids( - X, x_squared_norms=x_squared_norms, init=init, random_state=random_state - ) + centers_init = engine._init_centroids(X, **engine_fit_ctx) if self.verbose: print("Initialization complete") From dd586f13f0de77ff1b762964727b1407d492a21b Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 3 Jun 2022 10:37:57 +0200 Subject: [PATCH 11/43] wip --- sklearn/cluster/_kmeans.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 0f64270e129d5..7e10529091db3 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -325,7 +325,8 @@ def _prepare_fit(self, estimator, X, y=None, sample_weight=None): return X, init, engine_fit_context def _init_centroids(self, estimator, *args, **kwargs): - # XXX: this implementation should be part of the engine. + # XXX: the actual implementation of the centroids init should also be + # moved to the engine. return estimator._init_centroids(*args, **kwargs) @@ -1502,14 +1503,14 @@ def fit(self, X, y=None, sample_weight=None): print("Initialization complete") # run a k-means once - labels, inertia, centers, n_iter_ = kmeans_single( + labels, inertia, centers, n_iter_ = engine.kmeans_single( X, sample_weight, centers_init, max_iter=self.max_iter, verbose=self.verbose, tol=self._tol, - x_squared_norms=x_squared_norms, + x_squared_norms=engine_fit_ctx["x_squared_norms"], n_threads=self._n_threads, ) From e3c105621f745a47c30eeb8eb2fdbf4ceb6fb4cb Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 8 Jun 2022 09:05:11 +0200 Subject: [PATCH 12/43] Move tolerance computation to the engine --- sklearn/cluster/_kmeans.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 7e10529091db3..3c14635264325 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -256,15 +256,6 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trial # K-means batch estimation by EM (expectation maximization) -def _tolerance(X, tol): - """Return a tolerance which is dependent on the dataset.""" - if tol == 0: - return 0 - if sp.issparse(X): - variances = mean_variance_axis(X, axis=0)[1] - else: - variances = np.var(X, axis=0) - return np.mean(variances) * tol class KMeansCythonEngine: @@ -290,7 +281,12 @@ def _prepare_fit(self, estimator, X, y=None, 
sample_weight=None): # TODO: delegate rng and sample weight checks to engine random_state = check_random_state(estimator.random_state) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) - self._n_threads = _openmp_effective_n_threads() + + # Also store the number of threads on the estimator to be reused at + # prediction time XXX: shall we wrap engine-specific private fit + # attributes in a predict context dict set as attribute on the + # estimator? + estimator._n_threads, self._n_threads = _openmp_effective_n_threads() # Validate init array init = estimator.init @@ -321,6 +317,7 @@ def _prepare_fit(self, estimator, X, y=None, sample_weight=None): "x_squared_norms": x_squared_norms, "kmeans_single_func": kmeans_single, "random_state": random_state, + "tol": self._tolerance(X, self.tol) } return X, init, engine_fit_context @@ -329,6 +326,16 @@ def _init_centroids(self, estimator, *args, **kwargs): # moved to the engine. return estimator._init_centroids(*args, **kwargs) + def _tolerance(self, X, tol): + """Return a tolerance which is dependent on the dataset.""" + if tol == 0: + return 0 + if sp.issparse(X): + variances = mean_variance_axis(X, axis=0)[1] + else: + variances = np.var(X, axis=0) + return np.mean(variances) * tol + @validate_params( { @@ -933,9 +940,6 @@ def _check_params_vs_input(self, X, default_n_init=None): f"n_samples={X.shape[0]} should be >= n_clusters={self.n_clusters}." 
) - # tol - self._tol = _tolerance(X, self.tol) - # n-init # TODO(1.4): Remove self._n_init = self.n_init @@ -1491,7 +1495,6 @@ def fit(self, X, y=None, sample_weight=None): y=y, sample_weight=sample_weight, ) - self._check_params_vs_input(X) best_inertia, best_labels = None, None @@ -1509,9 +1512,7 @@ def fit(self, X, y=None, sample_weight=None): centers_init, max_iter=self.max_iter, verbose=self.verbose, - tol=self._tol, - x_squared_norms=engine_fit_ctx["x_squared_norms"], - n_threads=self._n_threads, + **engine_fit_ctx, ) # determine if these results are the best so far From bd280ef7f490cf885abf1f16fff2ea0e669454be Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 8 Jun 2022 10:42:17 +0200 Subject: [PATCH 13/43] wip --- sklearn/cluster/_kmeans.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 3c14635264325..25182926941c7 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -317,7 +317,8 @@ def _prepare_fit(self, estimator, X, y=None, sample_weight=None): "x_squared_norms": x_squared_norms, "kmeans_single_func": kmeans_single, "random_state": random_state, - "tol": self._tolerance(X, self.tol) + "tol": self._scale_tolerance(X, self.tol), + "X_mean": X_mean, } return X, init, engine_fit_context @@ -326,7 +327,7 @@ def _init_centroids(self, estimator, *args, **kwargs): # moved to the engine. 
return estimator._init_centroids(*args, **kwargs) - def _tolerance(self, X, tol): + def _scale_tolerance(self, X, tol): """Return a tolerance which is dependent on the dataset.""" if tol == 0: return 0 @@ -1529,10 +1530,12 @@ def fit(self, X, y=None, sample_weight=None): best_inertia = inertia best_n_iter = n_iter_ - if not sp.issparse(X): - if not self.copy_x: - X += X_mean - best_centers += X_mean + engine._rescale_centers_and_data(X, best_centers, engine_fit_ctx[]) + # XXX: + # if not sp.issparse(X): + # if not self.copy_x: + # X += X_mean + # best_centers += X_mean distinct_clusters = len(set(best_labels)) if distinct_clusters < self.n_clusters: From 3352c58d271b19faf0cb3a6fb566d428bc8a7ae1 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 22 Sep 2022 16:22:33 +0200 Subject: [PATCH 14/43] wip --- sklearn/cluster/_kmeans.py | 54 ++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 739600899808d..737019038a1bc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -267,8 +267,7 @@ class KMeansCythonEngine: TODO: see URL for more details. 
""" - def _prepare_fit(self, estimator, X, y=None, sample_weight=None): - engine_fit_context = {} + def prepare_fit(self, estimator, X, y=None, sample_weight=None): X = estimator._validate_data( X, accept_sparse="csr", @@ -312,21 +311,26 @@ def _prepare_fit(self, estimator, X, y=None, sample_weight=None): kmeans_single = _kmeans_single_lloyd estimator._check_mkl_vcomp(X, X.shape[0]) - engine_fit_context = { - "x_squared_norms": x_squared_norms, - "kmeans_single_func": kmeans_single, - "random_state": random_state, - "tol": self._scale_tolerance(X, self.tol), - "X_mean": X_mean, - } - return X, init, engine_fit_context + self.estimator = estimator + self.x_squared_norms = x_squared_norms + self.kmeans_single_func = kmeans_single + self.random_state = random_state + self.tol = self.scale_tolerance(X, self.tol) + self.X_mean = X_mean + self.init = init + return X, y, sample_weight - def _init_centroids(self, estimator, *args, **kwargs): + def init_centroids(self, X): # XXX: the actual implementation of the centroids init should also be # moved to the engine. 
- return estimator._init_centroids(*args, **kwargs) + return self.estimator._init_centroids( + X, + x_squared_norms=self.x_squared_norms, + init=self.init, + random_state=self.random_state, + ) - def _scale_tolerance(self, X, tol): + def scale_tolerance(self, X, tol): """Return a tolerance which is dependent on the dataset.""" if tol == 0: return 0 @@ -336,6 +340,16 @@ def _scale_tolerance(self, X, tol): variances = np.var(X, axis=0) return np.mean(variances) * tol + def unshift_centers(self, estimator, X, best_centers): + X_mean = self.engine_fit_context["X_mean"] + if not sp.issparse(X): + if not estimator.copy_x: + X += X_mean + best_centers += X_mean + + def is_same_clustering(self, labels, best_labels, n_clusters): + return _is_same_clustering(labels, best_labels, n_clusters) + @validate_params( { @@ -1476,7 +1490,7 @@ def fit(self, X, y=None, sample_weight=None): self._validate_params() engine_class = get_engine_class("kmeans", default=KMeansCythonEngine) engine = engine_class(self) - X, y, sample_weight, engine_fit_ctx = engine._prepare_fit( + X, y, sample_weight = engine.prepare_fit( self, X, y=y, @@ -1488,7 +1502,7 @@ def fit(self, X, y=None, sample_weight=None): for i in range(self._n_init): # Initialize centers - centers_init = engine._init_centroids(X, **engine_fit_ctx) + centers_init = engine.init_centroids(X) if self.verbose: print("Initialization complete") @@ -1499,7 +1513,6 @@ def fit(self, X, y=None, sample_weight=None): centers_init, max_iter=self.max_iter, verbose=self.verbose, - **engine_fit_ctx, ) # determine if these results are the best so far @@ -1509,19 +1522,14 @@ def fit(self, X, y=None, sample_weight=None): # permuted labels, due to rounding errors) if best_inertia is None or ( inertia < best_inertia - and not _is_same_clustering(labels, best_labels, self.n_clusters) + and not engine.is_same_clustering(labels, best_labels, self.n_clusters) ): best_labels = labels best_centers = centers best_inertia = inertia best_n_iter = n_iter_ - 
engine._rescale_centers_and_data(X, best_centers, engine_fit_ctx) - # XXX: - # if not sp.issparse(X): - # if not self.copy_x: - # X += X_mean - # best_centers += X_mean + engine.unshift_centers(X, best_centers) distinct_clusters = len(set(best_labels)) if distinct_clusters < self.n_clusters: From 475314356f9506dc00b7aa5116e38d2426bab5f3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 22 Sep 2022 16:43:59 +0200 Subject: [PATCH 15/43] wip --- sklearn/cluster/_kmeans.py | 42 ++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 737019038a1bc..2a915646b5fab 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -267,7 +267,11 @@ class KMeansCythonEngine: TODO: see URL for more details. """ - def prepare_fit(self, estimator, X, y=None, sample_weight=None): + def __init__(self, estimator): + self.estimator = estimator + + def prepare_fit(self, X, y=None, sample_weight=None): + estimator = self.estimator X = estimator._validate_data( X, accept_sparse="csr", @@ -276,6 +280,11 @@ def prepare_fit(self, estimator, X, y=None, sample_weight=None): copy=estimator.copy_x, accept_large_sparse=False, ) + # this sets estimator _algorithm implicitly + # XXX: shall we explose this logic as part of then engine API? + # or is the current API flexible enough? + estimator._check_params_vs_input(X) + # TODO: delegate rng and sample weight checks to engine random_state = check_random_state(estimator.random_state) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) @@ -284,7 +293,7 @@ def prepare_fit(self, estimator, X, y=None, sample_weight=None): # prediction time XXX: shall we wrap engine-specific private fit # attributes in a predict context dict set as attribute on the # estimator? 
- estimator._n_threads, self._n_threads = _openmp_effective_n_threads() + estimator._n_threads = self._n_threads = _openmp_effective_n_threads() # Validate init array init = estimator.init @@ -302,6 +311,8 @@ def prepare_fit(self, estimator, X, y=None, sample_weight=None): if init_is_array_like: init -= X_mean + self.X_mean = X_mean + # precompute squared norms of data points x_squared_norms = row_norms(X, squared=True) @@ -311,12 +322,10 @@ def prepare_fit(self, estimator, X, y=None, sample_weight=None): kmeans_single = _kmeans_single_lloyd estimator._check_mkl_vcomp(X, X.shape[0]) - self.estimator = estimator self.x_squared_norms = x_squared_norms self.kmeans_single_func = kmeans_single self.random_state = random_state - self.tol = self.scale_tolerance(X, self.tol) - self.X_mean = X_mean + self.tol = self.scale_tolerance(X, estimator.tol) self.init = init return X, y, sample_weight @@ -340,16 +349,26 @@ def scale_tolerance(self, X, tol): variances = np.var(X, axis=0) return np.mean(variances) * tol - def unshift_centers(self, estimator, X, best_centers): - X_mean = self.engine_fit_context["X_mean"] + def unshift_centers(self, X, best_centers): if not sp.issparse(X): - if not estimator.copy_x: - X += X_mean - best_centers += X_mean + if not self.estimator.copy_x: + X += self.X_mean + best_centers += self.X_mean def is_same_clustering(self, labels, best_labels, n_clusters): return _is_same_clustering(labels, best_labels, n_clusters) + def kmeans_single(self, X, sample_weight, centers_init): + return self.kmeans_single_func( + X, + sample_weight, + centers_init, + max_iter=self.estimator.max_iter, + tol=self.tol, + n_threads=self._n_threads, + verbose=self.estimator.verbose, + ) + @validate_params( { @@ -1491,7 +1510,6 @@ def fit(self, X, y=None, sample_weight=None): engine_class = get_engine_class("kmeans", default=KMeansCythonEngine) engine = engine_class(self) X, y, sample_weight = engine.prepare_fit( - self, X, y=y, sample_weight=sample_weight, @@ -1511,8 
+1529,6 @@ def fit(self, X, y=None, sample_weight=None): X, sample_weight, centers_init, - max_iter=self.max_iter, - verbose=self.verbose, ) # determine if these results are the best so far From 2794d26546e7acd4f4c7714f14583c422b02f9de Mon Sep 17 00:00:00 2001 From: Franck Charras Date: Fri, 23 Sep 2022 09:10:09 +0200 Subject: [PATCH 16/43] linting --- sklearn/tests/test_engines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index dd1548b69f124..aa50867b068a8 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -19,7 +19,7 @@ class NestedFakeEngine: pass -FakeEntryPoint = namedtuple('FakeEntryPoint', ['name', 'value']) +FakeEntryPoint = namedtuple("FakeEntryPoint", ["name", "value"]) def test_get_engine_class(): From cc36c6e1120d0aea183dda3e0b8f8ff95e0bb45b Mon Sep 17 00:00:00 2001 From: Franck Charras Date: Fri, 23 Sep 2022 18:13:27 +0200 Subject: [PATCH 17/43] fix MBKMeans and linting --- sklearn/_engine.py | 2 -- sklearn/cluster/_kmeans.py | 7 +++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/_engine.py b/sklearn/_engine.py index 67335de835efd..be25464ea8729 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -1,8 +1,6 @@ from importlib.metadata import entry_points from importlib import import_module -from contextlib import contextmanager from functools import lru_cache -from ssl import ALERT_DESCRIPTION_BAD_CERTIFICATE_HASH_VALUE import warnings from sklearn._config import get_config diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 2a915646b5fab..43fc23faf2215 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1923,6 +1923,13 @@ def __init__( def _check_params_vs_input(self, X): super()._check_params_vs_input(X, default_n_init=3) + if self.tol > 0: + if sp.issparse(X): + variances = mean_variance_axis(X, axis=0)[1] + else: + variances = np.var(X, axis=0) + 
self._tol = np.mean(variances) * self.tol + self._batch_size = min(self.batch_size, X.shape[0]) # init_size From f92d63fc2786775dc78113480f7db13018b9e48f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 23 Sep 2022 20:02:06 +0200 Subject: [PATCH 18/43] Draft changelog entry --- doc/whats_new/v1.2.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 3874e10c03b26..27ddcfbb5016b 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -2,6 +2,25 @@ .. currentmodule:: sklearn + +TODO: move to doc/whats_new/v1.3.rst once it exists in main: + +- |Enhancement| Experimential engine API to allow for external packages to + contribute alternative implementations for the core computational routines + of some selected scikit-learn estimators. + + Currently, the following estimators allow alternative implementations: + + - :class:`~sklearn.cluster.KMeans` (only for the LLoyd algorithm). + - TODO: add more here + + External engine providers include: + + - https://github.com/soda-inria/sklearn-numba-dpex that provided a KMeans + engine optimized for OpenCL enabled GPUs. + - TODO: add more here + + .. 
_changes_1_2: Version 1.2.0 From dbf607b1d98922edd19dff27f63ed890abd4395e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 23 Sep 2022 20:27:11 +0200 Subject: [PATCH 19/43] doc reorg --- doc/computing.rst | 1 + doc/computing/engine.rst | 29 +++++++++++++++++++++++++++++ doc/modules/engine.rst | 11 ----------- 3 files changed, 30 insertions(+), 11 deletions(-) create mode 100644 doc/computing/engine.rst delete mode 100644 doc/modules/engine.rst diff --git a/doc/computing.rst b/doc/computing.rst index 6732b754918b0..8b355f22ec641 100644 --- a/doc/computing.rst +++ b/doc/computing.rst @@ -14,3 +14,4 @@ Computing with scikit-learn computing/scaling_strategies computing/computational_performance computing/parallelism + computing/engine diff --git a/doc/computing/engine.rst b/doc/computing/engine.rst new file mode 100644 index 0000000000000..5be3d53dbf70e --- /dev/null +++ b/doc/computing/engine.rst @@ -0,0 +1,29 @@ +.. Places parent toc into the sidebar +:parenttoc: True + +.. _engine: + +================================== +Computation Engines (experimental) +================================== + +**This API is experimental**, which means that it is subject to change without +any backward compatibility guarantees. + +TODO: explain goals here + +Activating an engine +==================== + +TODO: installing third party engine provider packages + +TODO: how to list installed engines + +TODO: how to install a plugin + +Writing a new engine provider +============================= + +TODO: show engine API of a given estimator. + +TODO: give example setup.py with setuptools to define an entrypoint. diff --git a/doc/modules/engine.rst b/doc/modules/engine.rst deleted file mode 100644 index 401da467ab6a1..0000000000000 --- a/doc/modules/engine.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. Places parent toc into the sidebar -:parenttoc: True - -.. 
_engine: - - -================================== -Computation Engines (experimental) -================================== - - From abb278a2d9e95c4dc1641db9ef2d322399215d1e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 23 Sep 2022 21:07:20 +0200 Subject: [PATCH 20/43] fix changelog entry to add the pr number --- doc/whats_new/v1.2.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 27ddcfbb5016b..2ec86f1b78343 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -5,9 +5,9 @@ TODO: move to doc/whats_new/v1.3.rst once it exists in main: -- |Enhancement| Experimential engine API to allow for external packages to - contribute alternative implementations for the core computational routines - of some selected scikit-learn estimators. +- |Enhancement| Experimential engine API (no backward compatibility guarantees) + to allow for external packages to contribute alternative implementations for + the core computational routines of some selected scikit-learn estimators. Currently, the following estimators allow alternative implementations: @@ -20,6 +20,7 @@ TODO: move to doc/whats_new/v1.3.rst once it exists in main: engine optimized for OpenCL enabled GPUs. - TODO: add more here + :pr:`24497` by :user:`ogrisel`, :user:`fcharras`. .. 
_changes_1_2: From c1e85104074061e4596ac431bcfd260958fac1b3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 23 Sep 2022 21:11:47 +0200 Subject: [PATCH 21/43] fix test name --- sklearn/tests/test_engines.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index aa50867b068a8..ce342c810d792 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -22,7 +22,7 @@ class NestedFakeEngine: FakeEntryPoint = namedtuple("FakeEntryPoint", ["name", "value"]) -def test_get_engine_class(): +def test_parse_entry_point(): fake_entry_point = FakeEntryPoint( name="fake_engine", value="sklearn.tests.test_engines:FakeEngine", @@ -33,7 +33,7 @@ def test_get_engine_class(): assert spec.get_engine_class() is FakeEngine -def test_get_nested_engine_class(): +def test_parse_entry_point_for_nested_engine_class(): fake_entry_point = FakeEntryPoint( name="nested_fake_engine", value="sklearn.tests.test_engines:FakeEngineHolder.NestedFakeEngine", From 940828077d8b0305e2297589a5837029a3149b5e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Fri, 23 Sep 2022 21:20:15 +0200 Subject: [PATCH 22/43] attempt at sphinx the sphinx warning --- doc/computing/engine.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/computing/engine.rst b/doc/computing/engine.rst index 5be3d53dbf70e..b27a61fd3a25a 100644 --- a/doc/computing/engine.rst +++ b/doc/computing/engine.rst @@ -1,9 +1,9 @@ .. Places parent toc into the sidebar + :parenttoc: True .. _engine: -================================== Computation Engines (experimental) ================================== @@ -13,7 +13,7 @@ any backward compatibility guarantees. 
TODO: explain goals here Activating an engine -==================== +-------------------- TODO: installing third party engine provider packages @@ -22,7 +22,7 @@ TODO: how to list installed engines TODO: how to install a plugin Writing a new engine provider -============================= +----------------------------- TODO: show engine API of a given estimator. From 1fc4d794fd704b1f7706d29050440022a90092c8 Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Mon, 26 Sep 2022 09:38:13 +0200 Subject: [PATCH 23/43] fix MBKMeans test --- sklearn/cluster/_kmeans.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 43fc23faf2215..441cfc0c4d8d3 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1929,6 +1929,8 @@ def _check_params_vs_input(self, X): else: variances = np.var(X, axis=0) self._tol = np.mean(variances) * self.tol + else: + self._tol = 0.0 self._batch_size = min(self.batch_size, X.shape[0]) From acc4b4772d8b62a216903106395087bac4324b1d Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Mon, 26 Sep 2022 10:20:58 +0200 Subject: [PATCH 24/43] fix: skip entry points that do not match the requested provider names (rather than the opposite) --- sklearn/_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_engine.py b/sklearn/_engine.py index be25464ea8729..a718083351be9 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -42,7 +42,7 @@ def _parse_entry_points(provider_names=None): for entry_point in engine_entry_points: try: spec = _parse_entry_point(entry_point) - if provider_names is not None and spec.provider_name in provider_names: + if provider_names is not None and spec.provider_name not in provider_names: # Skip entry points that do not match the requested provider names. 
continue specs.append(spec) From d0d7f9586ba32da08b72eefcecf7d11f60bf53c9 Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Mon, 26 Sep 2022 10:26:14 +0200 Subject: [PATCH 25/43] update config_context unit test to account for new engine_provider keyword --- sklearn/tests/test_config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index 51a5a80ebf5b4..5b8d71d58cb4a 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -17,6 +17,7 @@ def test_config_context(): "array_api_dispatch": False, "pairwise_dist_chunk_size": 256, "enable_cython_pairwise_dist": True, + "engine_provider": (), } # Not using as a context manager affects nothing @@ -32,6 +33,7 @@ def test_config_context(): "array_api_dispatch": False, "pairwise_dist_chunk_size": 256, "enable_cython_pairwise_dist": True, + "engine_provider": (), } assert get_config()["assume_finite"] is False @@ -64,6 +66,7 @@ def test_config_context(): "array_api_dispatch": False, "pairwise_dist_chunk_size": 256, "enable_cython_pairwise_dist": True, + "engine_provider": (), } # No positional arguments From dcb3140c2758133ea6c69ee89bc1f21e322221fc Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Mon, 26 Sep 2022 10:52:55 +0200 Subject: [PATCH 26/43] for python < 3.10 returns an empty list when the sklearn engines entry point is missing (rather than erroring out) --- sklearn/_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_engine.py b/sklearn/_engine.py index a718083351be9..f3912c8c6ee8a 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -38,7 +38,7 @@ def _parse_entry_points(provider_names=None): if hasattr(all_entry_points, "select"): engine_entry_points = all_entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT) else: - engine_entry_points = all_entry_points[SKLEARN_ENGINES_ENTRY_POINT] + 
engine_entry_points = all_entry_points.get(SKLEARN_ENGINES_ENTRY_POINT, ()) for entry_point in engine_entry_points: try: spec = _parse_entry_point(entry_point) From 8df26c7c26cbe56a1ad6ca731e4a50394829c1fc Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 26 Sep 2022 11:44:43 +0200 Subject: [PATCH 27/43] Link to user guide from docstring --- sklearn/_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index 760355b120ecc..3d1937957d654 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -127,7 +127,7 @@ def set_config( packages to leverage specific hardware platforms using frameworks or libraries outside of the usual scikit-learn project dependencies. - TODO: add link to doc + See the :ref:`User Guide ` for more details. .. versionadded:: 1.3 @@ -239,7 +239,7 @@ def config_context( packages to leverage specific hardware platforms using frameworks or libraries outside of the usual scikit-learn project dependencies. - TODO: add link to doc + See the :ref:`User Guide ` for more details. .. 
versionadded:: 1.3 From db7b98c5626ea15d773e9aa07b7c678563fab8ba Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Mon, 26 Sep 2022 14:53:15 +0200 Subject: [PATCH 28/43] add a verbosity parameter to get_engine_class --- sklearn/_engine.py | 7 +++++-- sklearn/cluster/_kmeans.py | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/_engine.py b/sklearn/_engine.py index f3912c8c6ee8a..3bd95af1e6d40 100644 --- a/sklearn/_engine.py +++ b/sklearn/_engine.py @@ -90,7 +90,7 @@ def _get_engine_class(engine_name, provider_names, engine_specs, default=None): return default -def get_engine_class(engine_name, default=None): +def get_engine_class(engine_name, default=None, verbose=False): provider_names = get_config()["engine_provider"] if isinstance(provider_names, str): provider_names = (provider_names,) @@ -101,9 +101,12 @@ def get_engine_class(engine_name, default=None): if not provider_names: return default engine_specs = _parse_entry_points(provider_names=provider_names) - return _get_engine_class( + engine_class = _get_engine_class( engine_name=engine_name, provider_names=provider_names, engine_specs=engine_specs, default=default, ) + if verbose: + print(f"Using engine {engine_class.__module__}.{engine_class.__qualname__} .") + return engine_class diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 441cfc0c4d8d3..52377e5a68dca 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1507,7 +1507,9 @@ def fit(self, X, y=None, sample_weight=None): Fitted estimator. 
""" self._validate_params() - engine_class = get_engine_class("kmeans", default=KMeansCythonEngine) + engine_class = get_engine_class( + "kmeans", default=KMeansCythonEngine, verbose=self.verbose + ) engine = engine_class(self) X, y, sample_weight = engine.prepare_fit( X, From 4a379d258b8de09f041cf0a09dbd3b502050caf8 Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Wed, 28 Sep 2022 10:50:27 +0200 Subject: [PATCH 29/43] add pytest plugin that can be used by engine providers to run sklearn tests with customised engines --- setup.py | 1 + sklearn/_engine.py | 112 ------------------------------------ sklearn/_engine/__init__.py | 4 ++ sklearn/_engine/testing.py | 33 +++++++++++ sklearn/exceptions.py | 14 +++++ 5 files changed, 52 insertions(+), 112 deletions(-) delete mode 100644 sklearn/_engine.py create mode 100644 sklearn/_engine/__init__.py create mode 100644 sklearn/_engine/testing.py diff --git a/setup.py b/setup.py index 650050226dbbe..75a7865eb5ebf 100755 --- a/setup.py +++ b/setup.py @@ -340,6 +340,7 @@ def setup_package(): python_requires=python_requires, install_requires=min_deps.tag_to_packages["install"], package_data={"": ["*.pxd"]}, + entry_points={"pytest11": ["sklearn_plugin_testing = sklearn._engine.testing"]}, **extra_setuptools_args, ) diff --git a/sklearn/_engine.py b/sklearn/_engine.py deleted file mode 100644 index 3bd95af1e6d40..0000000000000 --- a/sklearn/_engine.py +++ /dev/null @@ -1,112 +0,0 @@ -from importlib.metadata import entry_points -from importlib import import_module -from functools import lru_cache -import warnings - -from sklearn._config import get_config - -SKLEARN_ENGINES_ENTRY_POINT = "sklearn_engines" - - -class EngineSpec: - - __slots__ = ["name", "provider_name", "module_name", "engine_qualname"] - - def __init__(self, name, provider_name, module_name, engine_qualname): - self.name = name - self.provider_name = provider_name - self.module_name = module_name - 
self.engine_qualname = engine_qualname - - def get_engine_class(self): - engine = import_module(self.module_name) - for attr in self.engine_qualname.split("."): - engine = getattr(engine, attr) - return engine - - -def _parse_entry_point(entry_point): - module_name, engine_qualname = entry_point.value.split(":") - provider_name = next(iter(module_name.split(".", 1))) - return EngineSpec(entry_point.name, provider_name, module_name, engine_qualname) - - -@lru_cache -def _parse_entry_points(provider_names=None): - specs = [] - all_entry_points = entry_points() - if hasattr(all_entry_points, "select"): - engine_entry_points = all_entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT) - else: - engine_entry_points = all_entry_points.get(SKLEARN_ENGINES_ENTRY_POINT, ()) - for entry_point in engine_entry_points: - try: - spec = _parse_entry_point(entry_point) - if provider_names is not None and spec.provider_name not in provider_names: - # Skip entry points that do not match the requested provider names. - continue - specs.append(spec) - except Exception as e: - # Do not raise an exception in case an invalid package has been - # installed in the same Python env as scikit-learn: just warn and - # skip. - warnings.warn( - f"Invalid {SKLEARN_ENGINES_ENTRY_POINT} entry point" - f" {entry_point.name} with value {entry_point.value}: {e}" - ) - if provider_names is not None: - observed_provider_names = {spec.provider_name for spec in specs} - missing_providers = set(provider_names) - observed_provider_names - if missing_providers: - raise RuntimeError( - "Could not find any provider for the" - f" {SKLEARN_ENGINES_ENTRY_POINT} entry point with name(s):" - f" {', '.join(repr(p) for p in sorted(missing_providers))}" - ) - return specs - - -def list_engine_provider_names(): - """Find the list of sklearn_engine provider names - - This function only inspects the metadata and should trigger any module import. 
- """ - return sorted({spec.provider_name for spec in _parse_entry_points()}) - - -def _get_engine_class(engine_name, provider_names, engine_specs, default=None): - specs_by_provider = {} - for spec in engine_specs: - if spec.name != engine_name: - continue - specs_by_provider.setdefault(spec.provider_name, spec) - - for provider_name in provider_names: - spec = specs_by_provider.get(provider_name) - if spec is not None: - # XXX: should we return an instance or the class itself? - return spec.get_engine_class() - - return default - - -def get_engine_class(engine_name, default=None, verbose=False): - provider_names = get_config()["engine_provider"] - if isinstance(provider_names, str): - provider_names = (provider_names,) - elif not isinstance(provider_names, tuple): - # Make sure the provider names are a tuple to make it possible for the - # lru cache to hash them. - provider_names = tuple(provider_names) - if not provider_names: - return default - engine_specs = _parse_entry_points(provider_names=provider_names) - engine_class = _get_engine_class( - engine_name=engine_name, - provider_names=provider_names, - engine_specs=engine_specs, - default=default, - ) - if verbose: - print(f"Using engine {engine_class.__module__}.{engine_class.__qualname__} .") - return engine_class diff --git a/sklearn/_engine/__init__.py b/sklearn/_engine/__init__.py new file mode 100644 index 0000000000000..0a9d5492e37b5 --- /dev/null +++ b/sklearn/_engine/__init__.py @@ -0,0 +1,4 @@ +from .base import get_engine_class, list_engine_provider_names + + +__all__ = [get_engine_class, list_engine_provider_names] diff --git a/sklearn/_engine/testing.py b/sklearn/_engine/testing.py new file mode 100644 index 0000000000000..b9d18e30fc628 --- /dev/null +++ b/sklearn/_engine/testing.py @@ -0,0 +1,33 @@ +from pytest import xfail + +from sklearn import config_context + +from sklearn.exceptions import FeatureNotCoveredByPluginError + + +# TODO: document this pytest plugin + write a tutorial on how to 
develop a new plugin +# and explain good practices regarding testing against sklearn test modules. +def pytest_addoption(parser): + group = parser.getgroup("Sklearn plugin testing") + group.addoption( + "--sklearn-engine-provider", + action="store", + nargs=1, + type=str, + help="Name of the an engine provider for sklearn to activate for all tests.", + ) + + +def pytest_runtest_call(item): + engine_provider = item.config.getoption("sklearn_engine_provider") + if engine_provider is None: + return item.runtest() + + with config_context(engine_provider=engine_provider): + try: + item.runtest() + except FeatureNotCoveredByPluginError: + xfail( + reason=f"This test cover features that are not supported by the " + f"engine provided by {engine_provider}." + ) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index d84c1f6b40526..c33a245614192 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -5,6 +5,7 @@ __all__ = [ "NotFittedError", + "FeatureNotCoveredByPluginError", "ConvergenceWarning", "DataConversionWarning", "DataDimensionalityWarning", @@ -38,6 +39,19 @@ class NotFittedError(ValueError, AttributeError): """ +class FeatureNotCoveredByPluginError(NotImplementedError): + """External plugins might not support all the combinations of parameters and + input types that the the vanilla sklearn implementation otherwise supports. In such + cases, plugins can raise this exception class. When running the sklearn test modules + using the sklearn pytest plugin, all the unit tests that fail by raising this + exception class will be automatically marked as "xfail", this enables sorting out + the tests that fail because they test features that are not supported by the plugin + and tests that fail because the plugin misbehave on supported features. + + .. 
versionadded:: 1.2 + """ + + class ConvergenceWarning(UserWarning): """Custom warning to capture convergence problems From 12fc503587084e104c053765405514acd2fdb3ac Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Wed, 28 Sep 2022 12:14:44 +0200 Subject: [PATCH 30/43] Add plugin methods for predict, transform and score methods for KMeans --- sklearn/cluster/_kmeans.py | 148 ++++++++++++++++++++++++++++++++++++- 1 file changed, 144 insertions(+), 4 deletions(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 52377e5a68dca..b1eeb24916422 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -257,6 +257,10 @@ def _kmeans_plusplus(X, n_clusters, x_squared_norms, random_state, n_local_trial # K-means batch estimation by EM (expectation maximization) +class _IgnoreParam: + pass + + class KMeansCythonEngine: """Cython-based implementation of the core k-means routines @@ -369,6 +373,33 @@ def kmeans_single(self, X, sample_weight, centers_init): verbose=self.estimator.verbose, ) + def prepare_prediction(self, X, sample_weight): + X = self.estimator._check_test_data(X) + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) + return X, sample_weight + + def get_labels(self, X, sample_weight): + labels, _ = _labels_inertia_threadpool_limit( + X, + sample_weight, + self.estimator.cluster_centers_, + n_threads=self.estimator._n_threads, + ) + + return labels + + def prepare_transform(self, X): + return self.estimator._check_test_data(X) + + def get_euclidean_distances(self, X): + return euclidean_distances(X, self.estimator.cluster_centers_) + + def get_score(self, X, sample_weight): + _, scores = _labels_inertia_threadpool_limit( + X, sample_weight, self.estimator.cluster_centers_, self.estimator._n_threads + ) + return scores + @validate_params( { @@ -1471,6 +1502,12 @@ def _check_params_vs_input(self, X): ) self._algorithm = "lloyd" + def _get_engine(self): + 
engine_class = get_engine_class( + "kmeans", default=KMeansCythonEngine, verbose=self.verbose + ) + return engine_class(self) + def _warn_mkl_vcomp(self, n_active_threads): """Warn when vcomp and mkl are both present""" warnings.warn( @@ -1507,10 +1544,8 @@ def fit(self, X, y=None, sample_weight=None): Fitted estimator. """ self._validate_params() - engine_class = get_engine_class( - "kmeans", default=KMeansCythonEngine, verbose=self.verbose - ) - engine = engine_class(self) + engine = self._get_engine() + X, y, sample_weight = engine.prepare_fit( X, y=y, @@ -1566,6 +1601,111 @@ def fit(self, X, y=None, sample_weight=None): self.n_iter_ = best_n_iter return self + def predict(self, X, sample_weight=None): + """Predict the closest cluster each sample in X belongs to. + + In the vector quantization literature, `cluster_centers_` is called + the code book and each value returned by `predict` is the index of + the closest code in the code book. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to predict. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + labels : ndarray of shape (n_samples,) + Index of the cluster each sample belongs to. + """ + check_is_fitted(self) + engine = self._get_engine() + X, sample_weight = engine.prepare_prediction(X, sample_weight) + return engine.get_labels(X, sample_weight) + + def fit_transform(self, X, y=None, sample_weight=None): + """Compute clustering and transform X to cluster-distance space. + + Equivalent to fit(X).transform(X), but more efficiently implemented. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + y : Ignored + Not used, present here for API consistency by convention. 
+ + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_clusters) + X transformed in the new space. + """ + self.fit(X, sample_weight=sample_weight) + engine = self._get_engine() + return self._transform(X, engine) + + def transform(self, X): + """Transform X to a cluster-distance space. + + In the new space, each dimension is the distance to the cluster + centers. Note that even if X is sparse, the array returned by + `transform` will typically be dense. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data to transform. + + Returns + ------- + X_new : ndarray of shape (n_samples, n_clusters) + X transformed in the new space. + """ + check_is_fitted(self) + engine = self._get_engine() + X = engine.prepare_transform(X) + return self._transform(X, engine) + + def _transform(self, X, engine): + """Guts of transform method; no input validation.""" + return engine.get_euclidean_distances(X) + + def score(self, X, y=None, sample_weight=None): + """Opposite of the value of X on the K-means objective. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + New data. + + y : Ignored + Not used, present here for API consistency by convention. + + sample_weight : array-like of shape (n_samples,), default=None + The weights for each observation in X. If None, all observations + are assigned equal weight. + + Returns + ------- + score : float + Opposite of the value of X on the K-means objective. 
+ """ + check_is_fitted(self) + engine = self._get_engine() + + X, sample_weight = engine.prepare_prediction(X, sample_weight) + + return -engine.get_score(X, sample_weight) + def _mini_batch_step( X, From da5fe854a97ad7d9641cc8daaac6f90df1f4f416 Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Wed, 28 Sep 2022 12:44:19 +0200 Subject: [PATCH 31/43] add _engine.base module --- sklearn/_engine/base.py | 112 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 sklearn/_engine/base.py diff --git a/sklearn/_engine/base.py b/sklearn/_engine/base.py new file mode 100644 index 0000000000000..3bd95af1e6d40 --- /dev/null +++ b/sklearn/_engine/base.py @@ -0,0 +1,112 @@ +from importlib.metadata import entry_points +from importlib import import_module +from functools import lru_cache +import warnings + +from sklearn._config import get_config + +SKLEARN_ENGINES_ENTRY_POINT = "sklearn_engines" + + +class EngineSpec: + + __slots__ = ["name", "provider_name", "module_name", "engine_qualname"] + + def __init__(self, name, provider_name, module_name, engine_qualname): + self.name = name + self.provider_name = provider_name + self.module_name = module_name + self.engine_qualname = engine_qualname + + def get_engine_class(self): + engine = import_module(self.module_name) + for attr in self.engine_qualname.split("."): + engine = getattr(engine, attr) + return engine + + +def _parse_entry_point(entry_point): + module_name, engine_qualname = entry_point.value.split(":") + provider_name = next(iter(module_name.split(".", 1))) + return EngineSpec(entry_point.name, provider_name, module_name, engine_qualname) + + +@lru_cache +def _parse_entry_points(provider_names=None): + specs = [] + all_entry_points = entry_points() + if hasattr(all_entry_points, "select"): + engine_entry_points = all_entry_points.select(group=SKLEARN_ENGINES_ENTRY_POINT) + else: + engine_entry_points =
all_entry_points.get(SKLEARN_ENGINES_ENTRY_POINT, ()) + for entry_point in engine_entry_points: + try: + spec = _parse_entry_point(entry_point) + if provider_names is not None and spec.provider_name not in provider_names: + # Skip entry points that do not match the requested provider names. + continue + specs.append(spec) + except Exception as e: + # Do not raise an exception in case an invalid package has been + # installed in the same Python env as scikit-learn: just warn and + # skip. + warnings.warn( + f"Invalid {SKLEARN_ENGINES_ENTRY_POINT} entry point" + f" {entry_point.name} with value {entry_point.value}: {e}" + ) + if provider_names is not None: + observed_provider_names = {spec.provider_name for spec in specs} + missing_providers = set(provider_names) - observed_provider_names + if missing_providers: + raise RuntimeError( + "Could not find any provider for the" + f" {SKLEARN_ENGINES_ENTRY_POINT} entry point with name(s):" + f" {', '.join(repr(p) for p in sorted(missing_providers))}" + ) + return specs + + +def list_engine_provider_names(): + """Find the list of sklearn_engine provider names + + This function only inspects the metadata and should trigger any module import. + """ + return sorted({spec.provider_name for spec in _parse_entry_points()}) + + +def _get_engine_class(engine_name, provider_names, engine_specs, default=None): + specs_by_provider = {} + for spec in engine_specs: + if spec.name != engine_name: + continue + specs_by_provider.setdefault(spec.provider_name, spec) + + for provider_name in provider_names: + spec = specs_by_provider.get(provider_name) + if spec is not None: + # XXX: should we return an instance or the class itself? 
+ return spec.get_engine_class() + + return default + + +def get_engine_class(engine_name, default=None, verbose=False): + provider_names = get_config()["engine_provider"] + if isinstance(provider_names, str): + provider_names = (provider_names,) + elif not isinstance(provider_names, tuple): + # Make sure the provider names are a tuple to make it possible for the + # lru cache to hash them. + provider_names = tuple(provider_names) + if not provider_names: + return default + engine_specs = _parse_entry_points(provider_names=provider_names) + engine_class = _get_engine_class( + engine_name=engine_name, + provider_names=provider_names, + engine_specs=engine_specs, + default=default, + ) + if verbose: + print(f"Using engine {engine_class.__module__}.{engine_class.__qualname__} .") + return engine_class From 53cb8f1196f9895825783f193e9b25c9e488b80b Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Wed, 28 Sep 2022 13:13:03 +0200 Subject: [PATCH 32/43] linting --- sklearn/_engine/testing.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/_engine/testing.py b/sklearn/_engine/testing.py index b9d18e30fc628..51acea6fa91b9 100644 --- a/sklearn/_engine/testing.py +++ b/sklearn/_engine/testing.py @@ -28,6 +28,8 @@ def pytest_runtest_call(item): item.runtest() except FeatureNotCoveredByPluginError: xfail( - reason=f"This test cover features that are not supported by the " - f"engine provided by {engine_provider}." + reason=( + f"This test cover features that are not supported by the " + f"engine provided by {engine_provider}." 
+ ) ) From d72050f628173a9c72b58391a6199ae649ce4402 Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Wed, 28 Sep 2022 16:43:17 +0200 Subject: [PATCH 33/43] fix a bug caused by the pytest plugin where sklearn tests would be executed twice --- sklearn/_engine/testing.py | 15 +++++++-------- sklearn/cluster/tests/test_bicluster.py | 1 - sklearn/tests/test_engines.py | 6 +++--- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/sklearn/_engine/testing.py b/sklearn/_engine/testing.py index 51acea6fa91b9..562a56380a9b1 100644 --- a/sklearn/_engine/testing.py +++ b/sklearn/_engine/testing.py @@ -1,4 +1,4 @@ -from pytest import xfail +from pytest import xfail, hookimpl from sklearn import config_context @@ -18,18 +18,17 @@ def pytest_addoption(parser): ) -def pytest_runtest_call(item): - engine_provider = item.config.getoption("sklearn_engine_provider") - if engine_provider is None: - return item.runtest() - +@hookimpl(hookwrapper=True) +def pytest_pyfunc_call(pyfuncitem): + engine_provider = pyfuncitem.config.getoption("sklearn_engine_provider") with config_context(engine_provider=engine_provider): try: - item.runtest() + outcome = yield + outcome.get_result() except FeatureNotCoveredByPluginError: xfail( reason=( - f"This test cover features that are not supported by the " + "This test cover features that are not supported by the " f"engine provided by {engine_provider}."
) ) diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index d04e9dba4fade..977c667840483 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -253,7 +253,6 @@ def test_spectralbiclustering_parameter_validation(params, type_err, err_msg): @pytest.mark.parametrize("est", (SpectralBiclustering(), SpectralCoclustering())) def test_n_features_in_(est): - X, _, _ = make_biclusters((3, 3), 3, random_state=0) assert not hasattr(est, "n_features_in_") diff --git a/sklearn/tests/test_engines.py b/sklearn/tests/test_engines.py index ce342c810d792..d09da2ec27b30 100644 --- a/sklearn/tests/test_engines.py +++ b/sklearn/tests/test_engines.py @@ -3,10 +3,10 @@ import pytest from sklearn._engine import list_engine_provider_names -from sklearn._engine import _parse_entry_point from sklearn._engine import get_engine_class -from sklearn._engine import _get_engine_class -from sklearn._engine import EngineSpec +from sklearn._engine.base import _parse_entry_point +from sklearn._engine.base import _get_engine_class +from sklearn._engine.base import EngineSpec from sklearn._config import config_context From a299bd8b467f52bdc98c6182cc2ede6b9a52a77f Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Wed, 28 Sep 2022 17:12:13 +0200 Subject: [PATCH 34/43] __all__fixup and renaming FeatureNotCoveredByPluginError -> NotSupportedByEngineError --- sklearn/_engine/__init__.py | 2 +- sklearn/_engine/testing.py | 8 ++++++-- sklearn/exceptions.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/_engine/__init__.py b/sklearn/_engine/__init__.py index 0a9d5492e37b5..26a03813032ce 100644 --- a/sklearn/_engine/__init__.py +++ b/sklearn/_engine/__init__.py @@ -1,4 +1,4 @@ from .base import get_engine_class, list_engine_provider_names -__all__ = [get_engine_class, list_engine_provider_names] +__all__ = ["get_engine_class", 
"list_engine_provider_names"] diff --git a/sklearn/_engine/testing.py b/sklearn/_engine/testing.py index 562a56380a9b1..8c553ee06d655 100644 --- a/sklearn/_engine/testing.py +++ b/sklearn/_engine/testing.py @@ -2,7 +2,7 @@ from sklearn import config_context -from sklearn.exceptions import FeatureNotCoveredByPluginError +from sklearn.exceptions import NotSupportedByEngineError # TODO: document this pytest plugin + write a tutorial on how to develop a new plugin @@ -21,11 +21,15 @@ def pytest_addoption(parser): @hookimpl(hookwrapper=True) def pytest_pyfunc_call(pyfuncitem): engine_provider = pyfuncitem.config.getoption("sklearn_engine_provider") + if engine_provider is None: + yield + return + with config_context(engine_provider=engine_provider): try: outcome = yield outcome.get_result() - except FeatureNotCoveredByPluginError: + except NotSupportedByEngineError: xfail( reason=( "This test cover features that are not supported by the " diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index c33a245614192..ff36b6fcce080 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -39,7 +39,7 @@ class NotFittedError(ValueError, AttributeError): """ -class FeatureNotCoveredByPluginError(NotImplementedError): +class NotSupportedByEngineError(NotImplementedError): """External plugins might not support all the combinations of parameters and input types that the the vanilla sklearn implementation otherwise supports. In such cases, plugins can raise this exception class. 
When running the sklearn test modules From 2d08a91f255b8a4df974add6794691fdec243652 Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Wed, 28 Sep 2022 17:18:03 +0200 Subject: [PATCH 35/43] exception __all__ fixup --- sklearn/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/exceptions.py b/sklearn/exceptions.py index ff36b6fcce080..bf368be65a274 100644 --- a/sklearn/exceptions.py +++ b/sklearn/exceptions.py @@ -5,7 +5,7 @@ __all__ = [ "NotFittedError", - "FeatureNotCoveredByPluginError", + "NotSupportedByEngineError", "ConvergenceWarning", "DataConversionWarning", "DataDimensionalityWarning", From 70e4ecfdc9dcc57622339d095028e764c484c674 Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Thu, 29 Sep 2022 12:44:27 +0200 Subject: [PATCH 36/43] register the _engine subpackage in setup.py --- sklearn/{ => _engine}/tests/test_engines.py | 0 sklearn/setup.py | 2 ++ 2 files changed, 2 insertions(+) rename sklearn/{ => _engine}/tests/test_engines.py (100%) diff --git a/sklearn/tests/test_engines.py b/sklearn/_engine/tests/test_engines.py similarity index 100% rename from sklearn/tests/test_engines.py rename to sklearn/_engine/tests/test_engines.py diff --git a/sklearn/setup.py b/sklearn/setup.py index 874bdbbcbed43..d2bb1f2a9dd26 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -26,6 +26,8 @@ def configuration(parent_package="", top_path=None): config.add_subpackage("covariance/tests") config.add_subpackage("cross_decomposition") config.add_subpackage("cross_decomposition/tests") + config.add_subpackage("_engine") + config.add_subpackage("_engine/tests") config.add_subpackage("feature_selection") config.add_subpackage("feature_selection/tests") config.add_subpackage("gaussian_process") From 54d1924441780fac4689bacd46f80fde4247aad4 Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Thu, 29 Sep 
2022 13:27:00 +0200 Subject: [PATCH 37/43] add __init__.py --- sklearn/_engine/tests/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 sklearn/_engine/tests/__init__.py diff --git a/sklearn/_engine/tests/__init__.py b/sklearn/_engine/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 1a805d70ab8fc4903fb8e93edc8b2512816395bc Mon Sep 17 00:00:00 2001 From: Franck Charras <29153872+fcharras@users.noreply.github.com> Date: Thu, 29 Sep 2022 14:25:58 +0200 Subject: [PATCH 38/43] fix test_engines test --- sklearn/_engine/tests/test_engines.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/_engine/tests/test_engines.py b/sklearn/_engine/tests/test_engines.py index d09da2ec27b30..dda18697d3d58 100644 --- a/sklearn/_engine/tests/test_engines.py +++ b/sklearn/_engine/tests/test_engines.py @@ -25,7 +25,7 @@ class NestedFakeEngine: def test_parse_entry_point(): fake_entry_point = FakeEntryPoint( name="fake_engine", - value="sklearn.tests.test_engines:FakeEngine", + value="sklearn._engine.tests.test_engines:FakeEngine", ) spec = _parse_entry_point(fake_entry_point) assert spec.name == "fake_engine" @@ -36,7 +36,7 @@ def test_parse_entry_point(): def test_parse_entry_point_for_nested_engine_class(): fake_entry_point = FakeEntryPoint( name="nested_fake_engine", - value="sklearn.tests.test_engines:FakeEngineHolder.NestedFakeEngine", + value="sklearn._engine.tests.test_engines:FakeEngineHolder.NestedFakeEngine", ) spec = _parse_entry_point(fake_entry_point) assert spec.name == "nested_fake_engine" @@ -83,11 +83,13 @@ def test_get_engine_class(): EngineSpec("kmeans", "provider1", "sklearn.provider1.module", "KMeansEngine"), EngineSpec("other", "provider1", "sklearn.provider1.module", "OtherEngine"), EngineSpec("kmeans", "provider2", "sklearn.provider2.module", "KMeansEngine"), - EngineSpec("kmeans", "provider3", "sklearn.tests.test_engines", "FakeEngine"), + EngineSpec( + "kmeans",
"provider3", "sklearn._engine.tests.test_engines", "FakeEngine" + ), EngineSpec( "kmeans", "provider4", - "sklearn.tests.test_engines", + "sklearn._engine.tests.test_engines", "FakeEngineHolder.NestedFakeEngine", ), ) From c1c578376dc2bffc6697fdb80e30df21ec8ddadb Mon Sep 17 00:00:00 2001 From: Tim Head Date: Thu, 3 Nov 2022 17:36:36 +0100 Subject: [PATCH 39/43] Switch to unified engine API, add multi engine support This renames the methods an engine has to/can implement along the pattern of `pre_X`, `X`, `post_X` (X=fit, transform, ..). Changes the engine loading mechanism to return all engines in a user defined order so that we can try each engine in turn to see which one supports the given input case. --- sklearn/_engine/__init__.py | 4 +- sklearn/_engine/base.py | 23 ++-- sklearn/_engine/tests/test_engines.py | 106 ++++++++------- sklearn/cluster/_kmeans.py | 183 +++++++++++++++----------- 4 files changed, 181 insertions(+), 135 deletions(-) diff --git a/sklearn/_engine/__init__.py b/sklearn/_engine/__init__.py index 26a03813032ce..649ad1ba062d3 100644 --- a/sklearn/_engine/__init__.py +++ b/sklearn/_engine/__init__.py @@ -1,4 +1,4 @@ -from .base import get_engine_class, list_engine_provider_names +from .base import get_engine_classes, list_engine_provider_names -__all__ = ["get_engine_class", "list_engine_provider_names"] +__all__ = ["get_engine_classes", "list_engine_provider_names"] diff --git a/sklearn/_engine/base.py b/sklearn/_engine/base.py index 3bd95af1e6d40..af4c968dc851a 100644 --- a/sklearn/_engine/base.py +++ b/sklearn/_engine/base.py @@ -74,7 +74,7 @@ def list_engine_provider_names(): return sorted({spec.provider_name for spec in _parse_entry_points()}) -def _get_engine_class(engine_name, provider_names, engine_specs, default=None): +def _get_engine_classes(engine_name, provider_names, engine_specs, default): specs_by_provider = {} for spec in engine_specs: if spec.name != engine_name: @@ -85,12 +85,12 @@ def _get_engine_class(engine_name,
provider_names, engine_specs, default=None): spec = specs_by_provider.get(provider_name) if spec is not None: # XXX: should we return an instance or the class itself? - return spec.get_engine_class() + yield spec.get_engine_class() - return default + yield default -def get_engine_class(engine_name, default=None, verbose=False): +def get_engine_classes(engine_name, default, verbose=False): provider_names = get_config()["engine_provider"] if isinstance(provider_names, str): provider_names = (provider_names,) @@ -99,14 +99,17 @@ def get_engine_class(engine_name, default=None, verbose=False): # lru cache to hash them. provider_names = tuple(provider_names) if not provider_names: - return default + yield default + return engine_specs = _parse_entry_points(provider_names=provider_names) - engine_class = _get_engine_class( + for engine_class in _get_engine_classes( engine_name=engine_name, provider_names=provider_names, engine_specs=engine_specs, default=default, - ) - if verbose: - print(f"Using engine {engine_class.__module__}.{engine_class.__qualname__} .") - return engine_class + ): + if verbose: + print( + f"trying engine {engine_class.__module__}.{engine_class.__qualname__} ." 
+ ) + yield engine_class diff --git a/sklearn/_engine/tests/test_engines.py b/sklearn/_engine/tests/test_engines.py index dda18697d3d58..f1a0262d368c0 100644 --- a/sklearn/_engine/tests/test_engines.py +++ b/sklearn/_engine/tests/test_engines.py @@ -3,13 +3,17 @@ import pytest from sklearn._engine import list_engine_provider_names -from sklearn._engine import get_engine_class +from sklearn._engine import get_engine_classes from sklearn._engine.base import _parse_entry_point -from sklearn._engine.base import _get_engine_class +from sklearn._engine.base import _get_engine_classes from sklearn._engine.base import EngineSpec from sklearn._config import config_context +class FakeDefaultEngine: + pass + + class FakeEngine: pass @@ -54,35 +58,14 @@ def test_get_engine_class_with_default(): # Use config_context with an empty provider tuple to make sure that not provider # are available for test_missing_engine_name with config_context(engine_provider=()): - engine_class = get_engine_class("test_missing_engine_name", default=FakeEngine) - assert engine_class is FakeEngine - - -def test_get_engine_class_for_invalid_provider(): - expected_message = re.escape( - "Could not find any provider for the sklearn_engines entry point with" - " name(s): 'invalid_provider_name'" - ) - with pytest.raises(RuntimeError, match=expected_message): - with config_context(engine_provider="invalid_provider_name"): - get_engine_class("kmeans") - - expected_message = re.escape( - "Could not find any provider for the sklearn_engines entry point with" - " name(s): 'invalid_provider_name_1', 'invalid_provider_name_2'" - ) - with pytest.raises(RuntimeError, match=expected_message): - with config_context( - engine_provider=("invalid_provider_name_1", "invalid_provider_name_2") - ): - get_engine_class("kmeans") + engine_classes = list( + get_engine_classes("test_missing_engine_name", default=FakeEngine) + ) + assert engine_classes == [FakeEngine] def test_get_engine_class(): engine_specs = ( - 
EngineSpec("kmeans", "provider1", "sklearn.provider1.module", "KMeansEngine"), - EngineSpec("other", "provider1", "sklearn.provider1.module", "OtherEngine"), - EngineSpec("kmeans", "provider2", "sklearn.provider2.module", "KMeansEngine"), EngineSpec( "kmeans", "provider3", "sklearn._engine.tests.test_engines", "FakeEngine" ), @@ -94,31 +77,62 @@ def test_get_engine_class(): ), ) - engine_class = _get_engine_class( - engine_name="missing", - provider_names=("provider1", "provider3"), - engine_specs=engine_specs, + engine_class = list( + _get_engine_classes( + engine_name="missing", + provider_names=("provider1", "provider3"), + engine_specs=engine_specs, + default=FakeDefaultEngine, + ) ) - assert engine_class is None + assert engine_class == [FakeDefaultEngine] - engine_class = _get_engine_class( - engine_name="kmeans", - provider_names=("provider3", "provider4", "provider1", "provider2"), - engine_specs=engine_specs, + engine_class = list( + _get_engine_classes( + engine_name="kmeans", + provider_names=("provider3", "provider4"), + engine_specs=engine_specs, + default=FakeDefaultEngine, + ) + ) + assert engine_class == [ + FakeEngine, + FakeEngineHolder.NestedFakeEngine, + FakeDefaultEngine, + ] + + engine_class = list( + _get_engine_classes( + engine_name="kmeans", + provider_names=("provider4", "provider3"), + engine_specs=engine_specs, + default=FakeDefaultEngine, + ) ) - assert engine_class == FakeEngine + assert engine_class == [ + FakeEngineHolder.NestedFakeEngine, + FakeEngine, + FakeDefaultEngine, + ] - engine_class = _get_engine_class( + engine_specs = engine_specs + ( + EngineSpec( + "kmeans", + "provider1", + "sklearn.provider1.somewhere", + "OtherEngine", + ), + ) + + # Invalid imports are delayed until they are actually needed. 
+ engine_classes = _get_engine_classes( engine_name="kmeans", - provider_names=("provider4", "provider3", "provider1", "provider2"), + provider_names=("provider4", "provider3", "provider1"), engine_specs=engine_specs, + default=FakeDefaultEngine, ) - assert engine_class == FakeEngineHolder.NestedFakeEngine + next(engine_classes) + next(engine_classes) with pytest.raises(ImportError, match=re.escape("sklearn.provider1")): - # Invalid imports are delayed until they are actually needed. - _get_engine_class( - engine_name="kmeans", - provider_names=("provider1", "provider3"), - engine_specs=engine_specs, - ) + next(engine_classes) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index cb5d2a45eddce..e65b0ec5e9b02 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -54,7 +54,7 @@ from ._k_means_elkan import init_bounds_sparse from ._k_means_elkan import elkan_iter_chunked_dense from ._k_means_elkan import elkan_iter_chunked_sparse -from .._engine import get_engine_class +from .._engine import get_engine_classes ############################################################################### @@ -274,7 +274,11 @@ class KMeansCythonEngine: def __init__(self, estimator): self.estimator = estimator - def prepare_fit(self, X, y=None, sample_weight=None): + def accepts(self, X, y=None, sample_weight=None): + # The default engine accepts everything + return True + + def pre_fit(self, X, y=None, sample_weight=None): estimator = self.estimator X = estimator._validate_data( X, @@ -362,23 +366,65 @@ def unshift_centers(self, X, best_centers): def is_same_clustering(self, labels, best_labels, n_clusters): return _is_same_clustering(labels, best_labels, n_clusters) - def kmeans_single(self, X, sample_weight, centers_init): - return self.kmeans_single_func( - X, - sample_weight, - centers_init, - max_iter=self.estimator.max_iter, - tol=self.tol, - n_threads=self._n_threads, - verbose=self.estimator.verbose, - ) + def fit(self, X, y=None, 
sample_weight=None): + centers_init = self.init_centroids(X) + if self.estimator.verbose: + print("Initialization complete") + + best_inertia, best_labels = None, None - def prepare_prediction(self, X, sample_weight): + for i in range(self.estimator._n_init): + labels, inertia, centers, n_iter_ = self.kmeans_single_func( + X, + sample_weight, + centers_init, + max_iter=self.estimator.max_iter, + tol=self.tol, + n_threads=self._n_threads, + verbose=self.estimator.verbose, + ) + + # determine if these results are the best so far + # we chose a new run if it has a better inertia and the clustering is + # different from the best so far (it's possible that the inertia is + # slightly better even if the clustering is the same with potentially + # permuted labels, due to rounding errors) + if best_inertia is None or ( + inertia < best_inertia + and not self.is_same_clustering(labels, best_labels, self.n_clusters) + ): + self.best_labels = labels + self.best_centers = centers + self.best_inertia = inertia + self.best_n_iter = n_iter_ + + # return best_labels, best_inertia, best_centers, best_n_iter + + def post_fit(self, X, y=None, sample_weight=None): + self.unshift_centers(X, self.best_centers) + + distinct_clusters = len(set(self.best_labels)) + if distinct_clusters < self.estimator.n_clusters: + warnings.warn( + "Number of distinct clusters ({}) found smaller than " + "n_clusters ({}). 
Possibly due to duplicate points " + "in X.".format(distinct_clusters, self.estimator.n_clusters), + ConvergenceWarning, + stacklevel=2, + ) + + self.estimator.cluster_centers_ = self.best_centers + self.estimator._n_features_out = self.best_centers.shape[0] + self.estimator.labels_ = self.best_labels + self.estimator.inertia_ = self.best_inertia + self.estimator.n_iter_ = self.best_n_iter + + def pre_predict(self, X, sample_weight): X = self.estimator._check_test_data(X) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) return X, sample_weight - def get_labels(self, X, sample_weight): + def predict(self, X, sample_weight=None): labels, _ = _labels_inertia_threadpool_limit( X, sample_weight, @@ -388,13 +434,13 @@ def get_labels(self, X, sample_weight): return labels - def prepare_transform(self, X): + def pre_transform(self, X): return self.estimator._check_test_data(X) - def get_euclidean_distances(self, X): + def transform(self, X): return euclidean_distances(X, self.estimator.cluster_centers_) - def get_score(self, X, sample_weight): + def score(self, X, sample_weight): _, scores = _labels_inertia_threadpool_limit( X, sample_weight, self.estimator.cluster_centers_, self.estimator._n_threads ) @@ -1504,11 +1550,13 @@ def _check_params_vs_input(self, X): ) self._algorithm = "lloyd" - def _get_engine(self): - engine_class = get_engine_class( + def _get_engine(self, X, y=None, sample_weight=None): + for engine_class in get_engine_classes( "kmeans", default=KMeansCythonEngine, verbose=self.verbose - ) - return engine_class(self) + ): + engine = engine_class(self) + if engine.accepts(X, y=y, sample_weight=sample_weight): + return engine def _warn_mkl_vcomp(self, n_active_threads): """Warn when vcomp and mkl are both present""" @@ -1546,61 +1594,30 @@ def fit(self, X, y=None, sample_weight=None): Fitted estimator. 
""" self._validate_params() - engine = self._get_engine() - - X, y, sample_weight = engine.prepare_fit( - X, - y=y, - sample_weight=sample_weight, - ) self._check_params_vs_input(X) - best_inertia, best_labels = None, None - - for i in range(self._n_init): - # Initialize centers - centers_init = engine.init_centroids(X) - if self.verbose: - print("Initialization complete") + engine = self._get_engine(X, y, sample_weight) - # run a k-means once - labels, inertia, centers, n_iter_ = engine.kmeans_single( + if hasattr(engine, "pre_fit"): + X, y, sample_weight = engine.pre_fit( X, - sample_weight, - centers_init, + y=y, + sample_weight=sample_weight, ) - # determine if these results are the best so far - # we chose a new run if it has a better inertia and the clustering is - # different from the best so far (it's possible that the inertia is - # slightly better even if the clustering is the same with potentially - # permuted labels, due to rounding errors) - if best_inertia is None or ( - inertia < best_inertia - and not engine.is_same_clustering(labels, best_labels, self.n_clusters) - ): - best_labels = labels - best_centers = centers - best_inertia = inertia - best_n_iter = n_iter_ - - engine.unshift_centers(X, best_centers) + engine.fit( + X, + y=y, + sample_weight=sample_weight, + ) - distinct_clusters = len(set(best_labels)) - if distinct_clusters < self.n_clusters: - warnings.warn( - "Number of distinct clusters ({}) found smaller than " - "n_clusters ({}). 
Possibly due to duplicate points " - "in X.".format(distinct_clusters, self.n_clusters), - ConvergenceWarning, - stacklevel=2, + if hasattr(engine, "post_fit"): + engine.post_fit( + X, + y=y, + sample_weight=sample_weight, ) - self.cluster_centers_ = best_centers - self._n_features_out = self.cluster_centers_.shape[0] - self.labels_ = best_labels - self.inertia_ = best_inertia - self.n_iter_ = best_n_iter return self def predict(self, X, sample_weight=None): @@ -1625,9 +1642,16 @@ def predict(self, X, sample_weight=None): Index of the cluster each sample belongs to. """ check_is_fitted(self) - engine = self._get_engine() - X, sample_weight = engine.prepare_prediction(X, sample_weight) - return engine.get_labels(X, sample_weight) + engine = self._get_engine(X) + if hasattr(engine, "pre_predict"): + X, sample_weight = engine.pre_predict(X, sample_weight) + + y_pred = engine.predict(X, sample_weight) + + if hasattr(engine, "post_predict"): + engine.post_predict(X, sample_weight) + + return y_pred def fit_transform(self, X, y=None, sample_weight=None): """Compute clustering and transform X to cluster-distance space. @@ -1651,8 +1675,9 @@ def fit_transform(self, X, y=None, sample_weight=None): X_new : ndarray of shape (n_samples, n_clusters) X transformed in the new space. """ + # XXX pre_transform() is not called because fit() calls pre_fit() self.fit(X, sample_weight=sample_weight) - engine = self._get_engine() + engine = self._get_engine(X) return self._transform(X, engine) def transform(self, X): @@ -1673,13 +1698,17 @@ def transform(self, X): X transformed in the new space. 
""" check_is_fitted(self) - engine = self._get_engine() - X = engine.prepare_transform(X) + engine = self._get_engine(X) + if hasattr(engine, "pre_transform"): + X = engine.pre_transform(X) return self._transform(X, engine) def _transform(self, X, engine): """Guts of transform method; no input validation.""" - return engine.get_euclidean_distances(X) + X_ = engine.transform(X) + if hasattr(engine, "post_transform"): + engine.post_transform(X_) + return X_ def score(self, X, y=None, sample_weight=None): """Opposite of the value of X on the K-means objective. @@ -1702,11 +1731,11 @@ def score(self, X, y=None, sample_weight=None): Opposite of the value of X on the K-means objective. """ check_is_fitted(self) - engine = self._get_engine() + engine = self._get_engine(X) - X, sample_weight = engine.prepare_prediction(X, sample_weight) + X, sample_weight = engine.pre_predict(X, sample_weight) - return -engine.get_score(X, sample_weight) + return -engine.score(X, sample_weight) def _mini_batch_step( From fea84150a88ef196f5089778d60700f6cbda8ce3 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Mon, 7 Nov 2022 10:53:31 +0100 Subject: [PATCH 40/43] WIP Switch to using an engine for KNeighbors --- sklearn/neighbors/_classification.py | 115 +++++++++++++++++++-------- 1 file changed, 82 insertions(+), 33 deletions(-) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index b849d28e131a5..58d6d1cebee11 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -19,6 +19,56 @@ from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import ClassifierMixin from ..utils._param_validation import StrOptions +from .._engine import get_engine_classes + + +class KNeighborsClassifierCythonEngine: + def __init__(self, estimator): + self.estimator = estimator + + def accepts(self, X, y=None, sample_weight=None): + # The default engine accepts everything + return True + + def pre_fit(self, X, 
y=None, sample_weight=None): + return X, y, sample_weight + + def fit(self, X, y=None, sample_weight=None): + return self.estimator._fit(X, y) + + def predict(self, X, sample_weight=None): + if self.estimator.weights == "uniform": + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. + neigh_ind = self.estimator.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + + classes_ = self.classes_ + _y = self._y + if not self.outputs_2d_: + _y = self._y.reshape((-1, 1)) + classes_ = [self.classes_] + + n_outputs = len(classes_) + n_queries = _num_samples(X) + weights = _get_weights(neigh_dist, self.weights) + + y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) + for k, classes_k in enumerate(classes_): + if weights is None: + mode, _ = _mode(_y[neigh_ind, k], axis=1) + else: + mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1) + + mode = np.asarray(mode.ravel(), dtype=np.intp) + y_pred[:, k] = classes_k.take(mode) + + if not self.outputs_2d_: + y_pred = y_pred.ravel() + + return y_pred class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): @@ -212,7 +262,29 @@ def fit(self, X, y): """ self._validate_params() - return self._fit(X, y) + engine = self._get_engine(X, y, sample_weight) + + if hasattr(engine, "pre_fit"): + X, y, sample_weight = engine.pre_fit( + X, + y=y, + sample_weight=sample_weight, + ) + + engine.fit( + X, + y=y, + sample_weight=sample_weight, + ) + + if hasattr(engine, "post_fit"): + engine.post_fit( + X, + y=y, + sample_weight=sample_weight, + ) + + return self def predict(self, X): """Predict the class labels for the provided data. @@ -228,38 +300,7 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. 
""" - if self.weights == "uniform": - # In that case, we do not need the distances to perform - # the weighting so we do not compute them. - neigh_ind = self.kneighbors(X, return_distance=False) - neigh_dist = None - else: - neigh_dist, neigh_ind = self.kneighbors(X) - - classes_ = self.classes_ - _y = self._y - if not self.outputs_2d_: - _y = self._y.reshape((-1, 1)) - classes_ = [self.classes_] - - n_outputs = len(classes_) - n_queries = _num_samples(X) - weights = _get_weights(neigh_dist, self.weights) - - y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) - for k, classes_k in enumerate(classes_): - if weights is None: - mode, _ = _mode(_y[neigh_ind, k], axis=1) - else: - mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1) - - mode = np.asarray(mode.ravel(), dtype=np.intp) - y_pred[:, k] = classes_k.take(mode) - - if not self.outputs_2d_: - y_pred = y_pred.ravel() - - return y_pred + pass def predict_proba(self, X): """Return probability estimates for the test data X. @@ -322,6 +363,14 @@ def predict_proba(self, X): def _more_tags(self): return {"multilabel": True} + def _get_engine(self, X, y=None, sample_weight=None): + for engine_class in get_engine_classes( + "kneigborsclassifier", default=KNeighborsClassifierCythonEngine, verbose=self.verbose + ): + engine = engine_class(self) + if engine.accepts(X, y=y, sample_weight=sample_weight): + return engine + class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase): """Classifier implementing a vote among neighbors within a given radius. 
From 08f4e3683e67b6666f34acbb3f10fcc937d0d18d Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 8 Nov 2022 15:01:42 +0100 Subject: [PATCH 41/43] Transition KNearestNeighbors to use an engine --- sklearn/neighbors/_base.py | 5 + sklearn/neighbors/_classification.py | 204 ++++++++++++++++++++------- 2 files changed, 157 insertions(+), 52 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 3a0a702be3792..dc92711769771 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -758,6 +758,11 @@ class from an array representing our data set and ask who's array([[1], [2]]...) """ + return self._kneighbors( + X=X, n_neighbors=n_neighbors, return_distance=return_distance + ) + + def _kneighbors(self, X=None, n_neighbors=None, return_distance=True): check_is_fitted(self) if n_neighbors is None: diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 58d6d1cebee11..9afb4649d3cb9 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -26,34 +26,31 @@ class KNeighborsClassifierCythonEngine: def __init__(self, estimator): self.estimator = estimator - def accepts(self, X, y=None, sample_weight=None): + def accepts(self, X, y=None): # The default engine accepts everything return True - def pre_fit(self, X, y=None, sample_weight=None): - return X, y, sample_weight - - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None): return self.estimator._fit(X, y) - def predict(self, X, sample_weight=None): + def predict(self, X): if self.estimator.weights == "uniform": # In that case, we do not need the distances to perform # the weighting so we do not compute them. 
neigh_ind = self.estimator.kneighbors(X, return_distance=False) neigh_dist = None else: - neigh_dist, neigh_ind = self.kneighbors(X) + neigh_dist, neigh_ind = self.estimator.kneighbors(X) - classes_ = self.classes_ - _y = self._y - if not self.outputs_2d_: - _y = self._y.reshape((-1, 1)) - classes_ = [self.classes_] + classes_ = self.estimator.classes_ + _y = self.estimator._y + if not self.estimator.outputs_2d_: + _y = self.estimator._y.reshape((-1, 1)) + classes_ = [self.estimator.classes_] n_outputs = len(classes_) n_queries = _num_samples(X) - weights = _get_weights(neigh_dist, self.weights) + weights = _get_weights(neigh_dist, self.estimator.weights) y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype) for k, classes_k in enumerate(classes_): @@ -65,11 +62,59 @@ def predict(self, X, sample_weight=None): mode = np.asarray(mode.ravel(), dtype=np.intp) y_pred[:, k] = classes_k.take(mode) - if not self.outputs_2d_: + if not self.estimator.outputs_2d_: y_pred = y_pred.ravel() return y_pred + def predict_proba(self, X): + if self.estimator.weights == "uniform": + # In that case, we do not need the distances to perform + # the weighting so we do not compute them. 
+ neigh_ind = self.estimator.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.estimator.kneighbors(X) + + classes_ = self.estimator.classes_ + _y = self.estimator._y + if not self.estimator.outputs_2d_: + _y = self.estimator._y.reshape((-1, 1)) + classes_ = [self.estimator.classes_] + + n_queries = _num_samples(X) + + weights = _get_weights(neigh_dist, self.estimator.weights) + if weights is None: + weights = np.ones_like(neigh_ind) + + all_rows = np.arange(n_queries) + probabilities = [] + for k, classes_k in enumerate(classes_): + pred_labels = _y[:, k][neigh_ind] + proba_k = np.zeros((n_queries, classes_k.size)) + + # a simple ':' index doesn't work right + for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) + proba_k[all_rows, idx] += weights[:, i] + + # normalize 'votes' into real [0,1] probabilities + normalizer = proba_k.sum(axis=1)[:, np.newaxis] + normalizer[normalizer == 0.0] = 1.0 + proba_k /= normalizer + + probabilities.append(proba_k) + + if not self.estimator.outputs_2d_: + probabilities = probabilities[0] + + return probabilities + + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + return self.estimator._kneighbors( + X=X, n_neighbors=n_neighbors, return_distance=return_distance + ) + class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): """Classifier implementing the k-nearest neighbors vote. @@ -262,26 +307,23 @@ def fit(self, X, y): """ self._validate_params() - engine = self._get_engine(X, y, sample_weight) + engine = self._get_engine(X, y) if hasattr(engine, "pre_fit"): X, y, sample_weight = engine.pre_fit( X, y=y, - sample_weight=sample_weight, ) engine.fit( X, y=y, - sample_weight=sample_weight, ) if hasattr(engine, "post_fit"): engine.post_fit( X, y=y, - sample_weight=sample_weight, ) return self @@ -300,7 +342,21 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. 
""" - pass + engine = self._get_engine(X) + + if hasattr(engine, "pre_predict"): + X = engine.pre_predict( + X, + ) + + y_pred = engine.predict(X) + + if hasattr(engine, "post_predict"): + engine.post_predict( + X, + ) + + return y_pred def predict_proba(self, X): """Return probability estimates for the test data X. @@ -318,45 +374,89 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ - if self.weights == "uniform": - # In that case, we do not need the distances to perform - # the weighting so we do not compute them. - neigh_ind = self.kneighbors(X, return_distance=False) - neigh_dist = None - else: - neigh_dist, neigh_ind = self.kneighbors(X) + engine = self._get_engine(X) - classes_ = self.classes_ - _y = self._y - if not self.outputs_2d_: - _y = self._y.reshape((-1, 1)) - classes_ = [self.classes_] + if hasattr(engine, "pre_predict_proba"): + X = engine.pre_predict_proba( + X, + ) - n_queries = _num_samples(X) + probabilities = engine.predict_proba(X) - weights = _get_weights(neigh_dist, self.weights) - if weights is None: - weights = np.ones_like(neigh_ind) + if hasattr(engine, "post_predict_proba"): + engine.post_predict_proba( + X, + ) - all_rows = np.arange(n_queries) - probabilities = [] - for k, classes_k in enumerate(classes_): - pred_labels = _y[:, k][neigh_ind] - proba_k = np.zeros((n_queries, classes_k.size)) + return probabilities - # a simple ':' index doesn't work right - for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors) - proba_k[all_rows, idx] += weights[:, i] + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + """Find the K-neighbors of a point. - # normalize 'votes' into real [0,1] probabilities - normalizer = proba_k.sum(axis=1)[:, np.newaxis] - normalizer[normalizer == 0.0] = 1.0 - proba_k /= normalizer + Returns indices of and distances to the neighbors of each point. 
- probabilities.append(proba_k) + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_queries, n_features), \ + or (n_queries, n_indexed) if metric == 'precomputed', default=None + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. - if not self.outputs_2d_: - probabilities = probabilities[0] + n_neighbors : int, default=None + Number of neighbors required for each sample. The default is the + value passed to the constructor. + + return_distance : bool, default=True + Whether or not to return the distances. + + Returns + ------- + neigh_dist : ndarray of shape (n_queries, n_neighbors) + Array representing the lengths to points, only present if + return_distance=True. + + neigh_ind : ndarray of shape (n_queries, n_neighbors) + Indices of the nearest points in the population matrix. + + Examples + -------- + In the following example, we construct a NearestNeighbors + class from an array representing our data set and ask who's + the closest point to [1,1,1] + + >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=1) + >>> neigh.fit(samples) + NearestNeighbors(n_neighbors=1) + >>> print(neigh.kneighbors([[1., 1., 1.]])) + (array([[0.5]]), array([[2]])) + + As you can see, it returns [[0.5]], and [[2]], which means that the + element is at distance 0.5 and is the third element of samples + (indexes start at 0). You can also query for multiple points: + + >>> X = [[0., 1., 0.], [1., 0., 1.]] + >>> neigh.kneighbors(X, return_distance=False) + array([[1], + [2]]...) 
+ """ + engine = self._get_engine(X) + + if hasattr(engine, "pre_kneighbors"): + X = engine.pre_kneighbors( + X=X, n_neighbors=n_neighbors, return_distance=return_distance + ) + + probabilities = engine.kneighbors( + X=X, n_neighbors=n_neighbors, return_distance=return_distance + ) + + if hasattr(engine, "post_kneighbors"): + engine.post_kneighbors( + X=X, n_neighbors=n_neighbors, return_distance=return_distance + ) return probabilities @@ -365,10 +465,10 @@ def _more_tags(self): def _get_engine(self, X, y=None, sample_weight=None): for engine_class in get_engine_classes( - "kneigborsclassifier", default=KNeighborsClassifierCythonEngine, verbose=self.verbose + "kneigborsclassifier", default=KNeighborsClassifierCythonEngine ): engine = engine_class(self) - if engine.accepts(X, y=y, sample_weight=sample_weight): + if engine.accepts(X, y=y): return engine From fdba2170a1b1143e1c0f28a725dc0969189f334f Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 15 Nov 2022 10:04:35 +0100 Subject: [PATCH 42/43] Store selected engine name This allows us to raise an exception when an estimator is used with a different engine than the one used to fit it. --- sklearn/_engine/base.py | 12 ++++++------ sklearn/_engine/tests/test_engines.py | 16 ++++++++-------- sklearn/cluster/_kmeans.py | 24 ++++++++++++++++++------ sklearn/neighbors/_classification.py | 19 ++++++++++++++++--- 4 files changed, 48 insertions(+), 23 deletions(-) diff --git a/sklearn/_engine/base.py b/sklearn/_engine/base.py index af4c968dc851a..8e84432784075 100644 --- a/sklearn/_engine/base.py +++ b/sklearn/_engine/base.py @@ -3,7 +3,7 @@ from functools import lru_cache import warnings -from sklearn._config import get_config +from .._config import get_config SKLEARN_ENGINES_ENTRY_POINT = "sklearn_engines" @@ -85,9 +85,9 @@ def _get_engine_classes(engine_name, provider_names, engine_specs, default): spec = specs_by_provider.get(provider_name) if spec is not None: # XXX: should we return an instance or the class itself? 
- yield spec.get_engine_class() + yield spec.provider_name, spec.get_engine_class() - yield default + yield "default", default def get_engine_classes(engine_name, default, verbose=False): @@ -99,10 +99,10 @@ def get_engine_classes(engine_name, default, verbose=False): # lru cache to hash them. provider_names = tuple(provider_names) if not provider_names: - yield default + yield "default", default return engine_specs = _parse_entry_points(provider_names=provider_names) - for engine_class in _get_engine_classes( + for provider, engine_class in _get_engine_classes( engine_name=engine_name, provider_names=provider_names, engine_specs=engine_specs, @@ -112,4 +112,4 @@ def get_engine_classes(engine_name, default, verbose=False): print( f"trying engine {engine_class.__module__}.{engine_class.__qualname__} ." ) - yield engine_class + yield provider, engine_class diff --git a/sklearn/_engine/tests/test_engines.py b/sklearn/_engine/tests/test_engines.py index f1a0262d368c0..93c3962a753a5 100644 --- a/sklearn/_engine/tests/test_engines.py +++ b/sklearn/_engine/tests/test_engines.py @@ -61,7 +61,7 @@ def test_get_engine_class_with_default(): engine_classes = list( get_engine_classes("test_missing_engine_name", default=FakeEngine) ) - assert engine_classes == [FakeEngine] + assert engine_classes == [("default", FakeEngine)] def test_get_engine_class(): @@ -85,7 +85,7 @@ def test_get_engine_class(): default=FakeDefaultEngine, ) ) - assert engine_class == [FakeDefaultEngine] + assert engine_class == [("default", FakeDefaultEngine)] engine_class = list( _get_engine_classes( @@ -96,9 +96,9 @@ def test_get_engine_class(): ) ) assert engine_class == [ - FakeEngine, - FakeEngineHolder.NestedFakeEngine, - FakeDefaultEngine, + ("provider3", FakeEngine), + ("provider4", FakeEngineHolder.NestedFakeEngine), + ("default", FakeDefaultEngine), ] engine_class = list( @@ -110,9 +110,9 @@ def test_get_engine_class(): ) ) assert engine_class == [ - FakeEngineHolder.NestedFakeEngine, - FakeEngine, 
- FakeDefaultEngine, + ("provider4", FakeEngineHolder.NestedFakeEngine), + ("provider3", FakeEngine), + ("default", FakeDefaultEngine), ] engine_specs = engine_specs + ( diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index e65b0ec5e9b02..acea37c1e624c 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -55,7 +55,7 @@ from ._k_means_elkan import elkan_iter_chunked_dense from ._k_means_elkan import elkan_iter_chunked_sparse from .._engine import get_engine_classes - +from .._config import get_config ############################################################################### # Initialization heuristic @@ -1550,14 +1550,26 @@ def _check_params_vs_input(self, X): ) self._algorithm = "lloyd" - def _get_engine(self, X, y=None, sample_weight=None): - for engine_class in get_engine_classes( - "kmeans", default=KMeansCythonEngine, verbose=self.verbose + def _get_engine(self, X, y=None, sample_weight=None, reset=False): + for provider, engine_class in get_engine_classes( + "kmeans", default=KMeansCythonEngine ): + if hasattr(self, "_engine_provider") and not reset: + if self._engine_provider != provider: + continue + engine = engine_class(self) - if engine.accepts(X, y=y, sample_weight=sample_weight): + if engine.accepts(X, y=y): + self._engine_provider = provider return engine + if hasattr(self, "_engine_provider"): + raise RuntimeError( + "Estimator was previously fitted with the" + f" {self._engine_provider} engine, but it is not available. 
Currently" + f" configured engines: {get_config()['engine_provider']}" + ) + def _warn_mkl_vcomp(self, n_active_threads): """Warn when vcomp and mkl are both present""" warnings.warn( @@ -1596,7 +1608,7 @@ def fit(self, X, y=None, sample_weight=None): self._validate_params() self._check_params_vs_input(X) - engine = self._get_engine(X, y, sample_weight) + engine = self._get_engine(X, y, sample_weight, reset=True) if hasattr(engine, "pre_fit"): X, y, sample_weight = engine.pre_fit( diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 9afb4649d3cb9..ffe0bd04b6c7b 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -20,6 +20,7 @@ from ..base import ClassifierMixin from ..utils._param_validation import StrOptions from .._engine import get_engine_classes +from .._config import get_config class KNeighborsClassifierCythonEngine: @@ -307,7 +308,7 @@ def fit(self, X, y): """ self._validate_params() - engine = self._get_engine(X, y) + engine = self._get_engine(X, y, reset=True) if hasattr(engine, "pre_fit"): X, y, sample_weight = engine.pre_fit( @@ -463,14 +464,26 @@ class from an array representing our data set and ask who's def _more_tags(self): return {"multilabel": True} - def _get_engine(self, X, y=None, sample_weight=None): - for engine_class in get_engine_classes( + def _get_engine(self, X, y=None, sample_weight=None, reset=False): + for provider, engine_class in get_engine_classes( "kneigborsclassifier", default=KNeighborsClassifierCythonEngine ): + if hasattr(self, "_engine_provider") and not reset: + if self._engine_provider != provider: + continue + engine = engine_class(self) if engine.accepts(X, y=y): + self._engine_provider = provider return engine + if hasattr(self, "_engine_provider"): + raise RuntimeError( + "Estimator was previously fitted with the" + f" {self._engine_provider} engine, but it is not available. 
Currently" + f" configured engines: {get_config()['engine_provider']}" + ) + class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, NeighborsBase): """Classifier implementing a vote among neighbors within a given radius. From ed9d04ab85dc98172ff7d30bac064a8f149b312f Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 15 Nov 2022 10:47:53 +0100 Subject: [PATCH 43/43] Fix test --- sklearn/cluster/_kmeans.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index acea37c1e624c..de54602e5ebfc 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1606,7 +1606,6 @@ def fit(self, X, y=None, sample_weight=None): Fitted estimator. """ self._validate_params() - self._check_params_vs_input(X) engine = self._get_engine(X, y, sample_weight, reset=True)