From 630e7340b0786d5fc9e029f68fbb9028b3893ad0 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Mon, 24 Oct 2022 17:10:41 +0530 Subject: [PATCH 01/25] modified constraints to allow p < 1 --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 3a0a702be3792..4dbcc5320f3f0 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -382,7 +382,7 @@ class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "radius": [Interval(Real, 0, None, closed="both"), None], "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})], "leaf_size": [Interval(Integral, 1, None, closed="left")], - "p": [Interval(Real, 1, None, closed="both"), None], + "p": [Interval(Real, 0, None, closed="both"), None], "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], "metric_params": [dict, None], "n_jobs": [Integral, None], From cdf7e6ae3483e5b4572d72e299e00b5c63075b8e Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Tue, 25 Oct 2022 00:45:24 +0530 Subject: [PATCH 02/25] Added feature to conditionally allow minkowski with p < 1 --- sklearn/neighbors/_base.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 4dbcc5320f3f0..9500476c66763 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -447,12 +447,6 @@ def _check_algorithm_metric(self): SyntaxWarning, stacklevel=3, ) - effective_p = self.metric_params["p"] - else: - effective_p = self.p - - if self.metric in ["wminkowski", "minkowski"] and effective_p < 1: - raise ValueError("p must be greater or equal to one for minkowski metric") def _fit(self, X, y=None): if self._get_tags()["requires_y"]: @@ -619,6 +613,21 @@ def _fit(self, X, y=None): else: self._fit_method = "brute" + if ( + self.effective_metric_ in ["wminkowski", "minkowski"] + and self.effective_metric_params_["p"] < 1 + ): + # For p < 1 minkowski is not a valid metric as it does not + # satisfy triangular inequality, but that is workable for + # bruteforce algorithm. But kd_tree and ball_tree requires + # valid metric to work + if self._fit_method == "brute": + warnings.warn("for p < 1 minkowski is not a valid metric") + else: + raise ValueError( + "p must be greater or equal to one for minkowski metric" + ) + if self._fit_method == "ball_tree": self._tree = BallTree( X, From 956f4703a4e1285c621eadac4647cb1796dae4de Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 00:21:18 +0530 Subject: [PATCH 03/25] for lgorithm=auto set _fit_method=brute when p < 1 --- sklearn/neighbors/_base.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 9500476c66763..ed9012cf2276f 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -427,8 +427,7 @@ def _check_algorithm_metric(self): raise ValueError( "kd_tree does not support callable metric '%s'" "Function call overhead will result" - "in very poor performance." - % self.metric + "in very poor performance." % self.metric ) elif self.metric not in VALID_METRICS[alg_check]: raise ValueError( @@ -590,6 +589,11 @@ def _fit(self, X, y=None): self._fit_method = "brute" else: if ( + self.effective_metric_ == "minkowski" + and self.effective_metric_params_["p"] < 1 + ): + self._fit_method = "brute" + elif ( self.effective_metric_ == "minkowski" and self.effective_metric_params_.get("w") is not None ): @@ -855,8 +859,7 @@ class from an array representing our data set and ask who's if issparse(X): raise ValueError( "%s does not work with sparse matrices. Densify the data, " - "or set algorithm='brute'" - % self._fit_method + "or set algorithm='brute'" % self._fit_method ) chunked_results = Parallel(n_jobs, prefer="threads")( delayed(_tree_query_parallel_helper)( @@ -1206,8 +1209,7 @@ class from an array representing our data set and ask who's if issparse(X): raise ValueError( "%s does not work with sparse matrices. Densify the data, " - "or set algorithm='brute'" - % self._fit_method + "or set algorithm='brute'" % self._fit_method ) n_jobs = effective_n_jobs(self.n_jobs) From 8cd400732431ad9ccde51d6253c793468be39899 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 00:33:09 +0530 Subject: [PATCH 04/25] black formatting --- sklearn/neighbors/_base.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ed9012cf2276f..90f1bf03559bd 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -427,7 +427,8 @@ def _check_algorithm_metric(self): raise ValueError( "kd_tree does not support callable metric '%s'" "Function call overhead will result" - "in very poor performance." % self.metric + "in very poor performance." + % self.metric ) elif self.metric not in VALID_METRICS[alg_check]: raise ValueError( @@ -859,7 +860,8 @@ class from an array representing our data set and ask who's if issparse(X): raise ValueError( "%s does not work with sparse matrices. Densify the data, " - "or set algorithm='brute'" % self._fit_method + "or set algorithm='brute'" + % self._fit_method ) chunked_results = Parallel(n_jobs, prefer="threads")( delayed(_tree_query_parallel_helper)( @@ -1209,7 +1211,8 @@ class from an array representing our data set and ask who's if issparse(X): raise ValueError( "%s does not work with sparse matrices. Densify the data, " - "or set algorithm='brute'" % self._fit_method + "or set algorithm='brute'" + % self._fit_method ) n_jobs = effective_n_jobs(self.n_jobs) From f38bb21c15103bd16491cb4eacca251222b985ba Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 00:35:47 +0530 Subject: [PATCH 05/25] test added for validation of NeighborsBase for minkowski with p < 1 --- sklearn/neighbors/tests/test_neighbors.py | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 4cea0bbb00a5f..0d8657fb8b0ad 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1498,6 +1498,39 @@ def test_neighbors_validate_parameters(Estimator): nbrs.predict([[]]) +@pytest.mark.parametrize( + "Estimator", + [ + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neighbors_non_metric_minkowski(Estimator): + """ + Validation of all classes extending NeighborsBase for minkowski metric with p < 1 + """ + X = rng.random_sample((10, 2)) + y = np.ones(10) + + model = Estimator(p=0.1) + msg = "for p < 1 minkowski is not a valid metric" + with pytest.warns(UserWarning, match=msg): + model.fit(X, y) + assert model._fit_method == "brute" + + model = Estimator(algorithm="kd_tree", p=0.1) + msg = "p must be greater or equal to one for minkowski metric" + with pytest.raises(ValueError, match=msg): + model.fit(X, y) + + model = Estimator(algorithm="ball_tree", p=0.1) + msg = "p must be greater or equal to one for minkowski metric" + with pytest.raises(ValueError, match=msg): + model.fit(X, y) + + # TODO: remove when NearestNeighbors methods uses parameter validation mechanism def test_nearest_neighbors_validate_params(): """Validate parameter of NearestNeighbors.""" From d5d5dea1edac284b20cd2a7cffdcfc84dd75c447 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 01:23:52 +0530 Subject: [PATCH 06/25] modified error msg to include suggestion --- sklearn/neighbors/_base.py | 3 ++- sklearn/neighbors/tests/test_neighbors.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 90f1bf03559bd..e586bad4f28ce 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -630,7 +630,8 @@ def _fit(self, X, y=None): warnings.warn("for p < 1 minkowski is not a valid metric") else: raise ValueError( - "p must be greater or equal to one for minkowski metric" + "p must be greater or equal to one for minkowski metric, set p >= 1" + " or algorithm='brute'" ) if self._fit_method == "ball_tree": diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 0d8657fb8b0ad..f9324098d9d74 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1521,12 +1521,18 @@ def test_neighbors_non_metric_minkowski(Estimator): assert model._fit_method == "brute" model = Estimator(algorithm="kd_tree", p=0.1) - msg = "p must be greater or equal to one for minkowski metric" + msg = ( + "p must be greater or equal to one for minkowski metric, set p >= 1 or" + " algorithm='brute'" + ) with pytest.raises(ValueError, match=msg): model.fit(X, y) model = Estimator(algorithm="ball_tree", p=0.1) - msg = "p must be greater or equal to one for minkowski metric" + msg = ( + "p must be greater or equal to one for minkowski metric, set p >= 1 or" + " algorithm='brute'" + ) with pytest.raises(ValueError, match=msg): model.fit(X, y) From 06c1634f255030f46a1237df8452b791a8d80022 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Wed, 26 Oct 2022 13:33:14 +0530 Subject: [PATCH 07/25] Update sklearn/neighbors/_base.py Co-authored-by: Julien Jerphanion --- sklearn/neighbors/_base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index e586bad4f28ce..1865e9ce3d613 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -622,10 +622,12 @@ def _fit(self, X, y=None): self.effective_metric_ in ["wminkowski", "minkowski"] and self.effective_metric_params_["p"] < 1 ): - # For p < 1 minkowski is not a valid metric as it does not - # satisfy triangular inequality, but that is workable for - # bruteforce algorithm. But kd_tree and ball_tree requires - # valid metric to work + # For 0 < p < 1 Minkowski distances aren't valid distance + # metric as they do not satisfy triangular inequality: + # they are semimetrics. + # algo="kd_tree" and algo="ball_tree" can't be used because + # KDTree and BallTree require distance metric to work properly. + # However, the brute-force algorithm supports semimetrics. if self._fit_method == "brute": warnings.warn("for p < 1 minkowski is not a valid metric") else: From f88f4243d958520b676f6122661f67beb0c3bbc7 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Wed, 26 Oct 2022 13:41:16 +0530 Subject: [PATCH 08/25] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- sklearn/neighbors/_base.py | 8 +++++--- sklearn/neighbors/tests/test_neighbors.py | 10 ++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 1865e9ce3d613..351a8be1b3189 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -629,11 +629,13 @@ def _fit(self, X, y=None): # KDTree and BallTree require distance metric to work properly. # However, the brute-force algorithm supports semimetrics. if self._fit_method == "brute": - warnings.warn("for p < 1 minkowski is not a valid metric") + warnings.warn("Mind that for 0 < p < 1, Minkowski metrics are + not distance metric. Continuing the execution with `algo="brute"`.) else: raise ValueError( - "p must be greater or equal to one for minkowski metric, set p >= 1" - " or algorithm='brute'" + 'algo="kd_tree" or algo="ball_tree" requires 0 < p < 1 for' + " the Minkowski metric. To resolve this problem either " + "set p >= 1 or algo='brute'" ) if self._fit_method == "ball_tree": diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index f9324098d9d74..a7f74ef2f37e1 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1507,18 +1507,20 @@ def test_neighbors_validate_parameters(Estimator): neighbors.RadiusNeighborsRegressor, ], ) -def test_neighbors_non_metric_minkowski(Estimator): +def test_neighbors_minkowski_semimetric(Estimator): """ - Validation of all classes extending NeighborsBase for minkowski metric with p < 1 + Validation of all classes extending NeighborsBase with + Minkowski semi-metrics (i.e. when 0 < p < 1). """ X = rng.random_sample((10, 2)) y = np.ones(10) - model = Estimator(p=0.1) + model = Estimator(p=0.1, algo="auto") msg = "for p < 1 minkowski is not a valid metric" with pytest.warns(UserWarning, match=msg): model.fit(X, y) - assert model._fit_method == "brute" + + assert model._fit_method == "brute" model = Estimator(algorithm="kd_tree", p=0.1) msg = ( From f4f48b5093991e4ec28d73bb587fe4be94f99096 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 17:11:58 +0530 Subject: [PATCH 09/25] balck formatting --- sklearn/neighbors/_base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 351a8be1b3189..29bf03c22a659 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -629,8 +629,10 @@ def _fit(self, X, y=None): # KDTree and BallTree require distance metric to work properly. # However, the brute-force algorithm supports semimetrics. if self._fit_method == "brute": - warnings.warn("Mind that for 0 < p < 1, Minkowski metrics are - not distance metric. Continuing the execution with `algo="brute"`.) + warnings.warn( + "Mind that for 0 < p < 1, Minkowski metrics are not distance" + " metric. Continuing the execution with `algo='brute'`." + ) else: raise ValueError( 'algo="kd_tree" or algo="ball_tree" requires 0 < p < 1 for' From a8a7c9d947af8309853b249ce1d00c5054d85182 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 17:14:18 +0530 Subject: [PATCH 10/25] modified tests accoring to suggestions --- sklearn/neighbors/tests/test_neighbors.py | 30 +++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a7f74ef2f37e1..268ff5f6ea6f8 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1507,33 +1507,47 @@ def test_neighbors_validate_parameters(Estimator): neighbors.RadiusNeighborsRegressor, ], ) -def test_neighbors_minkowski_semimetric(Estimator): +@pytest.mark.parametrize("n_features", [2, 100]) +def test_neighbors_minkowski_semimetric(Estimator, n_features): """ Validation of all classes extending NeighborsBase with Minkowski semi-metrics (i.e. when 0 < p < 1). """ - X = rng.random_sample((10, 2)) + X = rng.random_sample((10, n_features)) y = np.ones(10) - model = Estimator(p=0.1, algo="auto") - msg = "for p < 1 minkowski is not a valid metric" + model = Estimator(p=0.1, algorithm="auto") + msg = ( + "Mind that for 0 < p < 1, Minkowski metrics are not distance metric. Continuing" + " the execution with `algo='brute'`." + ) with pytest.warns(UserWarning, match=msg): model.fit(X, y) assert model._fit_method == "brute" + model = Estimator(p=0.1, algorithm="brute") + msg = ( + "Mind that for 0 < p < 1, Minkowski metrics are not distance metric. Continuing" + " the execution with `algo='brute'`." + ) + with pytest.warns(UserWarning, match=msg): + model.fit(X, y) + model = Estimator(algorithm="kd_tree", p=0.1) msg = ( - "p must be greater or equal to one for minkowski metric, set p >= 1 or" - " algorithm='brute'" + 'algo="kd_tree" or algo="ball_tree" requires 0 < p < 1 for' + " the Minkowski metric. To resolve this problem either " + "set p >= 1 or algo='brute'" ) with pytest.raises(ValueError, match=msg): model.fit(X, y) model = Estimator(algorithm="ball_tree", p=0.1) msg = ( - "p must be greater or equal to one for minkowski metric, set p >= 1 or" - " algorithm='brute'" + 'algo="kd_tree" or algo="ball_tree" requires 0 < p < 1 for' + " the Minkowski metric. To resolve this problem either " + "set p >= 1 or algo='brute'" ) with pytest.raises(ValueError, match=msg): model.fit(X, y) From 24e6dc8de0c4e460c3a8eff3430abe5bb3a9b6f3 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 17:15:39 +0530 Subject: [PATCH 11/25] added enhancement changelog to v1.2 --- doc/whats_new/v1.2.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index e2b18cd0149a2..315637bcfcc9d 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -495,6 +495,14 @@ Changelog :pr:`10468` by :user:`Ruben ` and :pr:`22993` by :user:`Jovan Stojanovic `. +- |Enhancement| :class:`neighbors.NeighborsBase` now accepts Minkowski metric with + `0 < p < 1` for `algorithm="brute"` and raises `UserWarning`. For `algorithm="kd_tree"` + and `algorithm="ball_tree"` it raises `ValueError` as Minkowski with `0 < p < 1` isn't valid + distance metric. If `algorithm="auto"` and metric is Minkowski with `0 < p < 1`, `brute` algorithm + will be used. + :pr:`24750` by :user:`Rudresh Veerkhare ` + + - |Efficiency| :class:`neighbors.NearestCentroid` is faster and requires less memory as it better leverages CPUs' caches to compute predictions. :pr:`24645` by :user:`Olivier Grisel `. From 548fe87e77c47f191d1384f42e812735f4f385f4 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Wed, 26 Oct 2022 18:30:41 +0530 Subject: [PATCH 12/25] Update doc/whats_new/v1.2.rst Co-authored-by: Julien Jerphanion --- doc/whats_new/v1.2.rst | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 88293baef8b20..39cc453c8f402 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -495,14 +495,11 @@ Changelog :pr:`10468` by :user:`Ruben ` and :pr:`22993` by :user:`Jovan Stojanovic `. -- |Enhancement| :class:`neighbors.NeighborsBase` now accepts Minkowski metric with - `0 < p < 1` for `algorithm="brute"` and raises `UserWarning`. For `algorithm="kd_tree"` - and `algorithm="ball_tree"` it raises `ValueError` as Minkowski with `0 < p < 1` isn't valid - distance metric. If `algorithm="auto"` and metric is Minkowski with `0 < p < 1`, `brute` algorithm - will be used. +- |Enhancement| :class:`neighbors.NeighborsBase` now accepts + Minkowski semi-metric (i.e. when :math:`0 < p < 1` for + `metric="minkowski"`) for `algo="auto"` or `algo="brute"`. :pr:`24750` by :user:`Rudresh Veerkhare ` - - |Efficiency| :class:`neighbors.NearestCentroid` is faster and requires less memory as it better leverages CPUs' caches to compute predictions. :pr:`24645` by :user:`Olivier Grisel `. From cfcfc0de62d6ddb4ff31b5994ef7cb8d6db7c378 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Wed, 26 Oct 2022 18:33:31 +0530 Subject: [PATCH 13/25] Update sklearn/neighbors/_base.py Co-authored-by: Julien Jerphanion --- sklearn/neighbors/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 29bf03c22a659..decd58dff4b2a 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -633,11 +633,11 @@ def _fit(self, X, y=None): "Mind that for 0 < p < 1, Minkowski metrics are not distance" " metric. Continuing the execution with `algo='brute'`." ) - else: + else: # self._fit_method in ("kd_tree", "ball_tree") raise ValueError( - 'algo="kd_tree" or algo="ball_tree" requires 0 < p < 1 for' - " the Minkowski metric. To resolve this problem either " - "set p >= 1 or algo='brute'" + f'algo="{self._fit_method}" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algo="brute".' ) if self._fit_method == "ball_tree": From 0d1b5c6019f2bcdc52149ff8729fdc701e75d6e9 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 19:01:08 +0530 Subject: [PATCH 14/25] modified _parameter_constraints to exclude p=0 --- sklearn/neighbors/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index decd58dff4b2a..3516c3c88173f 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -382,7 +382,7 @@ class NeighborsBase(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "radius": [Interval(Real, 0, None, closed="both"), None], "algorithm": [StrOptions({"auto", "ball_tree", "kd_tree", "brute"})], "leaf_size": [Interval(Integral, 1, None, closed="left")], - "p": [Interval(Real, 0, None, closed="both"), None], + "p": [Interval(Real, 0, None, closed="right"), None], "metric": [StrOptions(set(itertools.chain(*VALID_METRICS.values()))), callable], "metric_params": [dict, None], "n_jobs": [Integral, None], @@ -633,7 +633,7 @@ def _fit(self, X, y=None): "Mind that for 0 < p < 1, Minkowski metrics are not distance" " metric. Continuing the execution with `algo='brute'`." ) - else: # self._fit_method in ("kd_tree", "ball_tree") + else: # self._fit_method in ("kd_tree", "ball_tree") raise ValueError( f'algo="{self._fit_method}" does not support 0 < p < 1 for ' "the Minkowski metric. To resolve this problem either " From e22e811071b2ec771148e2d51dccb2b79e7fd7b5 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Wed, 26 Oct 2022 19:03:42 +0530 Subject: [PATCH 15/25] added test to validate minkowski with p=0 --- sklearn/neighbors/tests/test_neighbors.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 268ff5f6ea6f8..158c93af11943 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1516,6 +1516,11 @@ def test_neighbors_minkowski_semimetric(Estimator, n_features): X = rng.random_sample((10, n_features)) y = np.ones(10) + model = Estimator(p=0) + with pytest.raises(ValueError): + # This Exception will be raised by _validate_params called in fit method + model.fit(X, y) + model = Estimator(p=0.1, algorithm="auto") msg = ( "Mind that for 0 < p < 1, Minkowski metrics are not distance metric. Continuing" @@ -1536,18 +1541,18 @@ def test_neighbors_minkowski_semimetric(Estimator, n_features): model = Estimator(algorithm="kd_tree", p=0.1) msg = ( - 'algo="kd_tree" or algo="ball_tree" requires 0 < p < 1 for' - " the Minkowski metric. To resolve this problem either " - "set p >= 1 or algo='brute'" + 'algo="kd_tree" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algo="brute".' ) with pytest.raises(ValueError, match=msg): model.fit(X, y) model = Estimator(algorithm="ball_tree", p=0.1) msg = ( - 'algo="kd_tree" or algo="ball_tree" requires 0 < p < 1 for' - " the Minkowski metric. To resolve this problem either " - "set p >= 1 or algo='brute'" + 'algo="ball_tree" does not support 0 < p < 1 for ' + "the Minkowski metric. To resolve this problem either " + 'set p >= 1 or algo="brute".' ) with pytest.raises(ValueError, match=msg): model.fit(X, y) From a06addc7a1361b5245ec6dbede2410ffe2f45929 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Wed, 26 Oct 2022 20:27:01 +0530 Subject: [PATCH 16/25] Update sklearn/neighbors/_base.py Co-authored-by: Julien Jerphanion --- sklearn/neighbors/_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 3516c3c88173f..ac6a2db3d48df 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -590,7 +590,8 @@ def _fit(self, X, y=None): self._fit_method = "brute" else: if ( - self.effective_metric_ == "minkowski" + # TODO(1.3): remove "wminkowski" + self.effective_metric_ in ("wminkowski", "minkowski") and self.effective_metric_params_["p"] < 1 ): self._fit_method = "brute" From 160b8421ca68eb1d7e9c3c0b1bd33db6ad13d874 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Wed, 26 Oct 2022 20:27:19 +0530 Subject: [PATCH 17/25] Update sklearn/neighbors/_base.py Co-authored-by: Julien Jerphanion --- sklearn/neighbors/_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ac6a2db3d48df..6ed7a717c127d 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -620,7 +620,8 @@ def _fit(self, X, y=None): self._fit_method = "brute" if ( - self.effective_metric_ in ["wminkowski", "minkowski"] + # TODO(1.3): remove "wminkowski" + self.effective_metric_ in ("wminkowski", "minkowski") and self.effective_metric_params_["p"] < 1 ): # For 0 < p < 1 Minkowski distances aren't valid distance From cc47bace11e0bb5b08c72681bb8702c4f639ed8e Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Thu, 3 Nov 2022 23:31:13 +0530 Subject: [PATCH 18/25] Update sklearn/neighbors/_base.py Co-authored-by: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 6ed7a717c127d..9134ff4f74417 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -633,7 +633,7 @@ def _fit(self, X, y=None): if self._fit_method == "brute": warnings.warn( "Mind that for 0 < p < 1, Minkowski metrics are not distance" - " metric. Continuing the execution with `algo='brute'`." + " metrics. Continuing the execution with `algo='brute'`." ) else: # self._fit_method in ("kd_tree", "ball_tree") raise ValueError( From 65b033d8b399db080fc4569c64fff63b69e99887 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Thu, 3 Nov 2022 23:33:19 +0530 Subject: [PATCH 19/25] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 9134ff4f74417..7940a21ef7966 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -629,7 +629,7 @@ def _fit(self, X, y=None): # they are semimetrics. # algo="kd_tree" and algo="ball_tree" can't be used because # KDTree and BallTree require distance metric to work properly. - # However, the brute-force algorithm supports semimetrics. + # However, the brute-force algorithm supports semi-metrics. if self._fit_method == "brute": warnings.warn( "Mind that for 0 < p < 1, Minkowski metrics are not distance" From 9151b1c3a2324e880bb810b082eb0cffeec34883 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Thu, 3 Nov 2022 23:37:23 +0530 Subject: [PATCH 20/25] Update sklearn/neighbors/_base.py Co-authored-by: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 7940a21ef7966..53321d7d103b3 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -628,7 +628,7 @@ def _fit(self, X, y=None): # metric as they do not satisfy triangular inequality: # they are semimetrics. # algo="kd_tree" and algo="ball_tree" can't be used because - # KDTree and BallTree require distance metric to work properly. + # KDTree and BallTree require a proper distance metric to work properly. # However, the brute-force algorithm supports semi-metrics. if self._fit_method == "brute": warnings.warn( From 90d5db510c9874d7ab6f30915d41973acb59afb2 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Sat, 5 Nov 2022 11:56:14 +0530 Subject: [PATCH 21/25] modified error and warning messages according to review --- sklearn/neighbors/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 53321d7d103b3..3e653dbbfca3d 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -627,19 +627,19 @@ def _fit(self, X, y=None): # For 0 < p < 1 Minkowski distances aren't valid distance # metric as they do not satisfy triangular inequality: # they are semimetrics. - # algo="kd_tree" and algo="ball_tree" can't be used because + # algorithm="kd_tree" and algorithm="ball_tree" can't be used because # KDTree and BallTree require a proper distance metric to work properly. # However, the brute-force algorithm supports semi-metrics. if self._fit_method == "brute": warnings.warn( "Mind that for 0 < p < 1, Minkowski metrics are not distance" - " metrics. Continuing the execution with `algo='brute'`." + " metrics. Continuing the execution with `algorithm='brute'`." ) else: # self._fit_method in ("kd_tree", "ball_tree") raise ValueError( - f'algo="{self._fit_method}" does not support 0 < p < 1 for ' + f'algorithm="{self._fit_method}" does not support 0 < p < 1 for ' "the Minkowski metric. To resolve this problem either " - 'set p >= 1 or algo="brute".' + 'set p >= 1 or algorithm="brute".' ) if self._fit_method == "ball_tree": From 670c8d38ce4cfb4c8861fe402e82904e023a0a36 Mon Sep 17 00:00:00 2001 From: RudreshVeerkhare Date: Sat, 5 Nov 2022 11:58:15 +0530 Subject: [PATCH 22/25] rearranged tests to use parameterization --- sklearn/neighbors/tests/test_neighbors.py | 53 +++++++++++------------ 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 158c93af11943..418ca7d4c505e 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1508,51 +1508,48 @@ def test_neighbors_validate_parameters(Estimator): ], ) @pytest.mark.parametrize("n_features", [2, 100]) -def test_neighbors_minkowski_semimetric(Estimator, n_features): +@pytest.mark.parametrize("algorithm", ["auto", "brute"]) +def test_neighbors_minkowski_semimetric_algo_warn(Estimator, n_features, algorithm): """ Validation of all classes extending NeighborsBase with - Minkowski semi-metrics (i.e. when 0 < p < 1). + Minkowski semi-metrics (i.e. when 0 < p < 1). That proper + Warning is raised for algorith "auto" and "brute" """ X = rng.random_sample((10, n_features)) y = np.ones(10) - model = Estimator(p=0) - with pytest.raises(ValueError): - # This Exception will be raised by _validate_params called in fit method - model.fit(X, y) - - model = Estimator(p=0.1, algorithm="auto") + model = Estimator(p=0.1, algorithm=algorithm) msg = ( - "Mind that for 0 < p < 1, Minkowski metrics are not distance metric. Continuing" - " the execution with `algo='brute'`." + "Mind that for 0 < p < 1, Minkowski metrics are not distance" + " metrics. Continuing the execution with `algorithm='brute'`." ) with pytest.warns(UserWarning, match=msg): model.fit(X, y) assert model._fit_method == "brute" - model = Estimator(p=0.1, algorithm="brute") - msg = ( - "Mind that for 0 < p < 1, Minkowski metrics are not distance metric. Continuing" - " the execution with `algo='brute'`." - ) - with pytest.warns(UserWarning, match=msg): - model.fit(X, y) - model = Estimator(algorithm="kd_tree", p=0.1) - msg = ( - 'algo="kd_tree" does not support 0 < p < 1 for ' - "the Minkowski metric. To resolve this problem either " - 'set p >= 1 or algo="brute".' - ) - with pytest.raises(ValueError, match=msg): - model.fit(X, y) +@pytest.mark.parametrize( + "Estimator", + [ + neighbors.KNeighborsClassifier, + neighbors.RadiusNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsRegressor, + ], +) +@pytest.mark.parametrize("n_features", [2, 100]) +@pytest.mark.parametrize("algorithm", ["kd_tree", "ball_tree"]) +def test_neighbors_minkowski_semimetric_algo_error(Estimator, n_features, algorithm): + """Check that we raise a proper error if `algorithm!='brute'` and `p<1`.""" + X = rng.random_sample((10, 2)) + y = np.ones(10) - model = Estimator(algorithm="ball_tree", p=0.1) + model = Estimator(algorithm=algorithm, p=0.1) msg = ( - 'algo="ball_tree" does not support 0 < p < 1 for ' + f'algorithm="{algorithm}" does not support 0 < p < 1 for ' "the Minkowski metric. To resolve this problem either " - 'set p >= 1 or algo="brute".' + 'set p >= 1 or algorithm="brute".' ) with pytest.raises(ValueError, match=msg): model.fit(X, y) From dfd6ada723d5971144d17b85365abe5525ce5583 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Sat, 5 Nov 2022 12:03:32 +0530 Subject: [PATCH 23/25] Update sklearn/neighbors/_base.py Co-authored-by: Guillaume Lemaitre --- sklearn/neighbors/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 3e653dbbfca3d..07b4e4d5996ff 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -626,7 +626,7 @@ def _fit(self, X, y=None): ): # For 0 < p < 1 Minkowski distances aren't valid distance # metric as they do not satisfy triangular inequality: - # they are semimetrics. + # they are semi-metrics. # algorithm="kd_tree" and algorithm="ball_tree" can't be used because # KDTree and BallTree require a proper distance metric to work properly. # However, the brute-force algorithm supports semi-metrics. From 1e9b9f4e903d1000fb4d38c02bae61864fa46f95 Mon Sep 17 00:00:00 2001 From: Rudresh Veerkhare <42195753+RudreshVeerkhare@users.noreply.github.com> Date: Sat, 5 Nov 2022 16:52:18 +0530 Subject: [PATCH 24/25] spelling fix --- sklearn/neighbors/tests/test_neighbors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 418ca7d4c505e..f885574ff3fae 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1513,7 +1513,7 @@ def test_neighbors_minkowski_semimetric_algo_warn(Estimator, n_features, algorit """ Validation of all classes extending NeighborsBase with Minkowski semi-metrics (i.e. when 0 < p < 1). That proper - Warning is raised for algorith "auto" and "brute" + Warning is raised for algorithm "auto" and "brute" """ X = rng.random_sample((10, n_features)) y = np.ones(10) From 92e620089ffb76836038cac028e23e0766286668 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 10 Nov 2022 10:15:30 +0100 Subject: [PATCH 25/25] Apply suggestions from code review --- doc/whats_new/v1.2.rst | 2 +- sklearn/neighbors/tests/test_neighbors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 7c94ab7e4ceb3..b2ab73606fca9 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -522,7 +522,7 @@ Changelog - |Enhancement| :class:`neighbors.NeighborsBase` now accepts Minkowski semi-metric (i.e. when :math:`0 < p < 1` for - `metric="minkowski"`) for `algo="auto"` or `algo="brute"`. + `metric="minkowski"`) for `algorithm="auto"` or `algorithm="brute"`. :pr:`24750` by :user:`Rudresh Veerkhare ` - |Efficiency| :class:`neighbors.NearestCentroid` is faster and requires diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index f885574ff3fae..b82d369f2c12c 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1513,7 +1513,7 @@ def test_neighbors_minkowski_semimetric_algo_warn(Estimator, n_features, algorit """ Validation of all classes extending NeighborsBase with Minkowski semi-metrics (i.e. when 0 < p < 1). That proper - Warning is raised for algorithm "auto" and "brute" + Warning is raised for `algorithm="auto"` and "brute". """ X = rng.random_sample((10, n_features)) y = np.ones(10)