From cb2bbed3b4e9e8dba14b70739fc488d877fbe45c Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Wed, 20 Apr 2016 16:14:57 +0200 Subject: [PATCH 01/34] Files for my dev environment with Docker --- hack-dev/Dockerfile | 20 ++++++++++++++++++++ hack-dev/Makefile | 5 +++++ 2 files changed, 25 insertions(+) create mode 100644 hack-dev/Dockerfile create mode 100644 hack-dev/Makefile diff --git a/hack-dev/Dockerfile b/hack-dev/Dockerfile new file mode 100644 index 0000000000000..de710cb710b02 --- /dev/null +++ b/hack-dev/Dockerfile @@ -0,0 +1,20 @@ +FROM ubuntu + +RUN apt-get update +RUN apt-get install -y -q \ + python-dev \ + python-setuptools \ + make \ + pkg-config \ + libfreetype6 \ + libfreetype6-dev \ + libpng12-dev \ + g++ \ + ipython +RUN easy_install pip +RUN pip install \ + 'scipy>=0.9' \ + 'nose>=1.1.2' \ + 'matplotlib>=1.1.1' \ + 'numpy>=1.6.1' \ + 'cython>=0.23' diff --git a/hack-dev/Makefile b/hack-dev/Makefile new file mode 100644 index 0000000000000..46bae75e818ba --- /dev/null +++ b/hack-dev/Makefile @@ -0,0 +1,5 @@ +dev-image: + docker build -t boechat107/scikit-dev . 
+ +run-image: + docker run -it --rm --name scikit-dev -v $(PWD)/../../:/work/codes -w /work/codes boechat107/scikit-dev bash From cbde08218160a55a0772322faa598c13930fbb0b Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Wed, 20 Apr 2016 19:38:51 +0200 Subject: [PATCH 02/34] Fixing label clamping (alpha=0 for hard clamping) --- sklearn/semi_supervised/label_propagation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 1759b2c1d7572..5dc87e81f2ecc 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -242,7 +242,7 @@ def fit(self, X, y): y = np.asarray(y) unlabeled = y == -1 clamp_weights = np.ones((n_samples, 1)) - clamp_weights[unlabeled, 0] = self.alpha + clamp_weights[~unlabeled, 0] = self.alpha # initialize distributions self.label_distributions_ = np.zeros((n_samples, n_classes)) From 979d872d5b5f5d0f9ba86f3f79b35720b0ecc02e Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Wed, 20 Apr 2016 20:00:14 +0200 Subject: [PATCH 03/34] Deprecating alpha, fixing its value to zero --- sklearn/semi_supervised/label_propagation.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 5dc87e81f2ecc..6369ca73c86ba 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -55,6 +55,7 @@ # License: BSD from abc import ABCMeta, abstractmethod +import warnings import numpy as np from scipy import sparse @@ -299,7 +300,7 @@ class LabelPropagation(BaseLabelPropagation): Parameter for knn kernel alpha : float - Clamping factor + (DEPRECATED) Clamping factor max_iter : float Change maximum number of iterations allowed @@ -350,6 +351,19 @@ class LabelPropagation(BaseLabelPropagation): LabelSpreading : Alternate 
label propagation strategy more robust to noise """ + def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, + alpha=None, max_iter=30, tol=1e-3, n_jobs=1): + # Deprecating the parameter 'alpha', fixing it to zero. + if alpha is not None: + warnings.warn("The parameter 'alpha' is deprecated", + category=DeprecationWarning) + alpha = 0.; + super(LabelPropagation, self).__init__(kernel=kernel, gamma=gamma, + n_neighbors=n_neighbors, + alpha=alpha, max_iter=max_iter, + tol=tol, + n_jobs=n_jobs) + def _build_graph(self): """Matrix representing a fully connected graph between each sample From 262f7589fc2dfe298dfcb44888f2ea02fbc7a3f0 Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Tue, 26 Apr 2016 18:25:06 +0200 Subject: [PATCH 04/34] Correct way to deprecate alpha for LabelPropagation The previous way was breaking the test sklearn.tests.test_common.test_all_estimators --- sklearn/semi_supervised/label_propagation.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 6369ca73c86ba..13a66fe0eeee6 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -300,7 +300,8 @@ class LabelPropagation(BaseLabelPropagation): Parameter for knn kernel alpha : float - (DEPRECATED) Clamping factor + (DEPRECATED) Clamping factor. It's internally fixed to zero and it'll + be removed in another release max_iter : float Change maximum number of iterations allowed @@ -352,12 +353,10 @@ class LabelPropagation(BaseLabelPropagation): """ def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, - alpha=None, max_iter=30, tol=1e-3, n_jobs=1): - # Deprecating the parameter 'alpha', fixing it to zero. 
- if alpha is not None: - warnings.warn("The parameter 'alpha' is deprecated", - category=DeprecationWarning) - alpha = 0.; + alpha=.0, max_iter=30, tol=1e-3, n_jobs=1): + # alpha is deprecated for LabelPropagation because it doesn't have any + # theoretical meaning (from the reference paper). + alpha = .0 super(LabelPropagation, self).__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, max_iter=max_iter, From 7e3a2ed1aac182802f126b340bd3fb054ef1980f Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Tue, 26 Apr 2016 21:08:33 +0200 Subject: [PATCH 05/34] Detailed info for LabelSpreading's alpha parameter Based on the original paper. --- sklearn/semi_supervised/label_propagation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 13a66fe0eeee6..0ccaca2ee1f66 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -404,7 +404,10 @@ class LabelSpreading(BaseLabelPropagation): parameter for knn kernel alpha : float - clamping factor + Clamping factor [0, 1], it specifies the relative amount of the + information from its neighbors and its initial label information. + alpha=0 means keeping the initial label information; alpha=1 means + replacing all initial information. 
max_iter : float maximum number of iterations allowed From 03311bf704580f9bdfa7237a3f00e5df59ab1d3f Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Wed, 27 Apr 2016 21:23:36 +0200 Subject: [PATCH 06/34] Minor changes in the deprecation message --- sklearn/semi_supervised/label_propagation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 0ccaca2ee1f66..391110d5826a0 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -300,8 +300,7 @@ class LabelPropagation(BaseLabelPropagation): Parameter for knn kernel alpha : float - (DEPRECATED) Clamping factor. It's internally fixed to zero and it'll - be removed in another release + DEPRECATED: It's internally fixed to zero and it'll be removed in 0.19 max_iter : float Change maximum number of iterations allowed From 9a68a41d8a202155fe04361356bb6c15eaf360c6 Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Thu, 8 Dec 2016 16:48:57 -0200 Subject: [PATCH 07/34] Improving "deprecated" doc string and raising DeprecationWarning --- sklearn/semi_supervised/label_propagation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 391110d5826a0..ae01b4d397460 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -300,7 +300,7 @@ class LabelPropagation(BaseLabelPropagation): Parameter for knn kernel alpha : float - DEPRECATED: It's internally fixed to zero and it'll be removed in 0.19 + DEPRECATED: Deprecated in 0.19 and it's going to be removed in 0.21. 
max_iter : float Change maximum number of iterations allowed @@ -352,9 +352,14 @@ class LabelPropagation(BaseLabelPropagation): """ def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, - alpha=.0, max_iter=30, tol=1e-3, n_jobs=1): + alpha=None, max_iter=30, tol=1e-3, n_jobs=1): # alpha is deprecated for LabelPropagation because it doesn't have any - # theoretical meaning (from the reference paper). + # theoretical meaning (from the reference paper). Look at PR 6727. + if alpha is not None: + warnings.warn( + "Deprecated in 0.19 and it's going to be removed in 0.21.", + DeprecationWarning + ) alpha = .0 super(LabelPropagation, self).__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, From c356a82f4aa2a723a2bf7de06d0b032c73b69217 Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Mon, 12 Dec 2016 15:16:20 -0200 Subject: [PATCH 08/34] Using a local "alpha" in "fit" to deprecate LabelPropagation's alpha This solution isn't great, but it sets the correct value for alpha without violating the restrictions imposed by the tests. --- sklearn/semi_supervised/label_propagation.py | 39 ++++++++++---------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index ae01b4d397460..7ceaec14a54f9 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -240,10 +240,13 @@ def fit(self, X, y): n_samples, n_classes = len(y), len(classes) + # Hack to deprecate "alpha" of LabelPropagation and set its correct + # value (look at LabelPropagation.fit). 
+ alpha = getattr(self, '_alpha', self.alpha) y = np.asarray(y) unlabeled = y == -1 clamp_weights = np.ones((n_samples, 1)) - clamp_weights[~unlabeled, 0] = self.alpha + clamp_weights[~unlabeled, 0] = alpha # initialize distributions self.label_distributions_ = np.zeros((n_samples, n_classes)) @@ -251,8 +254,8 @@ def fit(self, X, y): self.label_distributions_[y == label, classes == label] = 1 y_static = np.copy(self.label_distributions_) - if self.alpha > 0.: - y_static *= 1 - self.alpha + if alpha > 0.: + y_static *= 1 - alpha y_static[unlabeled] = 0 l_previous = np.zeros((self.X_.shape[0], n_classes)) @@ -351,22 +354,6 @@ class LabelPropagation(BaseLabelPropagation): LabelSpreading : Alternate label propagation strategy more robust to noise """ - def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, - alpha=None, max_iter=30, tol=1e-3, n_jobs=1): - # alpha is deprecated for LabelPropagation because it doesn't have any - # theoretical meaning (from the reference paper). Look at PR 6727. - if alpha is not None: - warnings.warn( - "Deprecated in 0.19 and it's going to be removed in 0.21.", - DeprecationWarning - ) - alpha = .0 - super(LabelPropagation, self).__init__(kernel=kernel, gamma=gamma, - n_neighbors=n_neighbors, - alpha=alpha, max_iter=max_iter, - tol=tol, - n_jobs=n_jobs) - def _build_graph(self): """Matrix representing a fully connected graph between each sample @@ -383,6 +370,18 @@ class distributions will exceed 1 (normalization may be desired). affinity_matrix /= normalizer[:, np.newaxis] return affinity_matrix + def fit(self, X, y): + # alpha is deprecated for LabelPropagation because it doesn't have any + # theoretical meaning (from the reference paper). Look at PR 6727. + if self.alpha is not None: + warnings.warn( + "Deprecated in 0.19 and it's going to be removed in 0.21.", + DeprecationWarning + ) + # Extra property to set the correct value of the deprecated "alpha". 
+ self._alpha = 0 + return super(LabelPropagation, self).fit(X, y) + class LabelSpreading(BaseLabelPropagation): """LabelSpreading model for semi-supervised learning @@ -409,7 +408,7 @@ class LabelSpreading(BaseLabelPropagation): alpha : float Clamping factor [0, 1], it specifies the relative amount of the - information from its neighbors and its initial label information. + information from its neighbors and its initial label information. alpha=0 means keeping the initial label information; alpha=1 means replacing all initial information. From 76310e53c086796f10431e132414a1e442e04801 Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Mon, 12 Dec 2016 15:30:46 -0200 Subject: [PATCH 09/34] Removal of my development files --- hack-dev/Dockerfile | 20 -------------------- hack-dev/Makefile | 5 ----- 2 files changed, 25 deletions(-) delete mode 100644 hack-dev/Dockerfile delete mode 100644 hack-dev/Makefile diff --git a/hack-dev/Dockerfile b/hack-dev/Dockerfile deleted file mode 100644 index de710cb710b02..0000000000000 --- a/hack-dev/Dockerfile +++ /dev/null @@ -1,20 +0,0 @@ -FROM ubuntu - -RUN apt-get update -RUN apt-get install -y -q \ - python-dev \ - python-setuptools \ - make \ - pkg-config \ - libfreetype6 \ - libfreetype6-dev \ - libpng12-dev \ - g++ \ - ipython -RUN easy_install pip -RUN pip install \ - 'scipy>=0.9' \ - 'nose>=1.1.2' \ - 'matplotlib>=1.1.1' \ - 'numpy>=1.6.1' \ - 'cython>=0.23' diff --git a/hack-dev/Makefile b/hack-dev/Makefile deleted file mode 100644 index 46bae75e818ba..0000000000000 --- a/hack-dev/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -dev-image: - docker build -t boechat107/scikit-dev . 
- -run-image: - docker run -it --rm --name scikit-dev -v $(PWD)/../../:/work/codes -w /work/codes boechat107/scikit-dev bash From ce15e2476ae37ae8540a9a0ee77b9135a50c7c72 Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Mon, 9 Jan 2017 14:51:48 -0200 Subject: [PATCH 10/34] Using sphinx's "deprecated" tag (jnothman's suggestion) --- sklearn/semi_supervised/label_propagation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 7ceaec14a54f9..5f8922c4213de 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -303,7 +303,11 @@ class LabelPropagation(BaseLabelPropagation): Parameter for knn kernel alpha : float - DEPRECATED: Deprecated in 0.19 and it's going to be removed in 0.21. + Clamping factor. + + .. deprecated:: 0.19 + This parameter will be removed in 0.21. + 'alpha' is fixed to zero in 'LabelPropagation'. max_iter : float Change maximum number of iterations allowed From e863810f16613020009bf4612f1c2fb75579e05c Mon Sep 17 00:00:00 2001 From: Andre Ambrosio Boechat Date: Mon, 9 Jan 2017 14:56:42 -0200 Subject: [PATCH 11/34] Deprecation warning: stating that the alpha's value will be ignored --- sklearn/semi_supervised/label_propagation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 5f8922c4213de..3d42062d1d066 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -379,7 +379,8 @@ def fit(self, X, y): # theoretical meaning (from the reference paper). Look at PR 6727. if self.alpha is not None: warnings.warn( - "Deprecated in 0.19 and it's going to be removed in 0.21.", + "alpha's value will be ignored." 
+ "alpha is deprecated since 0.19 and will be removed in 0.21.", DeprecationWarning ) # Extra property to set the correct value of the deprecated "alpha". From 356ccc2e7dabe9d6ffc6da71efad7351b18a0d02 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 13 Jun 2017 00:15:18 +1000 Subject: [PATCH 12/34] Use __init__ with alpha=None --- sklearn/semi_supervised/label_propagation.py | 16 ++++++++-------- .../tests/test_label_propagation.py | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 3d42062d1d066..2a70800ae0b66 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -240,9 +240,9 @@ def fit(self, X, y): n_samples, n_classes = len(y), len(classes) - # Hack to deprecate "alpha" of LabelPropagation and set its correct - # value (look at LabelPropagation.fit). - alpha = getattr(self, '_alpha', self.alpha) + alpha = self.alpha + if alpha is None: + alpha = 0 y = np.asarray(y) unlabeled = y == -1 clamp_weights = np.ones((n_samples, 1)) @@ -357,6 +357,11 @@ class LabelPropagation(BaseLabelPropagation): -------- LabelSpreading : Alternate label propagation strategy more robust to noise """ + def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, + alpha=None, max_iter=30, tol=1e-3, n_jobs=1): + super(LabelPropagation, self).__init__( + kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, + max_iter=max_iter, tol=tol, n_jobs=n_jobs) def _build_graph(self): """Matrix representing a fully connected graph between each sample @@ -375,16 +380,11 @@ class distributions will exceed 1 (normalization may be desired). return affinity_matrix def fit(self, X, y): - # alpha is deprecated for LabelPropagation because it doesn't have any - # theoretical meaning (from the reference paper). Look at PR 6727. if self.alpha is not None: warnings.warn( - "alpha's value will be ignored." 
"alpha is deprecated since 0.19 and will be removed in 0.21.", DeprecationWarning ) - # Extra property to set the correct value of the deprecated "alpha". - self._alpha = 0 return super(LabelPropagation, self).fit(X, y) diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 81e7dd028bf5d..b0b0e286dba4b 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -5,6 +5,8 @@ from sklearn.utils.testing import assert_equal from sklearn.semi_supervised import label_propagation from sklearn.metrics.pairwise import rbf_kernel +from sklearn.datasets import make_classification +from sklearn.utils.testing import assert_warns, assert_no_warnings from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal @@ -59,3 +61,15 @@ def test_predict_proba(): clf = estimator(**parameters).fit(samples, labels) assert_array_almost_equal(clf.predict_proba([[1., 1.]]), np.array([[0.5, 0.5]])) + + +def test_alpha_deprecation(): + X, y = make_classification(n_samples=100) + y[::3] = -1 + lp_default = label_propagation.LabelPropagation() + lp_default_y = assert_no_warnings(lp_default.fit, X, y).transduction_ + + lp_0 = label_propagation.LabelPropagation(alpha=0) + lp_0_y = assert_warns(DeprecationWarning, lp_0.fit, X, y).transduction_ + + assert_array_equal(lp_default_y, lp_0_y) From 1506b1b0e75d9653ef1ed9430b5cbfff6cf18c33 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 13 Jun 2017 12:19:02 +1000 Subject: [PATCH 13/34] Update what's new --- doc/whats_new.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index d367c627c27c4..3d3bae79c8641 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -443,6 +443,11 @@ Bug fixes :class:`decomposition.IncrementalPCA`. :issue:`9105` by `Hanmin Qin `_. 
+ - Fix :class:`semi_supervised.LabelPropagation` to always do hard clamping. Its ``alpha`` parameter now defaults to 0 and the parameter is deprecated to be removed in version 0.21. :issue:`6727` by :user:`Andre Ambrosio Boechat <boechat107>` and `Joel Nothman`_. + API changes summary ------------------- From 5508a240e5c2841b6a958c9dd13ab565a32129c6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 21 Jun 2017 10:10:01 +1000 Subject: [PATCH 14/34] Try fix RuntimeWarning in test_alpha_deprecation --- sklearn/semi_supervised/tests/test_label_propagation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index b0b0e286dba4b..0fbc5261d7925 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -66,10 +66,11 @@ def test_predict_proba(): def test_alpha_deprecation(): X, y = make_classification(n_samples=100) y[::3] = -1 - lp_default = label_propagation.LabelPropagation() + # Using kernel=knn as rbf appears to result in exp underflow + lp_default = label_propagation.LabelPropagation(kernel='knn') lp_default_y = assert_no_warnings(lp_default.fit, X, y).transduction_ - lp_0 = label_propagation.LabelPropagation(alpha=0) + lp_0 = label_propagation.LabelPropagation(alpha=0, kernel='knn') lp_0_y = assert_warns(DeprecationWarning, lp_0.fit, X, y).transduction_ assert_array_equal(lp_default_y, lp_0_y) From 31b1be1ad8b6ff8bf4419c150552d78843c368af Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 21 Jun 2017 10:24:51 +1000 Subject: [PATCH 15/34] DOC Indent deprecation details --- sklearn/semi_supervised/label_propagation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 2a70800ae0b66..307c3389a4f8c 100644 --- 
a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -306,8 +306,8 @@ class LabelPropagation(BaseLabelPropagation): Clamping factor. .. deprecated:: 0.19 - This parameter will be removed in 0.21. - 'alpha' is fixed to zero in 'LabelPropagation'. + This parameter will be removed in 0.21. + 'alpha' is fixed to zero in 'LabelPropagation'. max_iter : float Change maximum number of iterations allowed From 6d8cefc119d401e5e382819d74cfdd3774a9d06a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 21 Jun 2017 10:26:08 +1000 Subject: [PATCH 16/34] DOC wording --- sklearn/semi_supervised/label_propagation.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 307c3389a4f8c..85a5144d921fc 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -412,8 +412,9 @@ class LabelSpreading(BaseLabelPropagation): parameter for knn kernel alpha : float - Clamping factor [0, 1], it specifies the relative amount of the - information from its neighbors and its initial label information. + Clamping factor. A value in [0, 1] that specifies the relative amount + that an instance should adopt the information from its neighbors as + opposed to its initial label. alpha=0 means keeping the initial label information; alpha=1 means replacing all initial information. 
From 6a8ba661753159df7d504a7b0ccabbeb90f03b6d Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 21 Jun 2017 18:38:01 +1000 Subject: [PATCH 17/34] Update docs --- doc/modules/label_propagation.rst | 4 ++-- examples/semi_supervised/plot_label_propagation_structure.py | 2 +- sklearn/semi_supervised/label_propagation.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/label_propagation.rst b/doc/modules/label_propagation.rst index eddc34b7a8c7c..1aba742723f01 100644 --- a/doc/modules/label_propagation.rst +++ b/doc/modules/label_propagation.rst @@ -52,8 +52,8 @@ differ in modifications to the similarity matrix that graph and the clamping effect on the label distributions. Clamping allows the algorithm to change the weight of the true ground labeled data to some degree. The :class:`LabelPropagation` algorithm performs hard -clamping of input labels, which means :math:`\alpha=1`. This clamping factor -can be relaxed, to say :math:`\alpha=0.8`, which means that we will always +clamping of input labels, which means :math:`\alpha=0`. This clamping factor +can be relaxed, to say :math:`\alpha=0.2`, which means that we will always retain 80 percent of our original label distribution, but the algorithm gets to change its confidence of the distribution within 20 percent. 
diff --git a/examples/semi_supervised/plot_label_propagation_structure.py b/examples/semi_supervised/plot_label_propagation_structure.py index 7cc15d73f1b89..95f19ec108e82 100644 --- a/examples/semi_supervised/plot_label_propagation_structure.py +++ b/examples/semi_supervised/plot_label_propagation_structure.py @@ -30,7 +30,7 @@ # ############################################################################# # Learn with LabelSpreading -label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=1.0) +label_spread = label_propagation.LabelSpreading(kernel='knn', alpha=0.2) label_spread.fit(X, labels) # ############################################################################# diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 85a5144d921fc..7c65b12622b0d 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -17,8 +17,8 @@ The algorithm tries to learn distributions of labels over the dataset. In the "Hard Clamp" mode, the true ground labels are never allowed to change. They are clamped into position. In the "Soft Clamp" mode, they are allowed some - wiggle room, but some alpha of their original value will always be retained. - Hard clamp is the same as soft clamping with alpha set to 1. + wiggle room, but (1 - alpha) of their original value will be retained. + Hard clamp is the same as soft clamping with alpha set to 0. Kernel: A function which projects a vector into some higher dimensional space. This From 6ee6dd10541474da8bcabd8781a326fbdf6337dc Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Sun, 25 Jun 2017 14:58:04 +0200 Subject: [PATCH 18/34] Change to the one true implementation. 
--- sklearn/semi_supervised/label_propagation.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 7c65b12622b0d..7888578362b88 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -245,8 +245,11 @@ def fit(self, X, y): alpha = 0 y = np.asarray(y) unlabeled = y == -1 - clamp_weights = np.ones((n_samples, 1)) - clamp_weights[~unlabeled, 0] = alpha + # clamp_weights = np.ones((n_samples, 1)) + # clamp_weights[~unlabeled, 0] = alpha + + # TODO TESTING + clamp_weights = alpha * np.ones((n_samples, 1)) # initialize distributions self.label_distributions_ = np.zeros((n_samples, n_classes)) @@ -256,7 +259,8 @@ def fit(self, X, y): y_static = np.copy(self.label_distributions_) if alpha > 0.: y_static *= 1 - alpha - y_static[unlabeled] = 0 + # TODO TESTING + # y_static[unlabeled] = 0 l_previous = np.zeros((self.X_.shape[0], n_classes)) From 4d7433bf2c50ef516eebcbbcca9d1807d409176f Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Mon, 26 Jun 2017 00:12:11 +0200 Subject: [PATCH 19/34] Add sanity-checked impl. 
of Label{Propagation,Spreading} --- sklearn/semi_supervised/label_propagation.py | 24 ++++++---- .../tests/test_label_propagation.py | 48 +++++++++++++++++-- 2 files changed, 60 insertions(+), 12 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 7888578362b88..320abaaf937f5 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -241,15 +241,16 @@ def fit(self, X, y): n_samples, n_classes = len(y), len(classes) alpha = self.alpha - if alpha is None: - alpha = 0 y = np.asarray(y) unlabeled = y == -1 - # clamp_weights = np.ones((n_samples, 1)) - # clamp_weights[~unlabeled, 0] = alpha - # TODO TESTING - clamp_weights = alpha * np.ones((n_samples, 1)) + if alpha is None: + # LabelPropagation + clamp_weights = np.ones((n_samples, 1)) + clamp_weights[~unlabeled, 0] = 0.0 + else: + # LabelSpreading + clamp_weights = alpha * np.ones((n_samples, 1)) # initialize distributions self.label_distributions_ = np.zeros((n_samples, n_classes)) @@ -257,10 +258,12 @@ def fit(self, X, y): self.label_distributions_[y == label, classes == label] = 1 y_static = np.copy(self.label_distributions_) - if alpha > 0.: + if alpha is None: + # LabelPropagation + y_static[unlabeled] = 0 + else: + # LabelSpreading y_static *= 1 - alpha - # TODO TESTING - # y_static[unlabeled] = 0 l_previous = np.zeros((self.X_.shape[0], n_classes)) @@ -272,6 +275,7 @@ def fit(self, X, y): l_previous = self.label_distributions_ self.label_distributions_ = safe_sparse_dot( graph_matrix, self.label_distributions_) + # clamp self.label_distributions_ = np.multiply( clamp_weights, self.label_distributions_) + y_static @@ -279,6 +283,7 @@ def fit(self, X, y): normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] self.label_distributions_ /= normalizer + # set the transduction item transduction = self.classes_[np.argmax(self.label_distributions_, axis=1)] @@ -389,6 +394,7 @@ 
def fit(self, X, y): "alpha is deprecated since 0.19 and will be removed in 0.21.", DeprecationWarning ) + self.alpha = None return super(LabelPropagation, self).fit(X, y) diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 0fbc5261d7925..433cfae5e26db 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -66,11 +66,53 @@ def test_predict_proba(): def test_alpha_deprecation(): X, y = make_classification(n_samples=100) y[::3] = -1 - # Using kernel=knn as rbf appears to result in exp underflow - lp_default = label_propagation.LabelPropagation(kernel='knn') + + lp_default = label_propagation.LabelPropagation(kernel='rbf', gamma=0.1) lp_default_y = assert_no_warnings(lp_default.fit, X, y).transduction_ - lp_0 = label_propagation.LabelPropagation(alpha=0, kernel='knn') + lp_0 = label_propagation.LabelPropagation(alpha=0, kernel='rbf', gamma=0.1) lp_0_y = assert_warns(DeprecationWarning, lp_0.fit, X, y).transduction_ assert_array_equal(lp_default_y, lp_0_y) + + +def test_label_spreading(): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + y[::3] = -1 + clf = label_propagation.LabelSpreading().fit(X, y) + # adopting notation from Zhou et al (2004): + S = clf._build_graph() + Y = np.zeros((len(y), n_classes + 1)) + Y[np.arange(len(y)), y] = 1 + Y = Y[:, :-1] + for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]: + expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y) + expected /= expected.sum(axis=1)[:, np.newaxis] + clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha) + clf.fit(X, y) + assert_array_almost_equal(expected, clf.label_distributions_, 4) + + +def test_label_propagation(): + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + y[::3] = -1 + Y = np.zeros((len(y), n_classes + 1)) + 
Y[np.arange(len(y)), y] = 1 + unlabelled_idx = Y[:, (-1,)].nonzero()[0] + labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0] + + clf = label_propagation.LabelPropagation(max_iter=10000, gamma=0.1).fit(X, y) + T_bar = clf._build_graph() + Tuu = T_bar[np.meshgrid(unlabelled_idx, unlabelled_idx, indexing='ij')] + Tul = T_bar[np.meshgrid(unlabelled_idx, labelled_idx, indexing='ij')] + Y = Y[:, :-1] + Y_l = Y[labelled_idx, :] + Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l) + + expected = Y.copy() + expected[unlabelled_idx, :] = Y_u + expected /= expected.sum(axis=1)[:, np.newaxis] + + assert_array_almost_equal(expected, clf.label_distributions_, 4) From 3783ef35e61baeb599edb3721be0bd9d57af067a Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Mon, 26 Jun 2017 22:57:19 +0200 Subject: [PATCH 20/34] Raise ValueError if alpha is invalid in LabelSpreading. --- sklearn/semi_supervised/label_propagation.py | 3 +++ .../semi_supervised/tests/test_label_propagation.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 320abaaf937f5..385a965effff0 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -484,6 +484,9 @@ class LabelSpreading(BaseLabelPropagation): def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=1): + if alpha <= 0.0 or alpha >= 1.0: + raise ValueError('alpha must be inside the open interval (0, 1)') + # this one has different base parameters super(LabelSpreading, self).__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 433cfae5e26db..e73266d1aa607 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -3,6 
+3,9 @@ import numpy as np from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_warns +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_no_warnings from sklearn.semi_supervised import label_propagation from sklearn.metrics.pairwise import rbf_kernel from sklearn.datasets import make_classification @@ -116,3 +119,10 @@ def test_label_propagation(): expected /= expected.sum(axis=1)[:, np.newaxis] assert_array_almost_equal(expected, clf.label_distributions_, 4) + + +def test_valid_alpha(): + assert_raises(ValueError, label_propagation.LabelSpreading, alpha=-0.1) + assert_raises(ValueError, label_propagation.LabelSpreading, alpha=0) + assert_raises(ValueError, label_propagation.LabelSpreading, alpha=1) + assert_raises(ValueError, label_propagation.LabelSpreading, alpha=1.1) From 5cb7983d93df496edd96dc5e5795e8eaf3549483 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Mon, 26 Jun 2017 22:59:17 +0200 Subject: [PATCH 21/34] Add a normalizing step before clamping to LabelPropagation. --- sklearn/semi_supervised/label_propagation.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 385a965effff0..631c0d53c914e 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -276,6 +276,11 @@ def fit(self, X, y): self.label_distributions_ = safe_sparse_dot( graph_matrix, self.label_distributions_) + if alpha is None: + # LabelPropagation + normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + self.label_distributions_ /= normalizer + # clamp self.label_distributions_ = np.multiply( clamp_weights, self.label_distributions_) + y_static From d5643d89550c85626b829bf89e06476ee5a487c8 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 29 Jun 2017 13:10:03 +0200 Subject: [PATCH 22/34] Fix flake8 errors. 
--- sklearn/semi_supervised/label_propagation.py | 3 ++- sklearn/semi_supervised/tests/test_label_propagation.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 631c0d53c914e..a36e7e449a431 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -278,7 +278,8 @@ def fit(self, X, y): if alpha is None: # LabelPropagation - normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] + normalizer = np.sum( + self.label_distributions_, axis=1)[:, np.newaxis] self.label_distributions_ /= normalizer # clamp diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index e73266d1aa607..ae5975894e33d 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -81,7 +81,8 @@ def test_alpha_deprecation(): def test_label_spreading(): n_classes = 2 - X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + X, y = make_classification(n_classes=n_classes, n_samples=200, + random_state=0) y[::3] = -1 clf = label_propagation.LabelSpreading().fit(X, y) # adopting notation from Zhou et al (2004): @@ -99,14 +100,16 @@ def test_label_spreading(): def test_label_propagation(): n_classes = 2 - X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) + X, y = make_classification(n_classes=n_classes, n_samples=200, + random_state=0) y[::3] = -1 Y = np.zeros((len(y), n_classes + 1)) Y[np.arange(len(y)), y] = 1 unlabelled_idx = Y[:, (-1,)].nonzero()[0] labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0] - clf = label_propagation.LabelPropagation(max_iter=10000, gamma=0.1).fit(X, y) + clf = label_propagation.LabelPropagation(max_iter=10000, + gamma=0.1).fit(X, y) T_bar = clf._build_graph() Tuu = 
T_bar[np.meshgrid(unlabelled_idx, unlabelled_idx, indexing='ij')] Tul = T_bar[np.meshgrid(unlabelled_idx, labelled_idx, indexing='ij')] From caf865500f67f369c974643da9522a35b4d907ba Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 29 Jun 2017 18:41:33 +0200 Subject: [PATCH 23/34] Remove duplicate imports. --- sklearn/semi_supervised/tests/test_label_propagation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index ae5975894e33d..134d0e7c57032 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -9,7 +9,6 @@ from sklearn.semi_supervised import label_propagation from sklearn.metrics.pairwise import rbf_kernel from sklearn.datasets import make_classification -from sklearn.utils.testing import assert_warns, assert_no_warnings from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal From ae234557b678b93d38b35f8a42c33f4a90a439df Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 29 Jun 2017 21:45:02 +0200 Subject: [PATCH 24/34] DOC Update What's New. --- doc/whats_new.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 3d3bae79c8641..0f6a6b4e25cc9 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -441,12 +441,16 @@ Bug fixes in :class:`decomposition.PCA`, :class:`decomposition.RandomizedPCA` and :class:`decomposition.IncrementalPCA`. - :issue:`9105` by `Hanmin Qin `_. + :issue:`9105` by `Hanmin Qin `_. + + - Fix :class:`semi_supervised.BaseLabelPropagation` to correctly implement + ``LabelPropagation`` and ``LabelSpreading`` as done in the referenced + papers. :class:`semi_supervised.LabelPropagation` now always does hard + clamping. Its ``alpha`` parameter now defaults to ``None`` and is + deprecated to be removed in 0.21. 
:issue:`6727` :issue:`3550` issue:`5770` + by :user:`Andre Ambrosio Boechat `, :user:`Utkarsh Upadhyay + `, and `Joel Nothman`_. - - Fix :class:`semi_supervised.LabelPropagation` to always do hard clamping. - Its ``alpha`` parameter now defaults to 0 and the parameter is deprecated - to be removed in version 0.21. :issue:`6727` by - :user:`Andre Ambrosio Boechat ` and `Joel Nothman`_. API changes summary ------------------- From 1f70682d3fe91c21dc53a9cd15c8800a921e676d Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 29 Jun 2017 23:25:24 +0200 Subject: [PATCH 25/34] Specify alpha's value in the error. --- sklearn/semi_supervised/label_propagation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index a36e7e449a431..095713e00504b 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -491,7 +491,8 @@ def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=1): if alpha <= 0.0 or alpha >= 1.0: - raise ValueError('alpha must be inside the open interval (0, 1)') + raise ValueError('alpha=%s is invalid: it must be inside ' + 'the open interval (0, 1)' % alpha) # this one has different base parameters super(LabelSpreading, self).__init__(kernel=kernel, gamma=gamma, From 2705d732a4328c13e408d1775bfcce26ee7f2a56 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Thu, 29 Jun 2017 23:27:58 +0200 Subject: [PATCH 26/34] Tidy up tests. Add a test and add references, where needed. 
--- .../tests/test_label_propagation.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 134d0e7c57032..ee30265c1c5f5 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -78,7 +78,7 @@ def test_alpha_deprecation(): assert_array_equal(lp_default_y, lp_0_y) -def test_label_spreading(): +def test_label_spreading_closed_form(): n_classes = 2 X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) @@ -97,7 +97,7 @@ def test_label_spreading(): assert_array_almost_equal(expected, clf.label_distributions_, 4) -def test_label_propagation(): +def test_label_propagation_closed_form(): n_classes = 2 X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) @@ -109,6 +109,7 @@ def test_label_propagation(): clf = label_propagation.LabelPropagation(max_iter=10000, gamma=0.1).fit(X, y) + # adopting notation from Zhu et al 2002 T_bar = clf._build_graph() Tuu = T_bar[np.meshgrid(unlabelled_idx, unlabelled_idx, indexing='ij')] Tul = T_bar[np.meshgrid(unlabelled_idx, labelled_idx, indexing='ij')] @@ -128,3 +129,14 @@ def test_valid_alpha(): assert_raises(ValueError, label_propagation.LabelSpreading, alpha=0) assert_raises(ValueError, label_propagation.LabelSpreading, alpha=1) assert_raises(ValueError, label_propagation.LabelSpreading, alpha=1.1) + + +def test_clamping(): + X = np.array([[1., 0.], [0., 1.], [1., 2.5]]) + y = np.array([0, 1, -1]) + mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=5000) + mdl.fit(X, y) + + # this should converge quickly: + assert mdl.n_iter_ < 10 + assert_array_equal(mdl.predict(X), [0, 1, 1]) From a56f985d9f548d8f6185c2cb085808528c2b7abb Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Fri, 30 Jun 2017 13:18:05 +0200 Subject: [PATCH 27/34] Add comment to 
non-regression test. --- sklearn/semi_supervised/tests/test_label_propagation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index ee30265c1c5f5..5406b10c9fd2c 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -131,7 +131,8 @@ def test_valid_alpha(): assert_raises(ValueError, label_propagation.LabelSpreading, alpha=1.1) -def test_clamping(): +def test_convergence_speed(): + # This is a non-regression test for #5774 X = np.array([[1., 0.], [0., 1.], [1., 2.5]]) y = np.array([0, 1, -1]) mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=5000) From 37b0dee5e9c983ca2922c7e93ce802f3892a0744 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Sat, 1 Jul 2017 14:41:07 +0200 Subject: [PATCH 28/34] Fix documentation. --- sklearn/semi_supervised/label_propagation.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 095713e00504b..12af7a3bb8a92 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -14,11 +14,12 @@ Model Features -------------- Label clamping: - The algorithm tries to learn distributions of labels over the dataset. In the - "Hard Clamp" mode, the true ground labels are never allowed to change. They - are clamped into position. In the "Soft Clamp" mode, they are allowed some - wiggle room, but (1 - alpha) of their original value will be retained. - Hard clamp is the same as soft clamping with alpha set to 0. + The algorithm tries to learn distributions of labels over the dataset given + label assignments over an initial subset. 
In one variant, the algorithm does + not allow for any errors in the initial assignment (hard-clamping) while + in another variant, the algorithm allows for some wiggle room for the initial + assignments, allowing them to change by a fraction alpha in each iteration + (soft-clamping). Kernel: A function which projects a vector into some higher dimensional space. This From 8fd2dddc2ca16d578236cba65faefe175c84c21d Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Sat, 1 Jul 2017 14:44:43 +0200 Subject: [PATCH 29/34] Move check for alpha into fit from __init__. --- sklearn/semi_supervised/label_propagation.py | 7 +++---- .../semi_supervised/tests/test_label_propagation.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 12af7a3bb8a92..8f18f8ce30686 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -242,6 +242,9 @@ def fit(self, X, y): n_samples, n_classes = len(y), len(classes) alpha = self.alpha + if alpha is not None and (alpha <= 0.0 or alpha >= 1.0): + raise ValueError('alpha=%s is invalid: it must be inside ' + 'the open interval (0, 1)' % alpha) y = np.asarray(y) unlabeled = y == -1 @@ -491,10 +494,6 @@ class LabelSpreading(BaseLabelPropagation): def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=1): - if alpha <= 0.0 or alpha >= 1.0: - raise ValueError('alpha=%s is invalid: it must be inside ' - 'the open interval (0, 1)' % alpha) - # this one has different base parameters super(LabelSpreading, self).__init__(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 5406b10c9fd2c..7286f79d63cef 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ 
b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -125,10 +125,14 @@ def test_label_propagation_closed_form(): def test_valid_alpha(): - assert_raises(ValueError, label_propagation.LabelSpreading, alpha=-0.1) - assert_raises(ValueError, label_propagation.LabelSpreading, alpha=0) - assert_raises(ValueError, label_propagation.LabelSpreading, alpha=1) - assert_raises(ValueError, label_propagation.LabelSpreading, alpha=1.1) + n_classes = 2 + X, y = make_classification(n_classes=n_classes, n_samples=200, + random_state=0) + for alpha in [-0.1, 0, 1, 1.1]: + assert_raises(ValueError, + lambda **kwargs: + label_propagation.LabelSpreading(**kwargs).fit(X, y), + alpha=alpha) def test_convergence_speed(): From 77174e868dd9e823821ceb39ac19672a9022bccb Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Sun, 2 Jul 2017 11:08:06 +0200 Subject: [PATCH 30/34] Fix corner case of LabelSpreading with alpha=None. --- sklearn/semi_supervised/label_propagation.py | 14 ++++++++++---- .../tests/test_label_propagation.py | 4 ++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 8f18f8ce30686..61a42380e9254 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -242,13 +242,14 @@ def fit(self, X, y): n_samples, n_classes = len(y), len(classes) alpha = self.alpha - if alpha is not None and (alpha <= 0.0 or alpha >= 1.0): + if self.variant == 'spreading' and \ + (alpha is None or alpha <= 0.0 or alpha >= 1.0): raise ValueError('alpha=%s is invalid: it must be inside ' 'the open interval (0, 1)' % alpha) y = np.asarray(y) unlabeled = y == -1 - if alpha is None: + if self.variant == 'propagation': # LabelPropagation clamp_weights = np.ones((n_samples, 1)) clamp_weights[~unlabeled, 0] = 0.0 @@ -262,7 +263,7 @@ def fit(self, X, y): self.label_distributions_[y == label, classes == label] = 1 y_static = 
np.copy(self.label_distributions_) - if alpha is None: + if self.variant == 'propagation': # LabelPropagation y_static[unlabeled] = 0 else: @@ -280,7 +281,7 @@ def fit(self, X, y): self.label_distributions_ = safe_sparse_dot( graph_matrix, self.label_distributions_) - if alpha is None: + if alpha == 'propagation': # LabelPropagation normalizer = np.sum( self.label_distributions_, axis=1)[:, np.newaxis] @@ -376,6 +377,9 @@ class LabelPropagation(BaseLabelPropagation): -------- LabelSpreading : Alternate label propagation strategy more robust to noise """ + + variant = 'propagation' + def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=None, max_iter=30, tol=1e-3, n_jobs=1): super(LabelPropagation, self).__init__( @@ -491,6 +495,8 @@ class LabelSpreading(BaseLabelPropagation): LabelPropagation : Unregularized graph based semi-supervised learning """ + variant = 'spreading' + def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=1): diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 7286f79d63cef..3d5bd21a89110 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -128,10 +128,10 @@ def test_valid_alpha(): n_classes = 2 X, y = make_classification(n_classes=n_classes, n_samples=200, random_state=0) - for alpha in [-0.1, 0, 1, 1.1]: + for alpha in [-0.1, 0, 1, 1.1, None]: assert_raises(ValueError, lambda **kwargs: - label_propagation.LabelSpreading(**kwargs).fit(X, y), + label_propagation.LabelSpreading(**kwargs).fit(X, y), alpha=alpha) From 81c0964b6f59f7473edb648dcfecc1074b5d9251 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Sun, 2 Jul 2017 12:03:41 +0200 Subject: [PATCH 31/34] alpha -> self.variant --- sklearn/semi_supervised/label_propagation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 61a42380e9254..ec8365848aaf6 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -281,8 +281,7 @@ def fit(self, X, y): self.label_distributions_ = safe_sparse_dot( graph_matrix, self.label_distributions_) - if alpha == 'propagation': - # LabelPropagation + if self.variant == 'propagation': normalizer = np.sum( self.label_distributions_, axis=1)[:, np.newaxis] self.label_distributions_ /= normalizer From 8f21418a1000fb7860096ed5f247ac89e91a35c4 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Sun, 2 Jul 2017 17:43:09 +0200 Subject: [PATCH 32/34] Make Whats_new more explicit. --- doc/whats_new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 0f6a6b4e25cc9..6fd3dec2cfb2b 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -446,7 +446,7 @@ Bug fixes - Fix :class:`semi_supervised.BaseLabelPropagation` to correctly implement ``LabelPropagation`` and ``LabelSpreading`` as done in the referenced papers. :class:`semi_supervised.LabelPropagation` now always does hard - clamping. Its ``alpha`` parameter now defaults to ``None`` and is + clamping. Its ``alpha`` parameter has no effect and is deprecated to be removed in 0.21. :issue:`6727` :issue:`3550` issue:`5770` by :user:`Andre Ambrosio Boechat `, :user:`Utkarsh Upadhyay `, and `Joel Nothman`_. From 105bcf73fca3bb10462e7832e191f1964d8aadf5 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Sun, 2 Jul 2017 17:44:08 +0200 Subject: [PATCH 33/34] Simplify impl. of Label{Propagation,Spreading}. 
--- sklearn/semi_supervised/label_propagation.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index ec8365848aaf6..46e40d29a6421 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -249,14 +249,6 @@ def fit(self, X, y): y = np.asarray(y) unlabeled = y == -1 - if self.variant == 'propagation': - # LabelPropagation - clamp_weights = np.ones((n_samples, 1)) - clamp_weights[~unlabeled, 0] = 0.0 - else: - # LabelSpreading - clamp_weights = alpha * np.ones((n_samples, 1)) - # initialize distributions self.label_distributions_ = np.zeros((n_samples, n_classes)) for label in classes: @@ -273,6 +265,7 @@ def fit(self, X, y): l_previous = np.zeros((self.X_.shape[0], n_classes)) remaining_iter = self.max_iter + unlabeled = unlabeled[:, np.newaxis] if sparse.isspmatrix(graph_matrix): graph_matrix = graph_matrix.tocsr() while (_not_converged(self.label_distributions_, l_previous, self.tol) @@ -285,10 +278,13 @@ def fit(self, X, y): normalizer = np.sum( self.label_distributions_, axis=1)[:, np.newaxis] self.label_distributions_ /= normalizer - - # clamp - self.label_distributions_ = np.multiply( - clamp_weights, self.label_distributions_) + y_static + self.label_distributions_ = np.where(unlabeled, + self.label_distributions_, + y_static) + else: + # clamp + self.label_distributions_ = np.multiply( + alpha, self.label_distributions_) + y_static remaining_iter -= 1 normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis] From f8fa824999ebb8b40314a824edf5c7d916237935 Mon Sep 17 00:00:00 2001 From: Utkarsh Upadhyay Date: Mon, 3 Jul 2017 16:03:17 +0200 Subject: [PATCH 34/34] variant -> _variant. 
--- sklearn/semi_supervised/label_propagation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/semi_supervised/label_propagation.py b/sklearn/semi_supervised/label_propagation.py index 46e40d29a6421..ab0dd64bf81ea 100644 --- a/sklearn/semi_supervised/label_propagation.py +++ b/sklearn/semi_supervised/label_propagation.py @@ -242,7 +242,7 @@ def fit(self, X, y): n_samples, n_classes = len(y), len(classes) alpha = self.alpha - if self.variant == 'spreading' and \ + if self._variant == 'spreading' and \ (alpha is None or alpha <= 0.0 or alpha >= 1.0): raise ValueError('alpha=%s is invalid: it must be inside ' 'the open interval (0, 1)' % alpha) @@ -255,7 +255,7 @@ def fit(self, X, y): self.label_distributions_[y == label, classes == label] = 1 y_static = np.copy(self.label_distributions_) - if self.variant == 'propagation': + if self._variant == 'propagation': # LabelPropagation y_static[unlabeled] = 0 else: @@ -274,7 +274,7 @@ def fit(self, X, y): self.label_distributions_ = safe_sparse_dot( graph_matrix, self.label_distributions_) - if self.variant == 'propagation': + if self._variant == 'propagation': normalizer = np.sum( self.label_distributions_, axis=1)[:, np.newaxis] self.label_distributions_ /= normalizer @@ -373,7 +373,7 @@ class LabelPropagation(BaseLabelPropagation): LabelSpreading : Alternate label propagation strategy more robust to noise """ - variant = 'propagation' + _variant = 'propagation' def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=None, max_iter=30, tol=1e-3, n_jobs=1): @@ -490,7 +490,7 @@ class LabelSpreading(BaseLabelPropagation): LabelPropagation : Unregularized graph based semi-supervised learning """ - variant = 'spreading' + _variant = 'spreading' def __init__(self, kernel='rbf', gamma=20, n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=1):