From c199cb636eedc63598e8c8e47d7aa19b8a37d1b1 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Thu, 2 May 2019 12:58:54 -0400 Subject: [PATCH 01/14] change sample_weight to sample_weight_arr --- sklearn/neighbors/binary_tree.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index 057f869fdefba..43866225474ad 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1138,7 +1138,7 @@ cdef class BinaryTree: int(self.n_splits), int(self.n_calls), self.dist_metric, - self.sample_weight) + self.sample_weight_arr) def __setstate__(self, state): """ From 3933e9907fb4cae3214da067b40880c035a0af97 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Thu, 2 May 2019 20:28:38 -0400 Subject: [PATCH 02/14] add a test --- sklearn/neighbors/tests/test_kde.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 7d72c9f2cb8f2..5438047a269da 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -223,3 +223,20 @@ def test_pickling(tmpdir): scores_pickled = kde.score_samples(X) assert_allclose(scores, scores_pickled) + +def test_pickling_with_sample_weights(tmpdir): + # Test to see if KernelDensity object is pickled if supplied with sample weights + + kde = KernelDensity() + data = np.reshape([1., 2., 3.], (-1, 1)) + kde.fit(data, sample_weight=[0.1, 0.2, 0.3]) + + X = np.reshape([1.1, 2.1], (-1, 1)) + scores = kde.score_samples(X) + + file_path = str(tmpdir.join('dump_with_sample_weights.pkl')) + _joblib.dump(kde, file_path) + kde = _joblib.load(file_path) + scores_pickled = kde.score_samples(X) + + assert_allclose(scores, scores_pickled) From 0476cf38f920d5bcef4d5dc31e7b0bab16e36695 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Thu, 2 May 2019 21:11:15 -0400 Subject: [PATCH 03/14] Fix 84 characters in a line --- sklearn/neighbors/tests/test_kde.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 5438047a269da..0ce95a56891cb 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -225,7 +225,8 @@ def test_pickling(tmpdir): assert_allclose(scores, scores_pickled) def test_pickling_with_sample_weights(tmpdir): - # Test to see if KernelDensity object is pickled if supplied with sample weights + # Test to see if KernelDensity object is pickled if supplied with + # sample weights. kde = KernelDensity() data = np.reshape([1., 2., 3.], (-1, 1)) From c136f2af07317234b80910e4c96f924f83f7f1e7 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Thu, 2 May 2019 21:19:40 -0400 Subject: [PATCH 04/14] add one more line between functions --- sklearn/neighbors/tests/test_kde.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 0ce95a56891cb..7c09834200e03 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -224,6 +224,7 @@ def test_pickling(tmpdir): assert_allclose(scores, scores_pickled) + def test_pickling_with_sample_weights(tmpdir): # Test to see if KernelDensity object is pickled if supplied with # sample weights. From b6c483a0f90633d7164641929564871f1debe023 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Thu, 2 May 2019 21:54:30 -0400 Subject: [PATCH 05/14] set sample_state_arr --- sklearn/neighbors/binary_tree.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index 43866225474ad..a16ba1088412d 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1164,7 +1164,7 @@ cdef class BinaryTree: self.dist_metric = state[11] self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') - self.sample_weight = state[12] + self.sample_weight_arr = state[12] def get_tree_stats(self): return (self.n_trims, self.n_leaves, self.n_splits) From 4baab0de2a6d8fc0b9f967acaea30f03be7f6700 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Thu, 2 May 2019 22:34:59 -0400 Subject: [PATCH 06/14] assign sample weights after unpickling --- sklearn/neighbors/binary_tree.pxi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index a16ba1088412d..194bfe41a19fe 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1166,6 +1166,10 @@ cdef class BinaryTree: == 'EuclideanDistance') self.sample_weight_arr = state[12] + if self.sample_weight_arr is not None: + self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) + self.sum_weight = np.sum(self.sample_weight) + def get_tree_stats(self): return (self.n_trims, self.n_leaves, self.n_splits) From 7e51d138f39dee0c72ebd87ffd0d660c9eecc588 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Thu, 2 May 2019 22:59:06 -0400 Subject: [PATCH 07/14] save sample_weight and sum_weight too --- sklearn/neighbors/binary_tree.pxi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index 194bfe41a19fe..3009595fef4d7 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1138,7 +1138,9 @@ cdef class BinaryTree: int(self.n_splits), int(self.n_calls), self.dist_metric, - self.sample_weight_arr) + self.sample_weight_arr, + self.sample_weight, + self.sum_weight) def __setstate__(self, state): """ @@ -1165,10 +1167,8 @@ cdef class BinaryTree: self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') self.sample_weight_arr = state[12] - - if self.sample_weight_arr is not None: - self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) - self.sum_weight = np.sum(self.sample_weight) + self.sample_weight = state[13] + self.sum_weight = state[14] def get_tree_stats(self): return (self.n_trims, self.n_leaves, self.n_splits) From 22281559162a3c315ccebf9680f2aef83dc5d86b Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Fri, 3 May 2019 00:36:48 -0400 Subject: [PATCH 08/14] set sample_weight and sum_weight after unpickling --- sklearn/neighbors/binary_tree.pxi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index 3009595fef4d7..194bfe41a19fe 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1138,9 +1138,7 @@ cdef class BinaryTree: int(self.n_splits), int(self.n_calls), self.dist_metric, - self.sample_weight_arr, - self.sample_weight, - self.sum_weight) + self.sample_weight_arr) def __setstate__(self, state): """ @@ -1167,8 +1165,10 @@ cdef class BinaryTree: self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') self.sample_weight_arr = state[12] - self.sample_weight = state[13] - self.sum_weight = state[14] + + if self.sample_weight_arr is not None: + self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) + self.sum_weight = np.sum(self.sample_weight) def get_tree_stats(self): return (self.n_trims, self.n_leaves, self.n_splits) From 1ce2496e1c0ccd9821ceba7b26a67c39480ecd79 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Fri, 3 May 2019 01:43:20 -0400 Subject: [PATCH 09/14] also handle the case when sample_weight is None --- sklearn/neighbors/binary_tree.pxi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index 194bfe41a19fe..ae2f182ed215e 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1169,6 +1169,9 @@ cdef class BinaryTree: if self.sample_weight_arr is not None: self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) self.sum_weight = np.sum(self.sample_weight) + else: + self.sample_weight = None + self.sum_weight = n_samples def get_tree_stats(self): return (self.n_trims, self.n_leaves, self.n_splits) From 9442dab264595cde152c55b5b501aa825f3323ef Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Fri, 3 May 2019 01:52:55 -0400 Subject: [PATCH 10/14] initialise n_samples --- sklearn/neighbors/binary_tree.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index ae2f182ed215e..c56fe63672024 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1166,6 +1166,7 @@ cdef class BinaryTree: == 'EuclideanDistance') self.sample_weight_arr = state[12] + n_samples = self.data.shape[0] if self.sample_weight_arr is not None: self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) self.sum_weight = np.sum(self.sample_weight) From 2128a24c4e154af933a0b896b11c916739d09b10 Mon Sep 17 00:00:00 2001 From: Aditya Vyas Date: Fri, 3 May 2019 16:25:26 -0400 Subject: [PATCH 11/14] check if sample weight has proper size --- sklearn/neighbors/binary_tree.pxi | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index c56fe63672024..edfe2d53154b9 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1130,6 +1130,7 @@ cdef class BinaryTree: self.idx_array_arr, self.node_data_arr, self.node_bounds_arr, + self.sample_weight_arr, int(self.leaf_size), int(self.n_levels), int(self.n_nodes), @@ -1137,8 +1138,7 @@ cdef class BinaryTree: int(self.n_leaves), int(self.n_splits), int(self.n_calls), - self.dist_metric, - self.sample_weight_arr) + self.dist_metric) def __setstate__(self, state): """ @@ -1148,27 +1148,27 @@ cdef class BinaryTree: self.idx_array_arr = state[1] self.node_data_arr = state[2] self.node_bounds_arr = state[3] + self.sample_weight_arr = state[4] self.data = get_memview_DTYPE_2D(self.data_arr) self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) self.node_data = get_memview_NodeData_1D(self.node_data_arr) self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr) + self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) - self.leaf_size = state[4] - self.n_levels = state[5] - self.n_nodes = state[6] - self.n_trims = state[7] - self.n_leaves = state[8] - self.n_splits = state[9] - self.n_calls = state[10] - self.dist_metric = state[11] + self.leaf_size = state[5] + self.n_levels = state[6] + self.n_nodes = state[7] + self.n_trims = state[8] + self.n_leaves = state[9] + self.n_splits = state[10] + self.n_calls = state[11] + self.dist_metric = state[12] self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') - self.sample_weight_arr = state[12] n_samples = self.data.shape[0] - if self.sample_weight_arr is not None: - self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) + if self.sample_weight.size == n_samples: self.sum_weight = np.sum(self.sample_weight) else: self.sample_weight = None From d2e4581bec83430e9417b81c4eae1fc1763019ba Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 13 May 2019 16:36:34 +0200 Subject: [PATCH 12/14] Restore backward compat with 0.20 pickle files + factor redundant code --- sklearn/neighbors/binary_tree.pxi | 101 +++++++++++++++--------------- 1 file changed, 52 insertions(+), 49 deletions(-) diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index edfe2d53154b9..50450fed01c1a 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1064,10 +1064,17 @@ cdef class BinaryTree: def __init__(self, data, leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): - self.data_arr = np.asarray(data, dtype=DTYPE, order='C') - self.data = get_memview_DTYPE_2D(self.data_arr) + # validate data + if data.size == 0: + raise ValueError("X is an empty array") + + if leaf_size < 1: + raise ValueError("leaf_size must be greater than or equal to 1") + n_samples = data.shape[0] + n_features = data.shape[1] + self.data_arr = np.asarray(data, dtype=DTYPE, order='C') self.leaf_size = leaf_size self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ @@ -1079,26 +1086,6 @@ cdef class BinaryTree: '{BinaryTree}'.format(metric=metric, **DOC_DICT)) - # validate data - if self.data.size == 0: - raise ValueError("X is an empty array") - - if leaf_size < 1: - raise ValueError("leaf_size must be greater than or equal to 1") - - n_samples = self.data.shape[0] - n_features = self.data.shape[1] - - - if sample_weight is not None: - self.sample_weight_arr = np.asarray(sample_weight, dtype=DTYPE, order='C') - self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) - self.sum_weight = np.sum(self.sample_weight) - else: - self.sample_weight = None - self.sum_weight = n_samples - - # determine number of levels in the tree, and from this # the number of nodes in the tree. This results in leaf nodes # with numbers of points between leaf_size and 2 * leaf_size @@ -1107,15 +1094,34 @@ cdef class BinaryTree: # allocate arrays for storage self.idx_array_arr = np.arange(n_samples, dtype=ITYPE) - self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) - self.node_data_arr = np.zeros(self.n_nodes, dtype=NodeData) - self.node_data = get_memview_NodeData_1D(self.node_data_arr) + + self._update_sample_weight(n_samples, sample_weight) + self._update_memviews() # Allocate tree-specific data allocate_data(self, self.n_nodes, n_features) self._recursive_build(0, 0, n_samples) + def _update_sample_weight(self, n_samples, sample_weight): + if sample_weight is not None: + self.sample_weight_arr = np.asarray( + sample_weight, dtype=DTYPE, order='C') + self.sample_weight = get_memview_DTYPE_1D( + self.sample_weight_arr) + self.sum_weight = np.sum(self.sample_weight) + else: + self.sample_weight = None + self.sample_weight_arr = np.empty(1, dtype=DTYPE, order='C') + self.sum_weight = n_samples + + def _update_memviews(self): + self.data = get_memview_DTYPE_2D(self.data_arr) + self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) + self.node_data = get_memview_NodeData_1D(self.node_data_arr) + self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr) + + def __reduce__(self): """ reduce method used for pickling @@ -1126,11 +1132,17 @@ cdef class BinaryTree: """ get state for pickling """ + if self.sample_weight is not None: + # pass the numpy array + sample_weight_arr = self.sample_weight_arr + else: + # pass None to avoid confusion with the empty place holder + # of size 1 from __cinit__ + sample_weight_arr = None return (self.data_arr, self.idx_array_arr, self.node_data_arr, self.node_bounds_arr, - self.sample_weight_arr, int(self.leaf_size), int(self.n_levels), int(self.n_nodes), @@ -1138,7 +1150,8 @@ cdef class BinaryTree: int(self.n_leaves), int(self.n_splits), int(self.n_calls), - self.dist_metric) + self.dist_metric, + sample_weight_arr) def __setstate__(self, state): """ @@ -1148,31 +1161,21 @@ cdef class BinaryTree: self.idx_array_arr = state[1] self.node_data_arr = state[2] self.node_bounds_arr = state[3] - self.sample_weight_arr = state[4] - - self.data = get_memview_DTYPE_2D(self.data_arr) - self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) - self.node_data = get_memview_NodeData_1D(self.node_data_arr) - self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr) - self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) + self.leaf_size = state[4] + self.n_levels = state[5] + self.n_nodes = state[6] + self.n_trims = state[7] + self.n_leaves = state[8] + self.n_splits = state[9] + self.n_calls = state[10] + self.dist_metric = state[11] + sample_weight_arr = state[12] - self.leaf_size = state[5] - self.n_levels = state[6] - self.n_nodes = state[7] - self.n_trims = state[8] - self.n_leaves = state[9] - self.n_splits = state[10] - self.n_calls = state[11] - self.dist_metric = state[12] self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') - - n_samples = self.data.shape[0] - if self.sample_weight.size == n_samples: - self.sum_weight = np.sum(self.sample_weight) - else: - self.sample_weight = None - self.sum_weight = n_samples + n_samples = self.data_arr.shape[0] + self._update_sample_weight(n_samples, sample_weight_arr) + self._update_memviews() def get_tree_stats(self): return (self.n_trims, self.n_leaves, self.n_splits) From 3995316c45ed2640bf963c1c3d090dc23b6843e6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 14 May 2019 07:58:39 +1000 Subject: [PATCH 13/14] DOC add what's new --- doc/whats_new/v0.21.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index b2d28f3efb0f2..24453de2b9a7c 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -7,7 +7,7 @@ Version 0.21.1 ============== -**May 2019** +**17 May 2019** This is a bug-fix release with some minor documentation improvements and @@ -24,6 +24,13 @@ Changelog ``Y == None``. :issue:`13864` by :user:`Paresh Mathur `. +:mod:`sklearn.neighbors` +...................... + +- |Fix| Fixed a bug in :class:`neighbors.KernelDensity` which could not be + restored from a pickle if ``sample_weight`` had been used. + :issue:`13772` by :user:`Aditya Vyas `. + .. _changes_0_21: From 5539ff8a80bd54abf5bd0044d2414d3326d7930e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 14 May 2019 09:01:20 +0200 Subject: [PATCH 14/14] Factorize KDE pickling tests via pytest.mark.parametrize --- sklearn/neighbors/tests/test_kde.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 7c09834200e03..e930f0fb95e5c 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -205,14 +205,15 @@ def test_kde_sample_weights(): assert_allclose(scores_scaled_weight, scores_weight) -def test_pickling(tmpdir): +@pytest.mark.parametrize('sample_weight', [None, [0.1, 0.2, 0.3]]) +def test_pickling(tmpdir, sample_weight): # Make sure that predictions are the same before and after pickling. Used # to be a bug because sample_weights wasn't pickled and the resulting tree # would miss some info. kde = KernelDensity() data = np.reshape([1., 2., 3.], (-1, 1)) - kde.fit(data) + kde.fit(data, sample_weight=sample_weight) X = np.reshape([1.1, 2.1], (-1, 1)) scores = kde.score_samples(X) @@ -223,22 +224,3 @@ def test_pickling(tmpdir): scores_pickled = kde.score_samples(X) assert_allclose(scores, scores_pickled) - - -def test_pickling_with_sample_weights(tmpdir): - # Test to see if KernelDensity object is pickled if supplied with - # sample weights. - - kde = KernelDensity() - data = np.reshape([1., 2., 3.], (-1, 1)) - kde.fit(data, sample_weight=[0.1, 0.2, 0.3]) - - X = np.reshape([1.1, 2.1], (-1, 1)) - scores = kde.score_samples(X) - - file_path = str(tmpdir.join('dump_with_sample_weights.pkl')) - _joblib.dump(kde, file_path) - kde = _joblib.load(file_path) - scores_pickled = kde.score_samples(X) - - assert_allclose(scores, scores_pickled)