diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index b2d28f3efb0f2..24453de2b9a7c 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -7,7 +7,7 @@ Version 0.21.1 ============== -**May 2019** +**17 May 2019** This is a bug-fix release with some minor documentation improvements and @@ -24,6 +24,13 @@ Changelog ``Y == None``. :issue:`13864` by :user:`Paresh Mathur `. +:mod:`sklearn.neighbors` +........................ + +- |Fix| Fixed a bug in :class:`neighbors.KernelDensity` which could not be + restored from a pickle if ``sample_weight`` had been used. + :issue:`13772` by :user:`Aditya Vyas `. + .. _changes_0_21: diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi index 057f869fdefba..50450fed01c1a 100755 --- a/sklearn/neighbors/binary_tree.pxi +++ b/sklearn/neighbors/binary_tree.pxi @@ -1064,10 +1064,17 @@ cdef class BinaryTree: def __init__(self, data, leaf_size=40, metric='minkowski', sample_weight=None, **kwargs): - self.data_arr = np.asarray(data, dtype=DTYPE, order='C') - self.data = get_memview_DTYPE_2D(self.data_arr) + # validate data + if data.size == 0: + raise ValueError("X is an empty array") + + if leaf_size < 1: + raise ValueError("leaf_size must be greater than or equal to 1") + n_samples = data.shape[0] + n_features = data.shape[1] + self.data_arr = np.asarray(data, dtype=DTYPE, order='C') self.leaf_size = leaf_size self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) self.euclidean = (self.dist_metric.__class__.__name__ @@ -1079,26 +1086,6 @@ cdef class BinaryTree: '{BinaryTree}'.format(metric=metric, **DOC_DICT)) - # validate data - if self.data.size == 0: - raise ValueError("X is an empty array") - - if leaf_size < 1: - raise ValueError("leaf_size must be greater than or equal to 1") - - n_samples = self.data.shape[0] - n_features = self.data.shape[1] - - - if sample_weight is not None: - self.sample_weight_arr = np.asarray(sample_weight, dtype=DTYPE, order='C') - 
self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr) - self.sum_weight = np.sum(self.sample_weight) - else: - self.sample_weight = None - self.sum_weight = n_samples - - # determine number of levels in the tree, and from this # the number of nodes in the tree. This results in leaf nodes # with numbers of points between leaf_size and 2 * leaf_size @@ -1107,15 +1094,34 @@ cdef class BinaryTree: # allocate arrays for storage self.idx_array_arr = np.arange(n_samples, dtype=ITYPE) - self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) - self.node_data_arr = np.zeros(self.n_nodes, dtype=NodeData) - self.node_data = get_memview_NodeData_1D(self.node_data_arr) + + self._update_sample_weight(n_samples, sample_weight) + self._update_memviews() # Allocate tree-specific data allocate_data(self, self.n_nodes, n_features) self._recursive_build(0, 0, n_samples) + def _update_sample_weight(self, n_samples, sample_weight): + if sample_weight is not None: + self.sample_weight_arr = np.asarray( + sample_weight, dtype=DTYPE, order='C') + self.sample_weight = get_memview_DTYPE_1D( + self.sample_weight_arr) + self.sum_weight = np.sum(self.sample_weight) + else: + self.sample_weight = None + self.sample_weight_arr = np.empty(1, dtype=DTYPE, order='C') + self.sum_weight = n_samples + + def _update_memviews(self): + self.data = get_memview_DTYPE_2D(self.data_arr) + self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) + self.node_data = get_memview_NodeData_1D(self.node_data_arr) + self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr) + + def __reduce__(self): """ reduce method used for pickling @@ -1126,6 +1132,13 @@ cdef class BinaryTree: """ get state for pickling """ + if self.sample_weight is not None: + # pass the numpy array + sample_weight_arr = self.sample_weight_arr + else: + # pass None to avoid confusion with the empty place holder + # of size 1 from __cinit__ + sample_weight_arr = None return (self.data_arr, self.idx_array_arr, self.node_data_arr, 
@@ -1138,7 +1151,7 @@ cdef class BinaryTree: int(self.n_splits), int(self.n_calls), self.dist_metric, - self.sample_weight) + sample_weight_arr) def __setstate__(self, state): """ @@ -1148,12 +1161,6 @@ cdef class BinaryTree: self.idx_array_arr = state[1] self.node_data_arr = state[2] self.node_bounds_arr = state[3] - - self.data = get_memview_DTYPE_2D(self.data_arr) - self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr) - self.node_data = get_memview_NodeData_1D(self.node_data_arr) - self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr) - self.leaf_size = state[4] self.n_levels = state[5] self.n_nodes = state[6] @@ -1162,9 +1169,13 @@ cdef class BinaryTree: self.n_splits = state[9] self.n_calls = state[10] self.dist_metric = state[11] + sample_weight_arr = state[12] + self.euclidean = (self.dist_metric.__class__.__name__ == 'EuclideanDistance') - self.sample_weight = state[12] + n_samples = self.data_arr.shape[0] + self._update_sample_weight(n_samples, sample_weight_arr) + self._update_memviews() def get_tree_stats(self): return (self.n_trims, self.n_leaves, self.n_splits) diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 7d72c9f2cb8f2..e930f0fb95e5c 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -205,14 +205,15 @@ def test_kde_sample_weights(): assert_allclose(scores_scaled_weight, scores_weight) -def test_pickling(tmpdir): +@pytest.mark.parametrize('sample_weight', [None, [0.1, 0.2, 0.3]]) +def test_pickling(tmpdir, sample_weight): # Make sure that predictions are the same before and after pickling. Used # to be a bug because sample_weights wasn't pickled and the resulting tree # would miss some info. kde = KernelDensity() data = np.reshape([1., 2., 3.], (-1, 1)) - kde.fit(data) + kde.fit(data, sample_weight=sample_weight) X = np.reshape([1.1, 2.1], (-1, 1)) scores = kde.score_samples(X)