scikit-learn · ogrisel · May 14, 2019 · May 2, 2019 · May 3, 2019 · May 3, 2019
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
@@ -7,7 +7,7 @@
 Version 0.21.1
 ==============
 
-**May 2019**
+**17 May 2019**
 
 
 This is a bug-fix release with some minor documentation improvements and
@@ -24,6 +24,13 @@ Changelog
   ``Y == None``.
   :issue:`13864` by :user:`Paresh Mathur <rick2047>`.
 
+:mod:`sklearn.neighbors`
+........................
+
+- |Fix| Fixed a bug in :class:`neighbors.KernelDensity` which could not be
+  restored from a pickle if ``sample_weight`` had been used.
+  :issue:`13772` by :user:`Aditya Vyas <aditya1702>`.
+
 
 .. _changes_0_21:
 

diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi
@@ -1064,10 +1064,17 @@ cdef class BinaryTree:
 
     def __init__(self, data,
                  leaf_size=40, metric='minkowski', sample_weight=None, **kwargs):
-        self.data_arr = np.asarray(data, dtype=DTYPE, order='C')
-        self.data = get_memview_DTYPE_2D(self.data_arr)
+        # validate data
+        if data.size == 0:
+            raise ValueError("X is an empty array")
+
+        if leaf_size < 1:
+            raise ValueError("leaf_size must be greater than or equal to 1")
 
+        n_samples = data.shape[0]
+        n_features = data.shape[1]
 
+        self.data_arr = np.asarray(data, dtype=DTYPE, order='C')
         self.leaf_size = leaf_size
         self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)
         self.euclidean = (self.dist_metric.__class__.__name__
@@ -1079,26 +1086,6 @@ cdef class BinaryTree:
                              '{BinaryTree}'.format(metric=metric,
                                                    **DOC_DICT))
 
-        # validate data
-        if self.data.size == 0:
-            raise ValueError("X is an empty array")
-
-        if leaf_size < 1:
-            raise ValueError("leaf_size must be greater than or equal to 1")
-
-        n_samples = self.data.shape[0]
-        n_features = self.data.shape[1]
-
-
-        if sample_weight is not None:
-            self.sample_weight_arr = np.asarray(sample_weight, dtype=DTYPE, order='C')
-            self.sample_weight = get_memview_DTYPE_1D(self.sample_weight_arr)
-            self.sum_weight = np.sum(self.sample_weight)
-        else:
-            self.sample_weight = None
-            self.sum_weight = <DTYPE_t> n_samples
-
-
         # determine number of levels in the tree, and from this
         # the number of nodes in the tree.  This results in leaf nodes
         # with numbers of points between leaf_size and 2 * leaf_size
@@ -1107,15 +1094,34 @@ cdef class BinaryTree:
 
         # allocate arrays for storage
         self.idx_array_arr = np.arange(n_samples, dtype=ITYPE)
-        self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
-
         self.node_data_arr = np.zeros(self.n_nodes, dtype=NodeData)
-        self.node_data = get_memview_NodeData_1D(self.node_data_arr)
+
+        self._update_sample_weight(n_samples, sample_weight)
+        self._update_memviews()
 
         # Allocate tree-specific data
         allocate_data(self, self.n_nodes, n_features)
         self._recursive_build(0, 0, n_samples)
 
+    def _update_sample_weight(self, n_samples, sample_weight):
+        if sample_weight is not None:
+            self.sample_weight_arr = np.asarray(
+                sample_weight, dtype=DTYPE, order='C')
+            self.sample_weight = get_memview_DTYPE_1D(
+                self.sample_weight_arr)
+            self.sum_weight = np.sum(self.sample_weight)
+        else:
+            self.sample_weight = None
+            self.sample_weight_arr = np.empty(1, dtype=DTYPE, order='C')
+            self.sum_weight = <DTYPE_t> n_samples
+
+    def _update_memviews(self):
+        self.data = get_memview_DTYPE_2D(self.data_arr)
+        self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
+        self.node_data = get_memview_NodeData_1D(self.node_data_arr)
+        self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr)
+
+
     def __reduce__(self):
         """
         reduce method used for pickling
@@ -1126,6 +1132,13 @@ cdef class BinaryTree:
         """
         get state for pickling
         """
+        if self.sample_weight is not None:
+            # pass the numpy array
+            sample_weight_arr = self.sample_weight_arr
+        else:
+            # pass None to avoid confusion with the empty placeholder
+            # of size 1 from __cinit__
+            sample_weight_arr = None
         return (self.data_arr,
                 self.idx_array_arr,
                 self.node_data_arr,
@@ -1138,7 +1151,7 @@ cdef class BinaryTree:
                 int(self.n_splits),
                 int(self.n_calls),
                 self.dist_metric,
-                self.sample_weight)
+                sample_weight_arr)
 
     def __setstate__(self, state):
         """
@@ -1148,12 +1161,6 @@ cdef class BinaryTree:
         self.idx_array_arr = state[1]
         self.node_data_arr = state[2]
         self.node_bounds_arr = state[3]
-
-        self.data = get_memview_DTYPE_2D(self.data_arr)
-        self.idx_array = get_memview_ITYPE_1D(self.idx_array_arr)
-        self.node_data = get_memview_NodeData_1D(self.node_data_arr)
-        self.node_bounds = get_memview_DTYPE_3D(self.node_bounds_arr)
-
         self.leaf_size = state[4]
         self.n_levels = state[5]
         self.n_nodes = state[6]
@@ -1162,9 +1169,13 @@ cdef class BinaryTree:
         self.n_splits = state[9]
         self.n_calls = state[10]
         self.dist_metric = state[11]
+        sample_weight_arr = state[12]
+
         self.euclidean = (self.dist_metric.__class__.__name__
                           == 'EuclideanDistance')
-        self.sample_weight = state[12]
+        n_samples = self.data_arr.shape[0]
+        self._update_sample_weight(n_samples, sample_weight_arr)
+        self._update_memviews()
 
     def get_tree_stats(self):
         return (self.n_trims, self.n_leaves, self.n_splits)

diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py
@@ -205,14 +205,15 @@ def test_kde_sample_weights():
                     assert_allclose(scores_scaled_weight, scores_weight)
 
 
-def test_pickling(tmpdir):
+@pytest.mark.parametrize('sample_weight', [None, [0.1, 0.2, 0.3]])
+def test_pickling(tmpdir, sample_weight):
     # Make sure that predictions are the same before and after pickling. Used
     # to be a bug because sample_weights wasn't pickled and the resulting tree
     # would miss some info.
 
     kde = KernelDensity()
     data = np.reshape([1., 2., 3.], (-1, 1))
-    kde.fit(data)
+    kde.fit(data, sample_weight=sample_weight)
 
     X = np.reshape([1.1, 2.1], (-1, 1))
     scores = kde.score_samples(X)