Merge pull request #5 from SkuaD01/d01-18186-multiprocessing · SkuaD01/scikit-learn@bea07c0 · GitHub
Commit bea07c0
Merge pull request #5 from SkuaD01/d01-18186-multiprocessing
D01 18186 multiprocessing
2 parents: 7f05efa + cc73a98

File tree: 5 files changed, +235 −49 lines

sklearn/impute/_knn.py

Lines changed: 59 additions & 47 deletions
@@ -3,6 +3,8 @@
 # License: BSD 3 clause
 
 import numpy as np
+from multiprocessing import cpu_count
+from joblib import Parallel, delayed, effective_n_jobs
 
 from ._base import _BaseImputer
 from ..utils.validation import FLOAT_DTYPES
@@ -99,7 +101,7 @@ class KNNImputer(_BaseImputer):
     @_deprecate_positional_args
     def __init__(self, *, missing_values=np.nan, n_neighbors=5,
                  weights="uniform", metric="nan_euclidean", copy=True,
-                 add_indicator=False):
+                 add_indicator=False, n_jobs=cpu_count()):
         super().__init__(
             missing_values=missing_values,
             add_indicator=add_indicator
@@ -108,6 +110,7 @@ def __init__(self, *, missing_values=np.nan, n_neighbors=5,
         self.weights = weights
         self.metric = metric
         self.copy = copy
+        self.n_jobs = n_jobs
 
     def _calc_impute(self, dist_pot_donors, n_neighbors,
                      fit_X_col, mask_fit_X_col):
@@ -236,55 +239,63 @@ def transform(self, X):
         dist_idx_map = np.zeros(X.shape[0], dtype=int)
         dist_idx_map[row_missing_idx] = np.arange(row_missing_idx.shape[0])
 
+        def process_chunk_col(dist_chunk, start, row_missing_chunk, col):
+            if not valid_mask[col]:
+                # column was all missing during training
+                return
+
+            col_mask = mask[row_missing_chunk, col]
+            if not np.any(col_mask):
+                # column has no missing values
+                return
+
+            potential_donors_idx, = np.nonzero(non_missing_fix_X[:, col])
+
+            # receivers_idx are indices in X
+            receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]
+
+            # distances for samples that needed imputation for column
+            dist_subset = (dist_chunk[dist_idx_map[receivers_idx] - start]
+                           [:, potential_donors_idx])
+
+            # receivers with all nan distances impute with mean
+            all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
+            all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]
+
+            if all_nan_receivers_idx.size:
+                col_mean = np.ma.array(self._fit_X[:, col],
+                                       mask=mask_fit_X[:, col]).mean()
+                X[all_nan_receivers_idx, col] = col_mean
+
+                if len(all_nan_receivers_idx) == len(receivers_idx):
+                    # all receivers imputed with mean
+                    return
+
+            # receivers with at least one defined distance
+            receivers_idx = receivers_idx[~all_nan_dist_mask]
+            dist_subset = (dist_chunk[dist_idx_map[receivers_idx]
+                                      - start]
+                           [:, potential_donors_idx])
+
+            n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
+            value = self._calc_impute(
+                dist_subset,
+                n_neighbors,
+                self._fit_X[potential_donors_idx, col],
+                mask_fit_X[potential_donors_idx, col])
+            X[receivers_idx, col] = value
+
         def process_chunk(dist_chunk, start):
             row_missing_chunk = row_missing_idx[start:start + len(dist_chunk)]
 
             # Find and impute missing by column
-            for col in range(X.shape[1]):
-                if not valid_mask[col]:
-                    # column was all missing during training
-                    continue
-
-                col_mask = mask[row_missing_chunk, col]
-                if not np.any(col_mask):
-                    # column has no missing values
-                    continue
-
-                potential_donors_idx, = np.nonzero(non_missing_fix_X[:, col])
-
-                # receivers_idx are indices in X
-                receivers_idx = row_missing_chunk[np.flatnonzero(col_mask)]
-
-                # distances for samples that needed imputation for column
-                dist_subset = (dist_chunk[dist_idx_map[receivers_idx] - start]
-                               [:, potential_donors_idx])
-
-                # receivers with all nan distances impute with mean
-                all_nan_dist_mask = np.isnan(dist_subset).all(axis=1)
-                all_nan_receivers_idx = receivers_idx[all_nan_dist_mask]
-
-                if all_nan_receivers_idx.size:
-                    col_mean = np.ma.array(self._fit_X[:, col],
-                                           mask=mask_fit_X[:, col]).mean()
-                    X[all_nan_receivers_idx, col] = col_mean
-
-                    if len(all_nan_receivers_idx) == len(receivers_idx):
-                        # all receivers imputed with mean
-                        continue
-
-                # receivers with at least one defined distance
-                receivers_idx = receivers_idx[~all_nan_dist_mask]
-                dist_subset = (dist_chunk[dist_idx_map[receivers_idx]
-                                          - start]
-                               [:, potential_donors_idx])
-
-                n_neighbors = min(self.n_neighbors, len(potential_donors_idx))
-                value = self._calc_impute(
-                    dist_subset,
-                    n_neighbors,
-                    self._fit_X[potential_donors_idx, col],
-                    mask_fit_X[potential_donors_idx, col])
-                X[receivers_idx, col] = value
+            if effective_n_jobs(self.n_jobs) > 1:
+                generator = (delayed(process_chunk_col)(dist_chunk, start,
+                                                        row_missing_chunk,
+                                                        col)
+                             for col in range(X.shape[1]))
+                Parallel(n_jobs=self.n_jobs, backend='threading')(generator)
+            else:
+                for col in range(X.shape[1]):
+                    process_chunk_col(dist_chunk, start, row_missing_chunk,
+                                      col)
 
         # process in fixed-memory chunks
         gen = pairwise_distances_chunked(
@@ -293,7 +304,8 @@ def process_chunk(dist_chunk, start):
             metric=self.metric,
             missing_values=self.missing_values,
             force_all_finite=force_all_finite,
-            reduce_func=process_chunk)
+            reduce_func=process_chunk,
+            n_jobs=self.n_jobs)
         for chunk in gen:
             # process_chunk modifies X in place. No return value.
             pass
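
For orientation, here is a minimal usage sketch of the n_jobs parameter this branch adds to KNNImputer. The constructor argument exists only on this branch; everything else is the standard estimator API.

import numpy as np
from sklearn.impute import KNNImputer

X = np.array([[1.0, 2.0, np.nan],
              [3.0, 4.0, 3.0],
              [np.nan, 6.0, 5.0],
              [8.0, 8.0, 7.0]])

# n_jobs=1 keeps the original serial path; n_jobs > 1 dispatches the
# per-column imputation to a joblib thread pool (threading backend, so
# workers can write into the shared X buffer in place).
imputer = KNNImputer(n_neighbors=2, n_jobs=2)
print(imputer.fit_transform(X))

One design note: n_jobs=cpu_count() as a def-time default is evaluated once at import, and scikit-learn estimators conventionally default to n_jobs=None, so this choice makes parallelism opt-out rather than opt-in.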

sklearn/impute/tests/test_knn.py

Lines changed: 25 additions & 0 deletions
@@ -1,5 +1,6 @@
 import numpy as np
 import pytest
+from multiprocessing import cpu_count
 
 from sklearn import config_context
 from sklearn.impute import KNNImputer
@@ -639,3 +640,27 @@ def test_knn_imputer_distance_weighted_not_enough_neighbors(na,
 def test_knn_tags(na, allow_nan):
     knn = KNNImputer(missing_values=na)
     assert knn._get_tags()["allow_nan"] == allow_nan
+
+
+@pytest.mark.parametrize("n", [1, 2, cpu_count(), cpu_count() + 1,
+                               2 * cpu_count()])
+def test_knn_imputer_varying_n_jobs(n):
+    knn = KNNImputer(n_jobs=n)
+    na = np.nan
+    X = np.array([
+        [1, na, 1, 1, 1.],
+        [2, 2, 2, 2, 2],
+        [3, 3, 3, 3, na],
+        [6, 6, na, 6, 6],
+    ])
+
+    # expected values: each incomplete column has three donors, so with
+    # uniform weights the imputation is the mean of those donors
+    r1c2 = 11 / 3
+    r4c3 = 2
+    r3c5 = 3
+
+    X_imputed = np.array([
+        [1, r1c2, 1, 1, 1.],
+        [2, 2, 2, 2, 2],
+        [3, 3, 3, 3, r3c5],
+        [6, 6, r4c3, 6, 6],
+    ])
+    assert_allclose(knn.fit_transform(X), X_imputed)
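
The expected values follow from the imputer's uniform weighting: each incomplete column has exactly three donors, so n_neighbors = min(5, 3) = 3 and every imputed entry is the plain mean of its column's donors. A quick standalone arithmetic check, restating the test's constants:

import numpy as np

assert np.isclose((2 + 3 + 6) / 3, 11 / 3)  # r1c2: donors in column 2
assert np.isclose((1 + 2 + 3) / 3, 2)       # r4c3: donors in column 3
assert np.isclose((1 + 2 + 6) / 3, 3)       # r3c5: donors in column 5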

sklearn/impute/tests/test_time.py

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
+import numpy as np
+import pandas as pd
+from sklearn.impute import KNNImputer
+import time
+import pytest
+
+# NOTE: this test suite takes approximately 15 minutes, varying by machine
+
+# Initialise constants
+epsilon = 0.75  # seconds of breathing room for the smaller-sized n tests
+perc = [0.1, 0.5, 0.9]  # each entry is the fraction of values in X missing
+small_n = [5, 300, 500]
+large_n = [1000, 3000, 5000]
+
+
+# Generate a random x-by-y array with (p*100)% missing values
+def gen_matrix(x, y, p):
+    np.random.seed(1)
+    X = np.random.random([x, y])
+    X = pd.DataFrame(X).mask(X <= p)
+    return X
+
+
+# Run the (simulated) old serial path or the new parallel path and
+# return the start and end times
+def run_test(X, old=False):
+    start = time.time()
+    if old:
+        imputer = KNNImputer(n_neighbors=5, n_jobs=1)
+    else:
+        imputer = KNNImputer(n_neighbors=5)
+    imputer.fit_transform(X)
+    end = time.time()
+
+    return start, end
+
+
+def relative_assert(new, old):
+    # Smaller timings yield more sporadic results, so epsilon seconds of
+    # slack are allowed
+    assert (new == pytest.approx(old, abs=epsilon)) or (new < old)
+
+
+def output_res(old_end, old_start, end, start):
+    print("\nOld time:", round(old_end - old_start, 4),
+          ", New time:", round(end - start, 4), ",",
+          str(round(((old_end - old_start) / (end - start) - 1) * 100, 2))
+          + "% improved")
+
+
+# Test arrays of size 1 by a small n with varying missing values
+@pytest.mark.parametrize("n,p", [(n, p) for p in perc for n in small_n])
+def test_time_1_by_n(n, p):
+    X = gen_matrix(1, n, p)
+
+    start, end = run_test(X)
+    old_start, old_end = run_test(X, old=True)
+
+    output_res(old_end, old_start, end, start)
+
+    relative_assert(end - start, old_end - old_start)
+
+
+# Test arrays of size 1 by a large N with varying missing values
+@pytest.mark.parametrize("N,p", [(N, p) for p in perc for N in large_n])
+def test_time_1_by_N(N, p):
+    X = gen_matrix(1, N, p)
+
+    start, end = run_test(X)
+    old_start, old_end = run_test(X, old=True)
+
+    output_res(old_end, old_start, end, start)
+
+    relative_assert(end - start, old_end - old_start)
+
+
+# Test arrays of a small n by 1 with varying missing values
+@pytest.mark.parametrize("n,p", [(n, p) for p in perc for n in small_n])
+def test_time_n_by_1(n, p):
+    X = gen_matrix(n, 1, p)
+
+    start, end = run_test(X)
+    old_start, old_end = run_test(X, old=True)
+
+    output_res(old_end, old_start, end, start)
+
+    relative_assert(end - start, old_end - old_start)
+
+
+# Test arrays of a large N by 1 with varying missing values
+@pytest.mark.parametrize("N,p", [(N, p) for p in perc for N in large_n])
+def test_time_N_by_1(N, p):
+    X = gen_matrix(N, 1, p)
+
+    start, end = run_test(X)
+    old_start, old_end = run_test(X, old=True)
+
+    output_res(old_end, old_start, end, start)
+
+    relative_assert(end - start, old_end - old_start)
+
+
+# Test arrays of a small n by a small n with varying missing values
+@pytest.mark.parametrize("n1,n2,p",
+                         [(n1, n2, p) for p in perc for n1 in small_n
+                          for n2 in small_n])
+def test_time_n_by_n(n1, n2, p):
+    X = gen_matrix(n1, n2, p)
+
+    start, end = run_test(X)
+    old_start, old_end = run_test(X, old=True)
+
+    output_res(old_end, old_start, end, start)
+
+    relative_assert(end - start, old_end - old_start)
+
+
+# Test arrays of a small n by a large N with varying missing values
+@pytest.mark.parametrize("n,N,p",
+                         [(n, N, p) for p in perc for n in small_n
+                          for N in large_n])
+def test_time_n_by_N(n, N, p):
+    X = gen_matrix(n, N, p)
+
+    start, end = run_test(X)
+    old_start, old_end = run_test(X, old=True)
+
+    output_res(old_end, old_start, end, start)
+
+    relative_assert(end - start, old_end - old_start)
+
+
+# Test arrays of a large N by a small n with varying missing values
+### This is the most important case since it is the most likely usage
+### scenario (a large number of samples over a moderate number of features)
+@pytest.mark.parametrize("N,n,p",
+                         [(N, n, p) for p in perc for n in small_n
+                          for N in large_n])
+def test_time_N_by_n(N, n, p):
+    X = gen_matrix(N, n, p)
+
+    start, end = run_test(X)
+    old_start, old_end = run_test(X, old=True)
+
+    output_res(old_end, old_start, end, start)
+
+    relative_assert(end - start, old_end - old_start)
+
+
+# Test arrays of a large N by a large N with varying missing values
+# (This takes a VERY long time to run, so these are more often run
+# individually)
+## @pytest.mark.parametrize("N1,N2,p",
+##                          [(N1, N2, p) for p in perc for N1 in large_n
+##                           for N2 in large_n])
+## def test_time_N_by_N(N1, N2, p):
+##     X = gen_matrix(N1, N2, p)
+##
+##     start, end = run_test(X)
+##     old_start, old_end = run_test(X, old=True)
+##
+##     output_res(old_end, old_start, end, start)
+##
+##     relative_assert(end - start, old_end - old_start)
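
These timing tests print their measurements, so they are most informative with output capture disabled. A sketch of driving the helpers above directly, assuming the definitions in this file are in scope:

X = gen_matrix(1000, 300, 0.5)               # 1000 x 300, ~50% missing
start, end = run_test(X)                     # new path, default n_jobs
old_start, old_end = run_test(X, old=True)   # simulated old serial path
output_res(old_end, old_start, end, start)
relative_assert(end - start, old_end - old_start)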

sklearn/metrics/pairwise.py

Lines changed: 11 additions & 2 deletions
@@ -18,6 +18,7 @@
 from scipy.sparse import csr_matrix
 from scipy.sparse import issparse
 from joblib import Parallel, effective_n_jobs
+from multiprocessing import cpu_count
 
 from ..utils.validation import _num_samples
 from ..utils.validation import check_non_negative
@@ -1644,7 +1645,7 @@ def pairwise_distances_chunked(X, Y=None, *, reduce_func=None,
     params = _precompute_metric_params(X, Y, metric=metric, **kwds)
     kwds.update(**params)
 
-    for sl in slices:
+    def _process_slice(sl, reduce_func):
         if sl.start == 0 and sl.stop == n_samples_X:
             X_chunk = X  # enable optimised paths for X is Y
         else:
@@ -1661,8 +1662,16 @@ def pairwise_distances_chunked(X, Y=None, *, reduce_func=None,
         chunk_size = D_chunk.shape[0]
         D_chunk = reduce_func(D_chunk, sl.start)
         _check_chunk_size(D_chunk, chunk_size)
-        yield D_chunk
+        return D_chunk
 
+    if effective_n_jobs(n_jobs) > 1:
+        generator = (delayed(_process_slice)(sl, reduce_func)
+                     for sl in slices)
+        par_res = Parallel(n_jobs=n_jobs, backend='threading')(generator)
+        for res in par_res:
+            yield res
+    else:
+        for sl in slices:
+            yield _process_slice(sl, reduce_func)
 
 @_deprecate_positional_args
 def pairwise_distances(X, Y=None, metric="euclidean", *, n_jobs=None,
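
The change above turns the per-slice loop body into _process_slice and hands it to joblib's threading backend when more than one job is effective. Two observations: the parallel branch relies on delayed being importable in this module (it is not in the joblib import shown in the hunk), and collecting every result through Parallel materialises all chunk reductions before yielding, trading some of the generator's fixed-memory behaviour for parallelism. A self-contained sketch of the same dispatch shape (names here are illustrative, not the module's):

from joblib import Parallel, delayed, effective_n_jobs

def run_slices(slices, process_slice, n_jobs):
    # Parallel path when more than one job is effective, otherwise the
    # original sequential loop; threading keeps results in shared memory.
    if effective_n_jobs(n_jobs) > 1:
        tasks = (delayed(process_slice)(sl) for sl in slices)
        return Parallel(n_jobs=n_jobs, backend='threading')(tasks)
    return [process_slice(sl) for sl in slices]

# toy reduction: the length of each slice
print(run_slices([slice(0, 2), slice(2, 4)], lambda sl: sl.stop - sl.start, 2))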

sklearn/preprocessing/_data.py

Lines changed: 1 addition & 0 deletions
@@ -1644,6 +1644,7 @@ def __init__(self, max_degree=None, *, interaction_only=False, include_bias=True
         self.interaction_only = interaction_only
         self.include_bias = include_bias
         self.order = order
+        self.degree = max_degree or degree  # fall back to the deprecated argument
 
     @staticmethod
     @_deprecate_positional_args
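
One caveat with the max_degree or degree fallback: or tests truthiness, not presence, so an explicit max_degree=0 would be silently replaced as well. A hypothetical resolve_degree helper (illustrative only, not part of the module) shows the difference an explicit None check makes:

def resolve_degree(max_degree, degree):
    # Fall back to the legacy argument only when the new one was omitted,
    # so an explicit max_degree=0 is preserved.
    return degree if max_degree is None else max_degree

assert resolve_degree(None, 2) == 2  # legacy argument used
assert resolve_degree(3, 2) == 3     # new argument wins
assert resolve_degree(0, 2) == 0     # `max_degree or degree` would give 2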

0 commit comments