scikit-learn
diff --git a/‎doc/whats_new/v0.24.rst
Lines changed: 4 additions & 0 deletions b/‎doc/whats_new/v0.24.rst
Lines changed: 4 additions & 0 deletions
diff --git a/‎sklearn/impute/_base.py
Lines changed: 16 additions & 13 deletions b/‎sklearn/impute/_base.py
Lines changed: 16 additions & 13 deletions
diff --git a/‎sklearn/impute/tests/test_impute.py
Lines changed: 26 additions & 0 deletions b/‎sklearn/impute/tests/test_impute.py
Lines changed: 26 additions & 0 deletions
@@ -369,6 +369,10 @@ Changelog
   estimator's `random_state` attribute, allowing to use it with more external classes.
   :pr:`15636` by :user:`David Cortes <david-cortes>`.
 
+- |Efficiency| :class:`impute.SimpleImputer` is now faster with `object` dtype
+  arrays when `strategy='most_frequent'`.
+  :pr:`18987` by :user:`David Katz <DavidKatz-il>`.
+
 :mod:`sklearn.inspection`
 .........................
 
 
@@ -4,6 +4,7 @@
 
 import numbers
 import warnings
+from collections import Counter
 
 import numpy as np
 import numpy.ma as ma
@@ -34,15 +35,20 @@ def _most_frequent(array, extra_value, n_repeat):
        of the array."""
     # Compute the most frequent value in array only
     if array.size > 0:
-        with warnings.catch_warnings():
-            # stats.mode raises a warning when input array contains objects due
-            # to incapacity to detect NaNs. Irrelevant here since input array
-            # has already been NaN-masked.
-            warnings.simplefilter("ignore", RuntimeWarning)
+        if array.dtype == object:
+            # scipy.stats.mode is slow with object dtype array.
+            # Python Counter is more efficient
+            counter = Counter(array)
+            most_frequent_count = counter.most_common(1)[0][1]
+            # tie breaking similarly to scipy.stats.mode
+            most_frequent_value = min(
+                value for value, count in counter.items()
+                if count == most_frequent_count
+            )
+        else:
             mode = stats.mode(array)
-
-        most_frequent_value = mode[0][0]
-        most_frequent_count = mode[1][0]
+            most_frequent_value = mode[0][0]
+            most_frequent_count = mode[1][0]
     else:
         most_frequent_value = 0
         most_frequent_count = 0
@@ -55,11 +61,8 @@ def _most_frequent(array, extra_value, n_repeat):
     elif most_frequent_count > n_repeat:
         return most_frequent_value
     elif most_frequent_count == n_repeat:
-        # Ties the breaks. Copy the behaviour of scipy.stats.mode
-        if most_frequent_value < extra_value:
-            return most_frequent_value
-        else:
-            return extra_value
+        # tie breaking similarly to scipy.stats.mode
+        return min(most_frequent_value, extra_value)
 
 
 class _BaseImputer(TransformerMixin, BaseEstimator):
 
@@ -27,6 +27,7 @@
 from sklearn import tree
 from sklearn.random_projection import _sparse_random_matrix
 from sklearn.exceptions import ConvergenceWarning
+from sklearn.impute._base import _most_frequent
 
 
 def _check_statistics(X, X_true,
@@ -1474,3 +1475,28 @@ def test_simple_imputation_inverse_transform_exceptions(missing_value):
     with pytest.raises(ValueError,
                        match=f"Got 'add_indicator={imputer.add_indicator}'"):
         imputer.inverse_transform(X_1_trans)
+
+
+@pytest.mark.parametrize(
+    "expected,array,dtype,extra_value,n_repeat",
+    [
+        # array of object dtype
+        ("extra_value", ['a', 'b', 'c'], object, "extra_value", 2),
+        (
+            "most_frequent_value",
+            ['most_frequent_value', 'most_frequent_value', 'value'],
+            object, "extra_value", 1
+        ),
+        ("a", ['min_value', 'min_value', 'value'], object, "a", 2),
+        ("min_value", ['min_value', 'min_value', 'value'], object, "z", 2),
+        # array of numeric dtype
+        (10, [1, 2, 3], int, 10, 2),
+        (1, [1, 1, 2], int, 10, 1),
+        (10, [20, 20, 1], int, 10, 2),
+        (1, [1, 1, 20], int, 10, 2),
+    ]
+)
+def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
+    assert expected == _most_frequent(
+        np.array(array, dtype=dtype), extra_value, n_repeat
+    )