diff --git a/.gitignore b/.gitignore
index 05df19335be2..736597b6b7af 100644
--- a/.gitignore
+++ b/.gitignore
@@ -215,5 +215,6 @@ numpy/core/src/_simd/_simd_inc.h
 # umath module
 numpy/core/src/umath/loops_unary_fp.dispatch.c
 numpy/core/src/umath/loops_arithm_fp.dispatch.c
+numpy/core/src/umath/loops_arithmetic.dispatch.c
 numpy/core/src/umath/loops_trigonometric.dispatch.c
 numpy/core/src/umath/loops_exponent_log.dispatch.c
diff --git a/benchmarks/benchmarks/bench_ufunc.py b/benchmarks/benchmarks/bench_ufunc.py
index 13b7382a1708..b036581e1aae 100644
--- a/benchmarks/benchmarks/bench_ufunc.py
+++ b/benchmarks/benchmarks/bench_ufunc.py
@@ -135,18 +135,19 @@ def time_less_than_scalar2(self, dtype):
 
 
 class CustomScalarFloorDivideInt(Benchmark):
-    params = ([np.int8, np.int16, np.int32, np.int64], [8, -8, 43, -43, 0])
+    params = (np.sctypes['int'] + np.sctypes['uint'], [8, -8, 43, -43])
     param_names = ['dtype', 'divisors']
-    max_value = 10**7
-    min_value = -10**7
 
     def setup(self, dtype, divisor):
+        if dtype in np.sctypes['uint'] and divisor < 0:
+            raise NotImplementedError(
+                    "Skipping test for negative divisor with unsigned type")
+
         iinfo = np.iinfo(dtype)
-        self.x = np.arange(
-                max(iinfo.min, self.min_value),
-                min(iinfo.max, self.max_value), dtype=dtype)
+        self.x = np.random.randint(
+                    iinfo.min, iinfo.max, size=10000, dtype=dtype)
 
-    def time_floor_divide_int(self, dtpye, divisor):
+    def time_floor_divide_int(self, dtype, divisor):
         self.x // divisor
 
 
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index b5305fbfce98..57c811ff3306 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -245,6 +245,8 @@ def english_upper(s):
 O = 'O'
 P = 'P'
 ints = 'bBhHiIlLqQ'
+sints = 'bhilq'
+uints = 'BHILQ'
 times = 'Mm'
 timedeltaonly = 'm'
 intsO = ints + O
@@ -325,7 +327,9 @@ def english_upper(s):
     Ufunc(2, 1, None, # One is only a unit to the right, not the left
           docstrings.get('numpy.core.umath.floor_divide'),
           'PyUFunc_DivisionTypeResolver',
-          TD(intfltcmplx),
+          TD(uints, cfunc_alias='divide',
+              dispatch=[('loops_arithmetic', 'BHILQ')]),
+          TD(sints + flts + cmplx),
           [TypeDescription('m', FullTypeDescr, 'mq', 'm'),
            TypeDescription('m', FullTypeDescr, 'md', 'm'),
            TypeDescription('m', FullTypeDescr, 'mm', 'q'),
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 8c34a3286d72..df405bcaf487 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -931,6 +931,7 @@ def generate_umath_c(ext, build_dir):
             join('src', 'umath', 'loops.c.src'),
             join('src', 'umath', 'loops_unary_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
+            join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 68e209fe9312..04665dc5296e 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1014,22 +1014,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 void
     UNARY_LOOP_FAST(@type@, @type@, *out = in > 0 ? 1 : 0);
 }
 
-NPY_NO_EXPORT void
-@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
-{
-    BINARY_LOOP {
-        const @type@ in1 = *(@type@ *)ip1;
-        const @type@ in2 = *(@type@ *)ip2;
-        if (in2 == 0) {
-            npy_set_floatstatus_divbyzero();
-            *((@type@ *)op1) = 0;
-        }
-        else {
-            *((@type@ *)op1)= in1/in2;
-        }
-    }
-}
-
 NPY_NO_EXPORT void
 @TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index b3a19be12d62..0301aa5ed7b8 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -53,6 +53,17 @@ BOOL_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void
  *****************************************************************************
  */
 
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_arithmetic.dispatch.h"
+#endif
+
+/**begin repeat
+ * #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG#
+ */
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_divide,
+     (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/**end repeat**/
+
 /**begin repeat
  * #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
  */
@@ -141,7 +152,7 @@ NPY_NO_EXPORT void
 @S@@TYPE@_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
-@S@@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+@TYPE@_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 NPY_NO_EXPORT void
 @S@@TYPE@_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
diff --git a/numpy/core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
new file mode 100644
index 000000000000..7e9f464636c5
--- /dev/null
+++ b/numpy/core/src/umath/loops_arithmetic.dispatch.c.src
@@ -0,0 +1,118 @@
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41 avx2 avx512f avx512_skx
+ ** vsx2
+ ** neon
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+//###############################################################################
+//## Division
+//###############################################################################
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+#if NPY_SIMD
+/**begin repeat
+ *  #sfx = u8, u16, u32, u64#
+ */
+static NPY_INLINE void
+simd_divide_by_scalar_contig_@sfx@(char **args, npy_intp len)
+{
+    npyv_lanetype_@sfx@ *src   = (npyv_lanetype_@sfx@ *) args[0];
+    npyv_lanetype_@sfx@ scalar = *(npyv_lanetype_@sfx@ *) args[1];
+    npyv_lanetype_@sfx@ *dst   = (npyv_lanetype_@sfx@ *) args[2];
+    const int vstep            = npyv_nlanes_@sfx@;
+    const npyv_@sfx@x3 divisor = npyv_divisor_@sfx@(scalar);
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        npyv_@sfx@ a = npyv_load_@sfx@(src);
+        npyv_@sfx@ c = npyv_divc_@sfx@(a, divisor);
+        npyv_store_@sfx@(dst, c);
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_@sfx@ a = *src;
+        *dst = a / scalar;
+    }
+
+    npyv_cleanup();
+}
+/**end repeat**/
+#endif
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+
+/**begin repeat
+ * Unsigned types
+ *  #type  = npy_ubyte, npy_ushort, npy_uint, npy_ulong, npy_ulonglong#
+ *  #TYPE  = UBYTE,     USHORT,     UINT,     ULONG,     ULONGLONG#
+ *  #STYPE = BYTE,      SHORT,      INT,      LONG,      LONGLONG#
+ */
+#undef TO_SIMD_SFX
+#if 0
+/**begin repeat1
+ * #len = 8, 16, 32, 64#
+ */
+#elif NPY_BITSOF_@STYPE@ == @len@
+    #define TO_SIMD_SFX(X) X##_u@len@
+/**end repeat1**/
+#endif
+/*
+ * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV fall-backs to the scalar division
+ * because emulating multiply-high on these architectures is going to be expensive comparing
+ * to the native scalar dividers.
+ * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
+ * Power10(VSX4) is an exception here since it has native support for integer vector division,
+ * note neither infrastructure nor NPYV has supported VSX4 yet.
+ */
+#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(@type@) {
+            const @type@ d = *(@type@ *)ip2;
+            if (NPY_UNLIKELY(d == 0)) {
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
+        }
+        *((@type@ *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(@type@), NPY_SIMD_WIDTH) &&
+             (*(@type@ *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {
+        BINARY_LOOP {
+            const @type@ in1 = *(@type@ *)ip1;
+            const @type@ in2 = *(@type@ *)ip2;
+            if (NPY_UNLIKELY(in2 == 0)) {
+                npy_set_floatstatus_divbyzero();
+                *((@type@ *)op1) = 0;
+            } else{
+                *((@type@ *)op1) = in1 / in2;
+            }
+        }
+    }
+}
+/**end repeat**/
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 2249c866caf5..b31b84d0cc2b 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -250,13 +250,22 @@ def test_division_int(self):
         assert_equal(x % 100, [5, 10, 90, 0, 95, 90, 10, 0, 80])
 
     @pytest.mark.parametrize("input_dtype",
-            [np.int8, np.int16, np.int32, np.int64])
+            np.sctypes['int'] + np.sctypes['uint'])
     def test_division_int_boundary(self, input_dtype):
         iinfo = np.iinfo(input_dtype)
 
+        # Unsigned:
+        # Create list with 0, 25th, 50th, 75th percentile and max
+        if iinfo.min == 0:
+            lst = [0, iinfo.max//4, iinfo.max//2,
+                    int(iinfo.max/1.33), iinfo.max]
+            divisors = [iinfo.max//4, iinfo.max//2,
+                    int(iinfo.max/1.33), iinfo.max]
+        # Signed:
         # Create list with min, 25th percentile, 0, 75th percentile, max
-        lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
-        divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
+        else:
+            lst = [iinfo.min, iinfo.min//2, 0, iinfo.max//2, iinfo.max]
+            divisors = [iinfo.min, iinfo.min//2, iinfo.max//2, iinfo.max]
         a = np.array(lst, dtype=input_dtype)
 
         for divisor in divisors:
@@ -926,7 +935,7 @@ def test_log_values(self):
             assert_raises(FloatingPointError, np.log, np.float32(-np.inf))
             assert_raises(FloatingPointError, np.log, np.float32(-1.0))
 
-        # See https://github.com/numpy/numpy/issues/18005 
+        # See https://github.com/numpy/numpy/issues/18005
         with assert_no_warnings():
             a = np.array(1e9, dtype='float32')
             np.log(a)