diff --git a/ports/bare-arm/Makefile b/ports/bare-arm/Makefile
index 1a21eb56a867b..e48342924b951 100644
--- a/ports/bare-arm/Makefile
+++ b/ports/bare-arm/Makefile
@@ -16,7 +16,6 @@ PYDFU ?= $(TOP)/tools/pydfu.py
 CFLAGS += -I. -I$(TOP) -I$(BUILD)
 CFLAGS += -Wall -Werror -std=c99 -nostdlib
 CFLAGS += -mthumb -mtune=cortex-m4 -mcpu=cortex-m4 -msoft-float
-CSUPEROPT = -Os # save some code space for performance-critical code
 
 # Select debugging or optimisation build.
 ifeq ($(DEBUG), 1)
diff --git a/ports/bare-arm/mpconfigport.h b/ports/bare-arm/mpconfigport.h
index 65bb67f7b9a70..7a8fa55d30209 100644
--- a/ports/bare-arm/mpconfigport.h
+++ b/ports/bare-arm/mpconfigport.h
@@ -37,6 +37,9 @@
 // Python internal features
 #define MICROPY_ERROR_REPORTING                 (MICROPY_ERROR_REPORTING_NONE)
 
+// Disable the per-function optimisation attribute: use -Os for everything (smallest binary).
+#define MICROPY_APPLY_COMPILER_OPTIMISATIONS(f) f
+
 // Type definitions for the specific machine
 
 typedef int32_t mp_int_t; // must be pointer size
diff --git a/ports/esp32/mpconfigport.h b/ports/esp32/mpconfigport.h
index 5dc4a9c758790..624415c08b085 100644
--- a/ports/esp32/mpconfigport.h
+++ b/ports/esp32/mpconfigport.h
@@ -231,18 +231,12 @@ void *esp_native_code_commit(void *, size_t, void *);
 #endif
 
 // Functions that should go in IRAM
+#define MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f) IRAM_ATTR MICROPY_APPLY_COMPILER_OPTIMISATIONS(f)
+#if !(CONFIG_IDF_TARGET_ESP32 && CONFIG_SPIRAM && CONFIG_SPIRAM_CACHE_WORKAROUND)
 // For ESP32 with SPIRAM workaround, firmware is larger and uses more static IRAM,
 // so in that configuration don't put too many functions in IRAM.
-#if !(CONFIG_IDF_TARGET_ESP32 && CONFIG_SPIRAM && CONFIG_SPIRAM_CACHE_WORKAROUND)
-#define MICROPY_WRAP_MP_BINARY_OP(f) IRAM_ATTR f
+#define MICROPY_PERFORMANCE_CRITICAL_LEVEL_2(f) IRAM_ATTR MICROPY_APPLY_COMPILER_OPTIMISATIONS(f)
 #endif
-#define MICROPY_WRAP_MP_EXECUTE_BYTECODE(f) IRAM_ATTR f
-#define MICROPY_WRAP_MP_LOAD_GLOBAL(f) IRAM_ATTR f
-#define MICROPY_WRAP_MP_LOAD_NAME(f) IRAM_ATTR f
-#define MICROPY_WRAP_MP_MAP_LOOKUP(f) IRAM_ATTR f
-#define MICROPY_WRAP_MP_OBJ_GET_TYPE(f) IRAM_ATTR f
-#define MICROPY_WRAP_MP_SCHED_EXCEPTION(f) IRAM_ATTR f
-#define MICROPY_WRAP_MP_SCHED_KEYBOARD_INTERRUPT(f) IRAM_ATTR f
 
 #define UINT_FMT "%u"
 #define INT_FMT "%d"
diff --git a/ports/minimal/Makefile b/ports/minimal/Makefile
index 050c4ddf526a5..554a024c66041 100644
--- a/ports/minimal/Makefile
+++ b/ports/minimal/Makefile
@@ -31,8 +31,6 @@ CFLAGS += $(INC) -Wall -Werror -Wdouble-promotion -Wfloat-conversion -std=c99 $(
 LDFLAGS += -Wl,-Map=$@.map,--cref -Wl,--gc-sections
 endif
 
-CSUPEROPT = -Os # save some code space
-
 # Tune for Debugging or Optimization
 CFLAGS += -g  # always include debug info in the ELF
 ifeq ($(DEBUG), 1)
diff --git a/ports/minimal/mpconfigport.h b/ports/minimal/mpconfigport.h
index 56bef165facda..eaf5abe76397a 100644
--- a/ports/minimal/mpconfigport.h
+++ b/ports/minimal/mpconfigport.h
@@ -21,6 +21,9 @@
 // Use the minimum headroom in the chunk allocator for parse nodes.
 #define MICROPY_ALLOC_PARSE_CHUNK_INIT    (16)
 
+// Disable the per-function optimisation attribute: use -Os for everything (smallest binary).
+#define MICROPY_APPLY_COMPILER_OPTIMISATIONS(f) f
+
 // type definitions for the specific machine
 
 typedef intptr_t mp_int_t; // must be pointer size
diff --git a/ports/stm32/Makefile b/ports/stm32/Makefile
index e44a542395182..edc59b742a875 100644
--- a/ports/stm32/Makefile
+++ b/ports/stm32/Makefile
@@ -284,7 +284,6 @@ SRC_O += \
 	$(SYSTEM_FILE)
     
 ifeq ($(MCU_SERIES),$(filter $(MCU_SERIES),f0 g0 l0))
-CSUPEROPT = -Os # save some code space
 SRC_O += \
 	resethandler_m0.o \
 	shared/runtime/gchelper_thumb1.o
diff --git a/ports/stm32/mpconfigport.h b/ports/stm32/mpconfigport.h
index 300ad086bf473..0a1ce27c2258d 100644
--- a/ports/stm32/mpconfigport.h
+++ b/ports/stm32/mpconfigport.h
@@ -55,6 +55,12 @@
 #define MICROPY_OPT_MAP_LOOKUP_CACHE (__CORTEX_M > 0)
 #endif
 
+#if __CORTEX_M == 0
+// Just use -Os for everything to avoid using extra flash. M0 is used as a stand-in
+// for "likely has small flash"; this could be moved to the board config instead.
+#define MICROPY_APPLY_COMPILER_OPTIMISATIONS(f) f
+#endif
+
 // emitters
 #define MICROPY_PERSISTENT_CODE_LOAD (1)
 #ifndef MICROPY_EMIT_THUMB
diff --git a/py/gc.c b/py/gc.c
index b6969dfd42429..f331c6ee0bbf7 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -723,7 +723,12 @@ void gc_info(gc_info_t *info) {
     GC_EXIT();
 }
 
-void *gc_alloc(size_t n_bytes, unsigned int alloc_flags) {
+#ifndef MICROPY_WRAP_GC_ALLOC
+// Optimising gc for speed; 5ms down to 4ms on pybv2
+#define MICROPY_WRAP_GC_ALLOC(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_3(f)
+#endif
+
+void *MICROPY_WRAP_GC_ALLOC(gc_alloc)(size_t n_bytes, unsigned int alloc_flags) {
     bool has_finaliser = alloc_flags & GC_ALLOC_FLAG_HAS_FINALISER;
     size_t n_blocks = ((n_bytes + BYTES_PER_BLOCK - 1) & (~(BYTES_PER_BLOCK - 1))) / BYTES_PER_BLOCK;
     DEBUG_printf("gc_alloc(" UINT_FMT " bytes -> " UINT_FMT " blocks)\n", n_bytes, n_blocks);
@@ -890,9 +895,13 @@ void *gc_alloc_with_finaliser(mp_uint_t n_bytes) {
 }
 */
 
+#ifndef MICROPY_WRAP_GC_FREE
+#define MICROPY_WRAP_GC_FREE(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_3(f)
+#endif
+
 // force the freeing of a piece of memory
 // TODO: freeing here does not call finaliser
-void gc_free(void *ptr) {
+void MICROPY_WRAP_GC_FREE(gc_free)(void *ptr) {
     if (MP_STATE_THREAD(gc_lock_depth) > 0) {
         // Cannot free while the GC is locked. However free is an optimisation
         // to reclaim the memory immediately, this means it will now be left
@@ -1021,7 +1030,11 @@ void *gc_realloc(void *ptr, mp_uint_t n_bytes) {
 
 #else // Alternative gc_realloc impl
 
-void *gc_realloc(void *ptr_in, size_t n_bytes, bool allow_move) {
+#ifndef MICROPY_WRAP_GC_REALLOC
+#define MICROPY_WRAP_GC_REALLOC(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_3(f)
+#endif
+
+void *MICROPY_WRAP_GC_REALLOC(gc_realloc)(void *ptr_in, size_t n_bytes, bool allow_move) {
     // check for pure allocation
     if (ptr_in == NULL) {
         return gc_alloc(n_bytes, false);
diff --git a/py/map.c b/py/map.c
index c18df5a9f333c..2157c4d7d823b 100644
--- a/py/map.c
+++ b/py/map.c
@@ -147,6 +147,10 @@ STATIC void mp_map_rehash(mp_map_t *map) {
     m_del(mp_map_elem_t, old_table, old_alloc);
 }
 
+#ifndef MICROPY_WRAP_MP_MAP_LOOKUP
+#define MICROPY_WRAP_MP_MAP_LOOKUP(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+#endif
+
 // MP_MAP_LOOKUP behaviour:
 //  - returns NULL if not found, else the slot it was found in with key,value non-null
 // MP_MAP_LOOKUP_ADD_IF_NOT_FOUND behaviour:
diff --git a/py/mpconfig.h b/py/mpconfig.h
index a1e9660bf46a6..47534a64cb5c2 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -1784,46 +1784,39 @@ typedef double mp_float_t;
 #endif
 
 /*****************************************************************************/
-/* Hooks for a port to wrap functions with attributes                        */
+/* Hooks for a port to wrap functions with performance-tuning attributes     */
 
-#ifndef MICROPY_WRAP_MP_BINARY_OP
-#define MICROPY_WRAP_MP_BINARY_OP(f) f
-#endif
-
-#ifndef MICROPY_WRAP_MP_EXECUTE_BYTECODE
-#define MICROPY_WRAP_MP_EXECUTE_BYTECODE(f) f
-#endif
-
-#ifndef MICROPY_WRAP_MP_LOAD_GLOBAL
-#define MICROPY_WRAP_MP_LOAD_GLOBAL(f) f
-#endif
-
-#ifndef MICROPY_WRAP_MP_LOAD_NAME
-#define MICROPY_WRAP_MP_LOAD_NAME(f) f
-#endif
-
-#ifndef MICROPY_WRAP_MP_MAP_LOOKUP
-#define MICROPY_WRAP_MP_MAP_LOOKUP(f) f
+#ifndef MICROPY_APPLY_COMPILER_OPTIMISATIONS
+#if defined(__GNUC__) && !defined(__clang__)
+// Enable -O2 optimisations.
+#define MICROPY_APPLY_COMPILER_OPTIMISATIONS(f) __attribute__((optimize("O2"))) f
+#else
+// Unsupported on other compilers, will use global optimisation setting (typically -Os).
+#define MICROPY_APPLY_COMPILER_OPTIMISATIONS(f) f
 #endif
-
-#ifndef MICROPY_WRAP_MP_OBJ_GET_TYPE
-#define MICROPY_WRAP_MP_OBJ_GET_TYPE(f) f
 #endif
 
-#ifndef MICROPY_WRAP_MP_SCHED_EXCEPTION
-#define MICROPY_WRAP_MP_SCHED_EXCEPTION(f) f
+// Ideally apply full compiler optimisations and place in RAM.
+// Use this on small functions that need the highest possible performance.
+#ifndef MICROPY_PERFORMANCE_CRITICAL_LEVEL_1
+#define MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f) MICROPY_APPLY_COMPILER_OPTIMISATIONS(f)
 #endif
 
-#ifndef MICROPY_WRAP_MP_SCHED_KEYBOARD_INTERRUPT
-#define MICROPY_WRAP_MP_SCHED_KEYBOARD_INTERRUPT(f) f
+// Ideally apply full compiler optimisations and optionally place in RAM (if IRAM available).
+// Use this on larger functions that should go in RAM if possible.
+#ifndef MICROPY_PERFORMANCE_CRITICAL_LEVEL_2
+#define MICROPY_PERFORMANCE_CRITICAL_LEVEL_2(f) MICROPY_APPLY_COMPILER_OPTIMISATIONS(f)
 #endif
 
-#ifndef MICROPY_WRAP_MP_SCHED_SCHEDULE
-#define MICROPY_WRAP_MP_SCHED_SCHEDULE(f) f
+// Ideally apply full compiler optimisation if flash available.
+// Use this on functions that are not important enough to place in RAM.
+#ifndef MICROPY_PERFORMANCE_CRITICAL_LEVEL_3
+#define MICROPY_PERFORMANCE_CRITICAL_LEVEL_3(f) MICROPY_APPLY_COMPILER_OPTIMISATIONS(f)
 #endif
 
-#ifndef MICROPY_WRAP_MP_SCHED_VM_ABORT
-#define MICROPY_WRAP_MP_SCHED_VM_ABORT(f) f
+// Ideally apply full compiler optimisation if flash available (lower priority than level 3, so off by default).
+#ifndef MICROPY_PERFORMANCE_CRITICAL_LEVEL_4
+#define MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f) f
 #endif
 
 /*****************************************************************************/
diff --git a/py/mpz.c b/py/mpz.c
index b61997e2fd4ed..c274457d00de6 100644
--- a/py/mpz.c
+++ b/py/mpz.c
@@ -36,18 +36,16 @@
 #define DIG_MSB  (MPZ_LONG_1 << (DIG_SIZE - 1))
 #define DIG_BASE (MPZ_LONG_1 << DIG_SIZE)
 
-/*
- mpz is an arbitrary precision integer type with a public API.
+// mpz is an arbitrary precision integer type with a public API.
 
- mpn functions act on non-negative integers represented by an array of generalised
- digits (eg a word per digit).  You also need to specify separately the length of the
- array.  There is no public API for mpn.  Rather, the functions are used by mpz to
- implement its features.
+// mpn functions act on non-negative integers represented by an array of generalised
+// digits (eg a word per digit).  You also need to specify separately the length of the
+// array.  There is no public API for mpn.  Rather, the functions are used by mpz to
+// implement its features.
 
- Integer values are stored little endian (first digit is first in memory).
+// Integer values are stored little endian (first digit is first in memory).
 
- Definition of normalise: ?
-*/
+// Definition of normalise: ?
 
 STATIC size_t mpn_remove_trailing_zeros(mpz_dig_t *oidig, mpz_dig_t *idig) {
     for (--idig; idig >= oidig && *idig == 0; --idig) {
@@ -55,10 +53,9 @@ STATIC size_t mpn_remove_trailing_zeros(mpz_dig_t *oidig, mpz_dig_t *idig) {
     return idig + 1 - oidig;
 }
 
-/* compares i with j
-   returns sign(i - j)
-   assumes i, j are normalised
-*/
+// Compares i with j
+// Returns sign(i - j)
+// Assumes i, j are normalised
 STATIC int mpn_cmp(const mpz_dig_t *idig, size_t ilen, const mpz_dig_t *jdig, size_t jlen) {
     if (ilen < jlen) {
         return -1;
@@ -80,11 +77,10 @@ STATIC int mpn_cmp(const mpz_dig_t *idig, size_t ilen, const mpz_dig_t *jdig, si
     return 0;
 }
 
-/* computes i = j << n
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j; assumes n > 0
-   can have i, j pointing to same memory
-*/
+// Computes i = j << n
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j; assumes n > 0
+// Can have i, j pointing to same memory
 STATIC size_t mpn_shl(mpz_dig_t *idig, mpz_dig_t *jdig, size_t jlen, mp_uint_t n) {
     mp_uint_t n_whole = (n + DIG_SIZE - 1) / DIG_SIZE;
     mp_uint_t n_part = n % DIG_SIZE;
@@ -119,11 +115,10 @@ STATIC size_t mpn_shl(mpz_dig_t *idig, mpz_dig_t *jdig, size_t jlen, mp_uint_t n
     return jlen;
 }
 
-/* computes i = j >> n
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j; assumes n > 0
-   can have i, j pointing to same memory
-*/
+// Computes i = j >> n
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j; assumes n > 0
+// Can have i, j pointing to same memory
 STATIC size_t mpn_shr(mpz_dig_t *idig, mpz_dig_t *jdig, size_t jlen, mp_uint_t n) {
     mp_uint_t n_whole = n / DIG_SIZE;
     mp_uint_t n_part = n % DIG_SIZE;
@@ -151,11 +146,10 @@ STATIC size_t mpn_shr(mpz_dig_t *idig, mpz_dig_t *jdig, size_t jlen, mp_uint_t n
     return jlen;
 }
 
-/* computes i = j + k
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j, k; assumes jlen >= klen
-   can have i, j, k pointing to same memory
-*/
+// Computes i = j + k
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j, k; assumes jlen >= klen
+// Can have i, j, k pointing to same memory
 STATIC size_t mpn_add(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const mpz_dig_t *kdig, size_t klen) {
     mpz_dig_t *oidig = idig;
     mpz_dbl_dig_t carry = 0;
@@ -181,11 +175,10 @@ STATIC size_t mpn_add(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const
     return idig - oidig;
 }
 
-/* computes i = j - k
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j, k; assumes j >= k
-   can have i, j, k pointing to same memory
-*/
+// Computes i = j - k
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j, k; assumes j >= k
+// Can have i, j, k pointing to same memory
 STATIC size_t mpn_sub(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const mpz_dig_t *kdig, size_t klen) {
     mpz_dig_t *oidig = idig;
     mpz_dbl_dig_signed_t borrow = 0;
@@ -208,12 +201,10 @@ STATIC size_t mpn_sub(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const
 }
 
 #if MICROPY_OPT_MPZ_BITWISE
-
-/* computes i = j & k
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j, k; assumes jlen >= klen (jlen argument not needed)
-   can have i, j, k pointing to same memory
-*/
+// Computes i = j & k
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j, k; assumes jlen >= klen (jlen argument not needed)
+// Can have i, j, k pointing to same memory
 STATIC size_t mpn_and(mpz_dig_t *idig, const mpz_dig_t *jdig, const mpz_dig_t *kdig, size_t klen) {
     mpz_dig_t *oidig = idig;
 
@@ -223,18 +214,16 @@ STATIC size_t mpn_and(mpz_dig_t *idig, const mpz_dig_t *jdig, const mpz_dig_t *k
 
     return mpn_remove_trailing_zeros(oidig, idig);
 }
-
 #endif
 
-/*  i = -((-j) & (-k))                = ~((~j + 1) & (~k + 1)) + 1
-    i =  (j & (-k)) =  (j & (~k + 1)) =  (  j      & (~k + 1))
-    i =  ((-j) & k) =  ((~j + 1) & k) =  ((~j + 1) &   k     )
-   computes general form:
-   i = (im ^ (((j ^ jm) + jc) & ((k ^ km) + kc))) + ic  where Xm = Xc == 0 ? 0 : DIG_MASK
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j, k; assumes length j >= length k
-   can have i, j, k pointing to same memory
-*/
+//  i = -((-j) & (-k))                = ~((~j + 1) & (~k + 1)) + 1
+//  i =  (j & (-k)) =  (j & (~k + 1)) =  (  j      & (~k + 1))
+//  i =  ((-j) & k) =  ((~j + 1) & k) =  ((~j + 1) &   k     )
+// Computes general form:
+//  i = (im ^ (((j ^ jm) + jc) & ((k ^ km) + kc))) + ic  where Xm = Xc == 0 ? 0 : DIG_MASK
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j, k; assumes length j >= length k
+// Can have i, j, k pointing to same memory
 STATIC size_t mpn_and_neg(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const mpz_dig_t *kdig, size_t klen,
     mpz_dbl_dig_t carryi, mpz_dbl_dig_t carryj, mpz_dbl_dig_t carryk) {
     mpz_dig_t *oidig = idig;
@@ -260,12 +249,10 @@ STATIC size_t mpn_and_neg(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, c
 }
 
 #if MICROPY_OPT_MPZ_BITWISE
-
-/* computes i = j | k
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j, k; assumes jlen >= klen
-   can have i, j, k pointing to same memory
-*/
+// Computes i = j | k
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j, k; assumes jlen >= klen
+// Can have i, j, k pointing to same memory
 STATIC size_t mpn_or(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const mpz_dig_t *kdig, size_t klen) {
     mpz_dig_t *oidig = idig;
 
@@ -281,21 +268,17 @@ STATIC size_t mpn_or(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const
 
     return idig - oidig;
 }
-
 #endif
 
-/*  i = -((-j) | (-k))                = ~((~j + 1) | (~k + 1)) + 1
-    i = -(j | (-k)) = -(j | (~k + 1)) = ~(  j      | (~k + 1)) + 1
-    i = -((-j) | k) = -((~j + 1) | k) = ~((~j + 1) |   k     ) + 1
-   computes general form:
-   i = ~(((j ^ jm) + jc) | ((k ^ km) + kc)) + 1  where Xm = Xc == 0 ? 0 : DIG_MASK
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j, k; assumes length j >= length k
-   can have i, j, k pointing to same memory
-*/
-
+//  i = -((-j) | (-k))                = ~((~j + 1) | (~k + 1)) + 1
+//  i = -(j | (-k)) = -(j | (~k + 1)) = ~(  j      | (~k + 1)) + 1
+//  i = -((-j) | k) = -((~j + 1) | k) = ~((~j + 1) |   k     ) + 1
+// Computes general form:
+//  i = ~(((j ^ jm) + jc) | ((k ^ km) + kc)) + 1  where Xm = Xc == 0 ? 0 : DIG_MASK
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j, k; assumes length j >= length k
+// Can have i, j, k pointing to same memory
 #if MICROPY_OPT_MPZ_BITWISE
-
 STATIC size_t mpn_or_neg(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const mpz_dig_t *kdig, size_t klen,
     mpz_dbl_dig_t carryj, mpz_dbl_dig_t carryk) {
     mpz_dig_t *oidig = idig;
@@ -323,9 +306,7 @@ STATIC size_t mpn_or_neg(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, co
 
     return mpn_remove_trailing_zeros(oidig, idig);
 }
-
 #else
-
 STATIC size_t mpn_or_neg(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const mpz_dig_t *kdig, size_t klen,
     mpz_dbl_dig_t carryi, mpz_dbl_dig_t carryj, mpz_dbl_dig_t carryk) {
     mpz_dig_t *oidig = idig;
@@ -348,16 +329,13 @@ STATIC size_t mpn_or_neg(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, co
 
     return mpn_remove_trailing_zeros(oidig, idig);
 }
-
 #endif
 
 #if MICROPY_OPT_MPZ_BITWISE
-
-/* computes i = j ^ k
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j, k; assumes jlen >= klen
-   can have i, j, k pointing to same memory
-*/
+// Computes i = j ^ k
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j, k; assumes jlen >= klen
+// Can have i, j, k pointing to same memory
 STATIC size_t mpn_xor(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const mpz_dig_t *kdig, size_t klen) {
     mpz_dig_t *oidig = idig;
 
@@ -373,18 +351,16 @@ STATIC size_t mpn_xor(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const
 
     return mpn_remove_trailing_zeros(oidig, idig);
 }
-
 #endif
 
-/*  i = (-j) ^ (-k) = ~(j - 1) ^ ~(k - 1)                   = (j - 1) ^ (k - 1)
-    i = -(j ^ (-k)) = -(j ^ ~(k - 1)) = ~(j ^ ~(k - 1)) + 1 = (j ^ (k - 1)) + 1
-    i = -((-j) ^ k) = -(~(j - 1) ^ k) = ~(~(j - 1) ^ k) + 1 = ((j - 1) ^ k) + 1
-   computes general form:
-   i = ((j - 1 + jc) ^ (k - 1 + kc)) + ic
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised j, k; assumes length j >= length k
-   can have i, j, k pointing to same memory
-*/
+//  i = (-j) ^ (-k) = ~(j - 1) ^ ~(k - 1)                   = (j - 1) ^ (k - 1)
+//  i = -(j ^ (-k)) = -(j ^ ~(k - 1)) = ~(j ^ ~(k - 1)) + 1 = (j ^ (k - 1)) + 1
+//  i = -((-j) ^ k) = -(~(j - 1) ^ k) = ~(~(j - 1) ^ k) + 1 = ((j - 1) ^ k) + 1
+// Computes general form:
+//  i = ((j - 1 + jc) ^ (k - 1 + kc)) + ic
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised j, k; assumes length j >= length k
+// Can have i, j, k pointing to same memory
 STATIC size_t mpn_xor_neg(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, const mpz_dig_t *kdig, size_t klen,
     mpz_dbl_dig_t carryi, mpz_dbl_dig_t carryj, mpz_dbl_dig_t carryk) {
     mpz_dig_t *oidig = idig;
@@ -406,10 +382,9 @@ STATIC size_t mpn_xor_neg(mpz_dig_t *idig, const mpz_dig_t *jdig, size_t jlen, c
     return mpn_remove_trailing_zeros(oidig, idig);
 }
 
-/* computes i = i * d1 + d2
-   returns number of digits in i
-   assumes enough memory in i; assumes normalised i; assumes dmul != 0
-*/
+// Computes i = i * dmul + dadd
+// Returns number of digits in i
+// Assumes enough memory in i; assumes normalised i; assumes dmul != 0
 STATIC size_t mpn_mul_dig_add_dig(mpz_dig_t *idig, size_t ilen, mpz_dig_t dmul, mpz_dig_t dadd) {
     mpz_dig_t *oidig = idig;
     mpz_dbl_dig_t carry = dadd;
@@ -427,11 +402,10 @@ STATIC size_t mpn_mul_dig_add_dig(mpz_dig_t *idig, size_t ilen, mpz_dig_t dmul,
     return idig - oidig;
 }
 
-/* computes i = j * k
-   returns number of digits in i
-   assumes enough memory in i; assumes i is zeroed; assumes normalised j, k
-   can have j, k point to same memory
-*/
+// Computes i = j * k
+// Returns number of digits in i
+// Assumes enough memory in i; assumes i is zeroed; assumes normalised j, k
+// Can have j, k point to same memory
 STATIC size_t mpn_mul(mpz_dig_t *idig, mpz_dig_t *jdig, size_t jlen, mpz_dig_t *kdig, size_t klen) {
     mpz_dig_t *oidig = idig;
     size_t ilen = 0;
@@ -457,12 +431,11 @@ STATIC size_t mpn_mul(mpz_dig_t *idig, mpz_dig_t *jdig, size_t jlen, mpz_dig_t *
     return ilen;
 }
 
-/* natural_div - quo * den + new_num = old_num (ie num is replaced with rem)
-   assumes den != 0
-   assumes num_dig has enough memory to be extended by 1 digit
-   assumes quo_dig has enough memory (as many digits as num)
-   assumes quo_dig is filled with zeros
-*/
+// natural_div - quo * den + new_num = old_num (ie num is replaced with rem)
+// Assumes den != 0
+// Assumes num_dig has enough memory to be extended by 1 digit
+// Assumes quo_dig has enough memory (as many digits as num)
+// Assumes quo_dig is filled with zeros
 STATIC void mpn_div(mpz_dig_t *num_dig, size_t *num_len, const mpz_dig_t *den_dig, size_t den_len, mpz_dig_t *quo_dig, size_t *quo_len) {
     mpz_dig_t *orig_num_dig = num_dig;
     mpz_dig_t *orig_quo_dig = quo_dig;
@@ -632,42 +605,6 @@ void mpz_deinit(mpz_t *z) {
     }
 }
 
-#if 0
-these functions are unused
-
-mpz_t *mpz_zero(void) {
-    mpz_t *z = m_new_obj(mpz_t);
-    mpz_init_zero(z);
-    return z;
-}
-
-mpz_t *mpz_from_int(mp_int_t val) {
-    mpz_t *z = mpz_zero();
-    mpz_set_from_int(z, val);
-    return z;
-}
-
-mpz_t *mpz_from_ll(long long val, bool is_signed) {
-    mpz_t *z = mpz_zero();
-    mpz_set_from_ll(z, val, is_signed);
-    return z;
-}
-
-#if MICROPY_PY_BUILTINS_FLOAT
-mpz_t *mpz_from_float(mp_float_t val) {
-    mpz_t *z = mpz_zero();
-    mpz_set_from_float(z, val);
-    return z;
-}
-#endif
-
-mpz_t *mpz_from_str(const char *str, size_t len, bool neg, unsigned int base) {
-    mpz_t *z = mpz_zero();
-    mpz_set_from_str(z, str, len, neg, base);
-    return z;
-}
-#endif
-
 STATIC void mpz_free(mpz_t *z) {
     if (z != NULL) {
         m_del(mpz_dig_t, z->dig, z->alloc);
@@ -701,17 +638,116 @@ STATIC mpz_t *mpz_clone(const mpz_t *src) {
     return z;
 }
 
-/* sets dest = src
-   can have dest, src the same
-*/
-void mpz_set(mpz_t *dest, const mpz_t *src) {
+// Sets dest = src
+// Can have dest, src the same
+STATIC void mpz_set(mpz_t *dest, const mpz_t *src) {
     mpz_need_dig(dest, src->len);
     dest->neg = src->neg;
     dest->len = src->len;
     memcpy(dest->dig, src->dig, src->len * sizeof(mpz_dig_t));
 }
 
-void mpz_set_from_int(mpz_t *z, mp_int_t val) {
+#ifndef MICROPY_WRAP_MPZ_SET_FROM_INT
+#define MICROPY_WRAP_MPZ_SET_FROM_INT(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_SET_FROM_LL
+#define MICROPY_WRAP_MPZ_SET_FROM_LL(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_SET_FROM_FLOAT
+#define MICROPY_WRAP_MPZ_SET_FROM_FLOAT(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_SET_FROM_STR
+#define MICROPY_WRAP_MPZ_SET_FROM_STR(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_SET_FROM_BYTES
+#define MICROPY_WRAP_MPZ_SET_FROM_BYTES(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_CMP
+#define MICROPY_WRAP_MPZ_CMP(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_ABS
+#define MICROPY_WRAP_MPZ_ABS(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_NEG
+#define MICROPY_WRAP_MPZ_NEG(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_NOT
+#define MICROPY_WRAP_MPZ_NOT(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_SHL
+#define MICROPY_WRAP_MPZ_SHL(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_SHR
+#define MICROPY_WRAP_MPZ_SHR(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_ADD
+#define MICROPY_WRAP_MPZ_ADD(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_SUB
+#define MICROPY_WRAP_MPZ_SUB(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_AND
+#define MICROPY_WRAP_MPZ_AND(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_OR
+#define MICROPY_WRAP_MPZ_OR(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_XOR
+#define MICROPY_WRAP_MPZ_XOR(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_MUL
+#define MICROPY_WRAP_MPZ_MUL(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_POW
+#define MICROPY_WRAP_MPZ_POW(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_POW3
+#define MICROPY_WRAP_MPZ_POW3(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_DIVMOD
+#define MICROPY_WRAP_MPZ_DIVMOD(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_HASH
+#define MICROPY_WRAP_MPZ_HASH(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_AS_INT
+#define MICROPY_WRAP_MPZ_AS_INT(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_AS_UINT
+#define MICROPY_WRAP_MPZ_AS_UINT(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_AS_BYTES
+#define MICROPY_WRAP_MPZ_AS_BYTES(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+#ifndef MICROPY_WRAP_MPZ_AS_STR
+#define MICROPY_WRAP_MPZ_AS_STR(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_4(f)
+#endif
+
+void MICROPY_WRAP_MPZ_SET_FROM_INT(mpz_set_from_int)(mpz_t * z, mp_int_t val) {
     if (val == 0) {
         z->neg = 0;
         z->len = 0;
@@ -736,7 +772,7 @@ void mpz_set_from_int(mpz_t *z, mp_int_t val) {
     }
 }
 
-void mpz_set_from_ll(mpz_t *z, long long val, bool is_signed) {
+void MICROPY_WRAP_MPZ_SET_FROM_LL(mpz_set_from_ll)(mpz_t * z, long long val, bool is_signed) {
     mpz_need_dig(z, MPZ_NUM_DIG_FOR_LL);
 
     unsigned long long uval;
@@ -756,7 +792,7 @@ void mpz_set_from_ll(mpz_t *z, long long val, bool is_signed) {
 }
 
 #if MICROPY_PY_BUILTINS_FLOAT
-void mpz_set_from_float(mpz_t *z, mp_float_t src) {
+void MICROPY_WRAP_MPZ_SET_FROM_FLOAT(mpz_set_from_float)(mpz_t * z, mp_float_t src) {
     mp_float_union_t u = {src};
     z->neg = u.p.sgn;
     if (u.p.exp == 0) {
@@ -813,8 +849,8 @@ void mpz_set_from_float(mpz_t *z, mp_float_t src) {
 }
 #endif
 
-// returns number of bytes from str that were processed
-size_t mpz_set_from_str(mpz_t *z, const char *str, size_t len, bool neg, unsigned int base) {
+// Returns number of bytes from str that were processed
+size_t MICROPY_WRAP_MPZ_SET_FROM_STR(mpz_set_from_str)(mpz_t * z, const char *str, size_t len, bool neg, unsigned int base) {
     assert(base <= 36);
 
     const char *cur = str;
@@ -850,7 +886,7 @@ size_t mpz_set_from_str(mpz_t *z, const char *str, size_t len, bool neg, unsigne
     return cur - str;
 }
 
-void mpz_set_from_bytes(mpz_t *z, bool big_endian, size_t len, const byte *buf) {
+void MICROPY_WRAP_MPZ_SET_FROM_BYTES(mpz_set_from_bytes)(mpz_t * z, bool big_endian, size_t len, const byte *buf) {
     int delta = 1;
     if (big_endian) {
         buf += len - 1;
@@ -883,23 +919,7 @@ void mpz_set_from_bytes(mpz_t *z, bool big_endian, size_t len, const byte *buf)
     z->len = mpn_remove_trailing_zeros(z->dig, z->dig + z->len);
 }
 
-#if 0
-these functions are unused
-
-bool mpz_is_pos(const mpz_t *z) {
-    return z->len > 0 && z->neg == 0;
-}
-
-bool mpz_is_odd(const mpz_t *z) {
-    return z->len > 0 && (z->dig[0] & 1) != 0;
-}
-
-bool mpz_is_even(const mpz_t *z) {
-    return z->len == 0 || (z->dig[0] & 1) == 0;
-}
-#endif
-
-int mpz_cmp(const mpz_t *z1, const mpz_t *z2) {
+int MICROPY_WRAP_MPZ_CMP(mpz_cmp)(const mpz_t * z1, const mpz_t *z2) {
     int cmp = (int)z2->neg - (int)z1->neg;
     if (cmp != 0) {
         return cmp;
@@ -911,141 +931,18 @@ int mpz_cmp(const mpz_t *z1, const mpz_t *z2) {
     return cmp;
 }
 
-#if 0
-// obsolete
-// compares mpz with an integer that fits within DIG_SIZE bits
-mp_int_t mpz_cmp_sml_int(const mpz_t *z, mp_int_t sml_int) {
-    mp_int_t cmp;
-    if (z->neg == 0) {
-        if (sml_int < 0) {
-            return 1;
-        }
-        if (sml_int == 0) {
-            if (z->len == 0) {
-                return 0;
-            }
-            return 1;
-        }
-        if (z->len == 0) {
-            return -1;
-        }
-        assert(sml_int < (1 << DIG_SIZE));
-        if (z->len != 1) {
-            return 1;
-        }
-        cmp = z->dig[0] - sml_int;
-    } else {
-        if (sml_int > 0) {
-            return -1;
-        }
-        if (sml_int == 0) {
-            if (z->len == 0) {
-                return 0;
-            }
-            return -1;
-        }
-        if (z->len == 0) {
-            return 1;
-        }
-        assert(sml_int > -(1 << DIG_SIZE));
-        if (z->len != 1) {
-            return -1;
-        }
-        cmp = -z->dig[0] - sml_int;
-    }
-    if (cmp < 0) {
-        return -1;
-    }
-    if (cmp > 0) {
-        return 1;
-    }
-    return 0;
-}
-#endif
-
-#if 0
-these functions are unused
-
-/* returns abs(z)
-*/
-mpz_t *mpz_abs(const mpz_t *z) {
-    // TODO: handle case of z->alloc=0
-    mpz_t *z2 = mpz_clone(z);
-    z2->neg = 0;
-    return z2;
-}
-
-/* returns -z
-*/
-mpz_t *mpz_neg(const mpz_t *z) {
-    // TODO: handle case of z->alloc=0
-    mpz_t *z2 = mpz_clone(z);
-    z2->neg = 1 - z2->neg;
-    return z2;
-}
-
-/* returns lhs + rhs
-   can have lhs, rhs the same
-*/
-mpz_t *mpz_add(const mpz_t *lhs, const mpz_t *rhs) {
-    mpz_t *z = mpz_zero();
-    mpz_add_inpl(z, lhs, rhs);
-    return z;
-}
-
-/* returns lhs - rhs
-   can have lhs, rhs the same
-*/
-mpz_t *mpz_sub(const mpz_t *lhs, const mpz_t *rhs) {
-    mpz_t *z = mpz_zero();
-    mpz_sub_inpl(z, lhs, rhs);
-    return z;
-}
-
-/* returns lhs * rhs
-   can have lhs, rhs the same
-*/
-mpz_t *mpz_mul(const mpz_t *lhs, const mpz_t *rhs) {
-    mpz_t *z = mpz_zero();
-    mpz_mul_inpl(z, lhs, rhs);
-    return z;
-}
-
-/* returns lhs ** rhs
-   can have lhs, rhs the same
-*/
-mpz_t *mpz_pow(const mpz_t *lhs, const mpz_t *rhs) {
-    mpz_t *z = mpz_zero();
-    mpz_pow_inpl(z, lhs, rhs);
-    return z;
-}
-
-/* computes new integers in quo and rem such that:
-       quo * rhs + rem = lhs
-       0 <= rem < rhs
-   can have lhs, rhs the same
-*/
-void mpz_divmod(const mpz_t *lhs, const mpz_t *rhs, mpz_t **quo, mpz_t **rem) {
-    *quo = mpz_zero();
-    *rem = mpz_zero();
-    mpz_divmod_inpl(*quo, *rem, lhs, rhs);
-}
-#endif
-
-/* computes dest = abs(z)
-   can have dest, z the same
-*/
-void mpz_abs_inpl(mpz_t *dest, const mpz_t *z) {
+// Computes dest = abs(z)
+// Can have dest, z the same
+void MICROPY_WRAP_MPZ_ABS(mpz_abs_inpl)(mpz_t * dest, const mpz_t *z) {
     if (dest != z) {
         mpz_set(dest, z);
     }
     dest->neg = 0;
 }
 
-/* computes dest = -z
-   can have dest, z the same
-*/
-void mpz_neg_inpl(mpz_t *dest, const mpz_t *z) {
+// Computes dest = -z
+// Can have dest, z the same
+void MICROPY_WRAP_MPZ_NEG(mpz_neg_inpl)(mpz_t * dest, const mpz_t *z) {
     if (dest != z) {
         mpz_set(dest, z);
     }
@@ -1054,10 +951,9 @@ void mpz_neg_inpl(mpz_t *dest, const mpz_t *z) {
     }
 }
 
-/* computes dest = ~z (= -z - 1)
-   can have dest, z the same
-*/
-void mpz_not_inpl(mpz_t *dest, const mpz_t *z) {
+// Computes dest = ~z (= -z - 1)
+// Can have dest, z the same
+void MICROPY_WRAP_MPZ_NOT(mpz_not_inpl)(mpz_t * dest, const mpz_t *z) {
     if (dest != z) {
         mpz_set(dest, z);
     }
@@ -1078,10 +974,9 @@ void mpz_not_inpl(mpz_t *dest, const mpz_t *z) {
     }
 }
 
-/* computes dest = lhs << rhs
-   can have dest, lhs the same
-*/
-void mpz_shl_inpl(mpz_t *dest, const mpz_t *lhs, mp_uint_t rhs) {
+// Computes dest = lhs << rhs
+// Can have dest, lhs the same
+void MICROPY_WRAP_MPZ_SHL(mpz_shl_inpl)(mpz_t * dest, const mpz_t *lhs, mp_uint_t rhs) {
     if (lhs->len == 0 || rhs == 0) {
         mpz_set(dest, lhs);
     } else {
@@ -1091,10 +986,9 @@ void mpz_shl_inpl(mpz_t *dest, const mpz_t *lhs, mp_uint_t rhs) {
     }
 }
 
-/* computes dest = lhs >> rhs
-   can have dest, lhs the same
-*/
-void mpz_shr_inpl(mpz_t *dest, const mpz_t *lhs, mp_uint_t rhs) {
+// Computes dest = lhs >> rhs
+// Can have dest, lhs the same
+void MICROPY_WRAP_MPZ_SHR(mpz_shr_inpl)(mpz_t * dest, const mpz_t *lhs, mp_uint_t rhs) {
     if (lhs->len == 0 || rhs == 0) {
         mpz_set(dest, lhs);
     } else {
@@ -1129,10 +1023,9 @@ void mpz_shr_inpl(mpz_t *dest, const mpz_t *lhs, mp_uint_t rhs) {
     }
 }
 
-/* computes dest = lhs + rhs
-   can have dest, lhs, rhs the same
-*/
-void mpz_add_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
+// Computes dest = lhs + rhs
+// Can have dest, lhs, rhs the same
+void MICROPY_WRAP_MPZ_ADD(mpz_add_inpl)(mpz_t * dest, const mpz_t *lhs, const mpz_t *rhs) {
     if (mpn_cmp(lhs->dig, lhs->len, rhs->dig, rhs->len) < 0) {
         const mpz_t *temp = lhs;
         lhs = rhs;
@@ -1150,10 +1043,9 @@ void mpz_add_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
     dest->neg = lhs->neg & !!dest->len;
 }
 
-/* computes dest = lhs - rhs
-   can have dest, lhs, rhs the same
-*/
-void mpz_sub_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
+// Computes dest = lhs - rhs
+// Can have dest, lhs, rhs the same
+void MICROPY_WRAP_MPZ_SUB(mpz_sub_inpl)(mpz_t * dest, const mpz_t *lhs, const mpz_t *rhs) {
     bool neg = false;
 
     if (mpn_cmp(lhs->dig, lhs->len, rhs->dig, rhs->len) < 0) {
@@ -1180,10 +1072,9 @@ void mpz_sub_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
     }
 }
 
-/* computes dest = lhs & rhs
-   can have dest, lhs, rhs the same
-*/
-void mpz_and_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
+// Computes dest = lhs & rhs
+// Can have dest, lhs, rhs the same
+void MICROPY_WRAP_MPZ_AND(mpz_and_inpl)(mpz_t * dest, const mpz_t *lhs, const mpz_t *rhs) {
     // make sure lhs has the most digits
     if (lhs->len < rhs->len) {
         const mpz_t *temp = lhs;
@@ -1214,10 +1105,9 @@ void mpz_and_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
     #endif
 }
 
-/* computes dest = lhs | rhs
-   can have dest, lhs, rhs the same
-*/
-void mpz_or_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
+// Computes dest = lhs | rhs
+// Can have dest, lhs, rhs the same
+void MICROPY_WRAP_MPZ_OR(mpz_or_inpl)(mpz_t * dest, const mpz_t *lhs, const mpz_t *rhs) {
     // make sure lhs has the most digits
     if (lhs->len < rhs->len) {
         const mpz_t *temp = lhs;
@@ -1248,10 +1138,9 @@ void mpz_or_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
     #endif
 }
 
-/* computes dest = lhs ^ rhs
-   can have dest, lhs, rhs the same
-*/
-void mpz_xor_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
+// Computes dest = lhs ^ rhs
+// Can have dest, lhs, rhs the same
+void MICROPY_WRAP_MPZ_XOR(mpz_xor_inpl)(mpz_t * dest, const mpz_t *lhs, const mpz_t *rhs) {
     // make sure lhs has the most digits
     if (lhs->len < rhs->len) {
         const mpz_t *temp = lhs;
@@ -1286,10 +1175,9 @@ void mpz_xor_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
     #endif
 }
 
-/* computes dest = lhs * rhs
-   can have dest, lhs, rhs the same
-*/
-void mpz_mul_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
+// Computes dest = lhs * rhs
+// Can have dest, lhs, rhs the same
+void MICROPY_WRAP_MPZ_MUL(mpz_mul_inpl)(mpz_t * dest, const mpz_t *lhs, const mpz_t *rhs) {
     if (lhs->len == 0 || rhs->len == 0) {
         mpz_set_from_int(dest, 0);
         return;
@@ -1318,10 +1206,9 @@ void mpz_mul_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
     mpz_free(temp);
 }
 
-/* computes dest = lhs ** rhs
-   can have dest, lhs, rhs the same
-*/
-void mpz_pow_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
+// Computes dest = lhs ** rhs
+// Can have dest, lhs, rhs the same
+void MICROPY_WRAP_MPZ_POW(mpz_pow_inpl)(mpz_t * dest, const mpz_t *lhs, const mpz_t *rhs) {
     if (lhs->len == 0 || rhs->neg != 0) {
         mpz_set_from_int(dest, 0);
         return;
@@ -1352,10 +1239,9 @@ void mpz_pow_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
     mpz_free(n);
 }
 
-/* computes dest = (lhs ** rhs) % mod
-   can have dest, lhs, rhs the same; mod can't be the same as dest
-*/
-void mpz_pow3_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs, const mpz_t *mod) {
+// Computes dest = (lhs ** rhs) % mod
+// Can have dest, lhs, rhs the same; mod can't be the same as dest
+void MICROPY_WRAP_MPZ_POW3(mpz_pow3_inpl)(mpz_t * dest, const mpz_t *lhs, const mpz_t *rhs, const mpz_t *mod) {
     if (lhs->len == 0 || rhs->neg != 0 || (mod->len == 1 && mod->dig[0] == 1)) {
         mpz_set_from_int(dest, 0);
         return;
@@ -1390,97 +1276,12 @@ void mpz_pow3_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs, const mpz_t
     mpz_free(n);
 }
 
-#if 0
-these functions are unused
-
-/* computes gcd(z1, z2)
-   based on Knuth's modified gcd algorithm (I think?)
-   gcd(z1, z2) >= 0
-   gcd(0, 0) = 0
-   gcd(z, 0) = abs(z)
-*/
-mpz_t *mpz_gcd(const mpz_t *z1, const mpz_t *z2) {
-    if (z1->len == 0) {
-        // TODO: handle case of z2->alloc=0
-        mpz_t *a = mpz_clone(z2);
-        a->neg = 0;
-        return a;
-    } else if (z2->len == 0) {
-        mpz_t *a = mpz_clone(z1);
-        a->neg = 0;
-        return a;
-    }
-
-    mpz_t *a = mpz_clone(z1);
-    mpz_t *b = mpz_clone(z2);
-    mpz_t c;
-    mpz_init_zero(&c);
-    a->neg = 0;
-    b->neg = 0;
-
-    for (;;) {
-        if (mpz_cmp(a, b) < 0) {
-            if (a->len == 0) {
-                mpz_free(a);
-                mpz_deinit(&c);
-                return b;
-            }
-            mpz_t *t = a;
-            a = b;
-            b = t;
-        }
-        if (!(b->len >= 2 || (b->len == 1 && b->dig[0] > 1))) { // compute b > 0; could be mpz_cmp_small_int(b, 1) > 0
-            break;
-        }
-        mpz_set(&c, b);
-        do {
-            mpz_add_inpl(&c, &c, &c);
-        } while (mpz_cmp(&c, a) <= 0);
-        c.len = mpn_shr(c.dig, c.dig, c.len, 1);
-        mpz_sub_inpl(a, a, &c);
-    }
-
-    mpz_deinit(&c);
-
-    if (b->len == 1 && b->dig[0] == 1) { // compute b == 1; could be mpz_cmp_small_int(b, 1) == 0
-        mpz_free(a);
-        return b;
-    } else {
-        mpz_free(b);
-        return a;
-    }
-}
-
-/* computes lcm(z1, z2)
-     = abs(z1) / gcd(z1, z2) * abs(z2)
-  lcm(z1, z1) >= 0
-  lcm(0, 0) = 0
-  lcm(z, 0) = 0
-*/
-mpz_t *mpz_lcm(const mpz_t *z1, const mpz_t *z2) {
-    if (z1->len == 0 || z2->len == 0) {
-        return mpz_zero();
-    }
-
-    mpz_t *gcd = mpz_gcd(z1, z2);
-    mpz_t *quo = mpz_zero();
-    mpz_t *rem = mpz_zero();
-    mpz_divmod_inpl(quo, rem, z1, gcd);
-    mpz_mul_inpl(rem, quo, z2);
-    mpz_free(gcd);
-    mpz_free(quo);
-    rem->neg = 0;
-    return rem;
-}
-#endif
-
-/* computes new integers in quo and rem such that:
-       quo * rhs + rem = lhs
-       0 <= rem < rhs
-   can have lhs, rhs the same
-   assumes rhs != 0 (undefined behaviour if it is)
-*/
-void mpz_divmod_inpl(mpz_t *dest_quo, mpz_t *dest_rem, const mpz_t *lhs, const mpz_t *rhs) {
+// Computes quo and rem, stored in dest_quo and dest_rem, such that:
+//  quo * rhs + rem = lhs
+//  0 <= rem < rhs
+// Can have lhs, rhs the same
+// Assumes rhs != 0 (undefined behaviour if it is)
+void MICROPY_WRAP_MPZ_DIVMOD(mpz_divmod_inpl)(mpz_t * dest_quo, mpz_t *dest_rem, const mpz_t *lhs, const mpz_t *rhs) {
     assert(!mpz_is_zero(rhs));
 
     mpz_need_dig(dest_quo, lhs->len + 1); // +1 necessary?
@@ -1504,36 +1305,8 @@ void mpz_divmod_inpl(mpz_t *dest_quo, mpz_t *dest_rem, const mpz_t *lhs, const m
     }
 }
 
-#if 0
-these functions are unused
-
-/* computes floor(lhs / rhs)
-   can have lhs, rhs the same
-*/
-mpz_t *mpz_div(const mpz_t *lhs, const mpz_t *rhs) {
-    mpz_t *quo = mpz_zero();
-    mpz_t rem;
-    mpz_init_zero(&rem);
-    mpz_divmod_inpl(quo, &rem, lhs, rhs);
-    mpz_deinit(&rem);
-    return quo;
-}
-
-/* computes lhs % rhs ( >= 0)
-   can have lhs, rhs the same
-*/
-mpz_t *mpz_mod(const mpz_t *lhs, const mpz_t *rhs) {
-    mpz_t quo;
-    mpz_init_zero(&quo);
-    mpz_t *rem = mpz_zero();
-    mpz_divmod_inpl(&quo, rem, lhs, rhs);
-    mpz_deinit(&quo);
-    return rem;
-}
-#endif
-
-// must return actual int value if it fits in mp_int_t
-mp_int_t mpz_hash(const mpz_t *z) {
+// Must return actual int value if it fits in mp_int_t
+mp_int_t MICROPY_WRAP_MPZ_HASH(mpz_hash)(const mpz_t * z) {
     mp_uint_t val = 0;
     mpz_dig_t *d = z->dig + z->len;
 
@@ -1548,7 +1321,7 @@ mp_int_t mpz_hash(const mpz_t *z) {
     return val;
 }
 
-bool mpz_as_int_checked(const mpz_t *i, mp_int_t *value) {
+bool MICROPY_WRAP_MPZ_AS_INT(mpz_as_int_checked)(const mpz_t * i, mp_int_t *value) {
     mp_uint_t val = 0;
     mpz_dig_t *d = i->dig + i->len;
 
@@ -1568,7 +1341,7 @@ bool mpz_as_int_checked(const mpz_t *i, mp_int_t *value) {
     return true;
 }
 
-bool mpz_as_uint_checked(const mpz_t *i, mp_uint_t *value) {
+bool MICROPY_WRAP_MPZ_AS_UINT(mpz_as_uint_checked)(const mpz_t * i, mp_uint_t *value) {
     if (i->neg != 0) {
         // can't represent signed values
         return false;
@@ -1589,7 +1362,7 @@ bool mpz_as_uint_checked(const mpz_t *i, mp_uint_t *value) {
     return true;
 }
 
-void mpz_as_bytes(const mpz_t *z, bool big_endian, size_t len, byte *buf) {
+void MICROPY_WRAP_MPZ_AS_BYTES(mpz_as_bytes)(const mpz_t * z, bool big_endian, size_t len, byte *buf) {
     byte *b = buf;
     if (big_endian) {
         b += len;
@@ -1648,19 +1421,10 @@ mp_float_t mpz_as_float(const mpz_t *i) {
 }
 #endif
 
-#if 0
-this function is unused
-char *mpz_as_str(const mpz_t *i, unsigned int base) {
-    char *s = m_new(char, mp_int_format_size(mpz_max_num_bits(i), base, NULL, '\0'));
-    mpz_as_str_inpl(i, base, NULL, 'a', '\0', s);
-    return s;
-}
-#endif
-
-// assumes enough space in str as calculated by mp_int_format_size
+// Assumes enough space in str as calculated by mp_int_format_size
 // base must be between 2 and 32 inclusive
-// returns length of string, not including null byte
-size_t mpz_as_str_inpl(const mpz_t *i, unsigned int base, const char *prefix, char base_char, char comma, char *str) {
+// Returns length of string, not including null byte
+size_t MICROPY_WRAP_MPZ_AS_STR(mpz_as_str_inpl)(const mpz_t * i, unsigned int base, const char *prefix, char base_char, char comma, char *str) {
     assert(str != NULL);
     assert(2 <= base && base <= 32);
 
diff --git a/py/mpz.h b/py/mpz.h
index d27f5724047ae..d01d646ccefd3 100644
--- a/py/mpz.h
+++ b/py/mpz.h
@@ -107,7 +107,6 @@ void mpz_init_from_int(mpz_t *z, mp_int_t val);
 void mpz_init_fixed_from_int(mpz_t *z, mpz_dig_t *dig, size_t dig_alloc, mp_int_t val);
 void mpz_deinit(mpz_t *z);
 
-void mpz_set(mpz_t *dest, const mpz_t *src);
 void mpz_set_from_int(mpz_t *z, mp_int_t src);
 void mpz_set_from_ll(mpz_t *z, long long i, bool is_signed);
 #if MICROPY_PY_BUILTINS_FLOAT
diff --git a/py/obj.c b/py/obj.c
index 5e01198b6fb4c..6b620170739d1 100644
--- a/py/obj.c
+++ b/py/obj.c
@@ -44,6 +44,10 @@ MP_NOINLINE void *mp_obj_malloc_helper(size_t num_bytes, const mp_obj_type_t *ty
     return base;
 }
 
+#ifndef MICROPY_WRAP_MP_OBJ_GET_TYPE
+#define MICROPY_WRAP_MP_OBJ_GET_TYPE(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+#endif
+
 const mp_obj_type_t *MICROPY_WRAP_MP_OBJ_GET_TYPE(mp_obj_get_type)(mp_const_obj_t o_in) {
     #if MICROPY_OBJ_IMMEDIATE_OBJS && MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_A
 
diff --git a/py/py.mk b/py/py.mk
index e81df52fb7f99..18beed3452961 100644
--- a/py/py.mk
+++ b/py/py.mk
@@ -18,9 +18,6 @@ endif
 QSTR_GLOBAL_DEPENDENCIES += $(PY_SRC)/mpconfig.h mpconfigport.h
 QSTR_GLOBAL_REQUIREMENTS += $(HEADER_BUILD)/mpversion.h
 
-# some code is performance bottleneck and compiled with other optimization options
-CSUPEROPT = -O3
-
 # Enable building 32-bit code on 64-bit host.
 ifeq ($(MICROPY_FORCE_32BIT),1)
 CC += -m32
@@ -254,17 +251,3 @@ $(BUILD)/shared/libc/string0.o: CFLAGS += $(CFLAGS_BUILTIN)
 # Force nlr code to always be compiled with space-saving optimisation so
 # that the function preludes are of a minimal and predictable form.
 $(PY_BUILD)/nlr%.o: CFLAGS += -Os
-
-# optimising gc for speed; 5ms down to 4ms on pybv2
-$(PY_BUILD)/gc.o: CFLAGS += $(CSUPEROPT)
-
-# optimising vm for speed, adds only a small amount to code size but makes a huge difference to speed (20% faster)
-$(PY_BUILD)/vm.o: CFLAGS += $(CSUPEROPT)
-# Optimizing vm.o for modern deeply pipelined CPUs with branch predictors
-# may require disabling tail jump optimization. This will make sure that
-# each opcode has its own dispatching jump which will improve branch
-# branch predictor efficiency.
-# https://marc.info/?l=lua-l&m=129778596120851
-# http://hg.python.org/cpython/file/b127046831e2/Python/ceval.c#l828
-# http://www.emulators.com/docs/nx25_nostradamus.htm
-#-fno-crossjumping
diff --git a/py/runtime.c b/py/runtime.c
index 6d8eddedc8646..5342dd9251bca 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -203,6 +203,10 @@ void mp_call_function_1_from_nlr_jump_callback(void *ctx_in) {
     ctx->func(ctx->arg);
 }
 
+#ifndef MICROPY_WRAP_MP_LOAD_NAME
+#define MICROPY_WRAP_MP_LOAD_NAME(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+#endif
+
 mp_obj_t MICROPY_WRAP_MP_LOAD_NAME(mp_load_name)(qstr qst) {
     // logic: search locals, globals, builtins
     DEBUG_OP_printf("load name %s\n", qstr_str(qst));
@@ -216,6 +220,10 @@ mp_obj_t MICROPY_WRAP_MP_LOAD_NAME(mp_load_name)(qstr qst) {
     return mp_load_global(qst);
 }
 
+#ifndef MICROPY_WRAP_MP_LOAD_GLOBAL
+#define MICROPY_WRAP_MP_LOAD_GLOBAL(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+#endif
+
 mp_obj_t MICROPY_WRAP_MP_LOAD_GLOBAL(mp_load_global)(qstr qst) {
     // logic: search globals, builtins
     DEBUG_OP_printf("load global %s\n", qstr_str(qst));
@@ -360,6 +368,10 @@ mp_obj_t mp_unary_op(mp_unary_op_t op, mp_obj_t arg) {
     }
 }
 
+#ifndef MICROPY_WRAP_MP_BINARY_OP
+#define MICROPY_WRAP_MP_BINARY_OP(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_2(f)
+#endif
+
 mp_obj_t MICROPY_WRAP_MP_BINARY_OP(mp_binary_op)(mp_binary_op_t op, mp_obj_t lhs, mp_obj_t rhs) {
     DEBUG_OP_printf("binary " UINT_FMT " %q %p %p\n", op, mp_binary_op_method_name[op], lhs, rhs);
 
diff --git a/py/scheduler.c b/py/scheduler.c
index 3eae8b4fa366c..682b8dbd70b6c 100644
--- a/py/scheduler.c
+++ b/py/scheduler.c
@@ -29,6 +29,10 @@
 #include "py/mphal.h"
 #include "py/runtime.h"
 
+#ifndef MICROPY_WRAP_MP_SCHED_EXCEPTION
+#define MICROPY_WRAP_MP_SCHED_EXCEPTION(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+#endif
+
 // Schedules an exception on the main thread (for exceptions "thrown" by async
 // sources such as interrupts and UNIX signal handlers).
 void MICROPY_WRAP_MP_SCHED_EXCEPTION(mp_sched_exception)(mp_obj_t exc) {
@@ -45,6 +49,10 @@ void MICROPY_WRAP_MP_SCHED_EXCEPTION(mp_sched_exception)(mp_obj_t exc) {
 }
 
 #if MICROPY_KBD_EXCEPTION
+#ifndef MICROPY_WRAP_MP_SCHED_KEYBOARD_INTERRUPT
+#define MICROPY_WRAP_MP_SCHED_KEYBOARD_INTERRUPT(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+#endif
+
 // This function may be called asynchronously at any time so only do the bare minimum.
 void MICROPY_WRAP_MP_SCHED_KEYBOARD_INTERRUPT(mp_sched_keyboard_interrupt)(void) {
     MP_STATE_VM(mp_kbd_exception).traceback_data = NULL;
@@ -53,6 +61,10 @@ void MICROPY_WRAP_MP_SCHED_KEYBOARD_INTERRUPT(mp_sched_keyboard_interrupt)(void)
 #endif
 
 #if MICROPY_ENABLE_VM_ABORT
+#ifndef MICROPY_WRAP_MP_SCHED_VM_ABORT
+#define MICROPY_WRAP_MP_SCHED_VM_ABORT(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+#endif
+
 void MICROPY_WRAP_MP_SCHED_VM_ABORT(mp_sched_vm_abort)(void) {
     MP_STATE_VM(vm_abort) = true;
 }
@@ -156,6 +168,10 @@ void mp_sched_unlock(void) {
     MICROPY_END_ATOMIC_SECTION(atomic_state);
 }
 
+#ifndef MICROPY_WRAP_MP_SCHED_SCHEDULE
+#define MICROPY_WRAP_MP_SCHED_SCHEDULE(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+#endif
+
 bool MICROPY_WRAP_MP_SCHED_SCHEDULE(mp_sched_schedule)(mp_obj_t function, mp_obj_t arg) {
     mp_uint_t atomic_state = MICROPY_BEGIN_ATOMIC_SECTION();
     bool ret;
diff --git a/py/vm.c b/py/vm.c
index a7902d9276732..9e816b4e80bf1 100644
--- a/py/vm.c
+++ b/py/vm.c
@@ -195,6 +195,21 @@
 #define TRACE_TICK(current_ip, current_sp, is_exception)
 #endif // MICROPY_PY_SYS_SETTRACE
 
+#ifndef MICROPY_WRAP_MP_EXECUTE_BYTECODE
+// Using -O3 (rather than -Os) adds only a small amount to code size but makes a huge difference to speed (20% faster)
+#define MICROPY_WRAP_MP_EXECUTE_BYTECODE(f) MICROPY_PERFORMANCE_CRITICAL_LEVEL_1(f)
+
+// Note:
+// Optimizing vm.o for modern deeply pipelined CPUs with branch predictors
+// may require disabling tail jump optimization. This will make sure that
+// each opcode has its own dispatching jump which will improve
+// branch predictor efficiency.
+// https://marc.info/?l=lua-l&m=129778596120851
+// http://hg.python.org/cpython/file/b127046831e2/Python/ceval.c#l828
+// http://www.emulators.com/docs/nx25_nostradamus.htm
+// Compiler flag to try for this: -fno-crossjumping
+#endif
+
 // fastn has items in reverse order (fastn[0] is local[0], fastn[-1] is local[1], etc)
 // sp points to bottom of stack which grows up
 // returns: