micropython · Gadgetoid · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/ports/rp2/mpconfigport.h b/ports/rp2/mpconfigport.h
@@ -97,6 +97,7 @@
 #endif
 #define MICROPY_ALLOC_PATH_MAX                  (128)
 #define MICROPY_QSTR_BYTES_IN_HASH              (1)
+#define MICROPY_GC_FAST_TABLE_SCANS             (1)
 
 // MicroPython emitters
 #define MICROPY_PERSISTENT_CODE_LOAD            (1)

diff --git a/py/gc.c b/py/gc.c
@@ -78,6 +78,38 @@
 #define ATB_2_IS_FREE(a) (((a) & ATB_MASK_2) == 0)
 #define ATB_3_IS_FREE(a) (((a) & ATB_MASK_3) == 0)
 
+#if MICROPY_GC_FAST_TABLE_SCANS
+// An ATB byte/word that is entirely AT_TAIL (0b10) entries: four / sixteen
+// consecutive tail blocks, i.e. the interior of a large allocation. Used by
+// the sweep to coalesce long tail runs instead of walking them block by block.
+#define ATB_ALL_TAIL_BYTE (0xaa)
+#define ATB_ALL_TAIL_WORD (0xaaaaaaaaUL)
+// An ATB byte/word that is entirely AT_FREE (0b00): four / sixteen consecutive
+// free blocks, i.e. a large gap (e.g. a freed buffer below still-live data).
+// The sweep coalesces these the same way it coalesces all-tail runs.
+#define ATB_ALL_FREE_BYTE (0x00)
+#define ATB_ALL_FREE_WORD (0x00000000UL)
+#define BLOCKS_PER_ATB_WORD (BLOCKS_PER_ATB * 4)
+
+// Read four ATB bytes as a word during the sweep's tail-run fast path.
+// We deliberately avoid `*(uint32_t *)atb`: the allocation table is a byte
+// array, so casting it to uint32_t* violates strict aliasing, and py/ is built
+// with strict aliasing enabled on most ports - an optimiser would then be free
+// to assume this word read doesn't alias the byte stores to the same table and
+// reorder them. memcpy is the well-defined idiom; with the alignment hint (the
+// caller guarantees `atb` is word-aligned, via the byte-step dance) the compiler
+// lowers it to a single aligned load, so it is as fast as the cast and never
+// triggers an unaligned-access fault on Cortex-M0+ / Xtensa / RISC-V.
+static inline uint32_t gc_read_atb_word(const byte *atb) {
+    #if defined(__GNUC__)
+    atb = __builtin_assume_aligned(atb, sizeof(uint32_t));
+    #endif
+    uint32_t w;
+    memcpy(&w, atb, sizeof(w));
+    return w;
+}
+#endif
+
 #if MICROPY_GC_SPLIT_HEAP
 #define NEXT_AREA(area) ((area)->next)
 #else
@@ -625,6 +657,22 @@ static void gc_sweep_run_finalisers(void) {
         // Small speed optimisation: skip over empty FTB blocks
         size_t ftb_end = area->gc_last_used_block / BLOCKS_PER_FTB; // index is inclusive
         for (size_t ftb_idx = 0; ftb_idx <= ftb_end; ftb_idx++) {
+            #if MICROPY_GC_FAST_TABLE_SCANS && MICROPY_ENABLE_FINALISER && !MICROPY_PY_WEAKREF
+            // Sibling of the sweep's uniform-run coalescing, for the finaliser
+            // table: skip spans with no finaliser bits a word (32 blocks) at a
+            // time. The FTB over a large pure-data buffer is all zero but is
+            // otherwise scanned byte-by-byte up to the high-water mark (costly on
+            // a PSRAM heap). Aligned word loads only (byte-walk to alignment via
+            // the for-loop first); a non-zero word always falls through to the
+            // per-byte handler, so finaliser execution is unaffected. Restricted
+            // to the FTB-only case (weakref disabled), the table walked here.
+            if (((uintptr_t)&area->gc_finaliser_table_start[ftb_idx] & 3) == 0
+                && ftb_idx + 3 <= ftb_end
+                && gc_read_atb_word(&area->gc_finaliser_table_start[ftb_idx]) == 0) {
+                ftb_idx += 3; // the for-loop's ++ advances a full 4-byte word
+                continue;
+            }
+            #endif
             #if MICROPY_ENABLE_FINALISER
             byte ftb = area->gc_finaliser_table_start[ftb_idx];
             size_t block = ftb_idx * BLOCKS_PER_FTB;
@@ -689,10 +737,68 @@ static void gc_sweep_free_blocks(void) {
 
     for (mp_state_mem_area_t *area = &MP_STATE_MEM(area); area != NULL; area = NEXT_AREA(area)) {
         size_t last_used_block = 0;
-        assert(area->gc_last_used_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB);
+        size_t end_block = area->gc_last_used_block;
+        assert(end_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB);
 
-        for (size_t block = 0; block <= area->gc_last_used_block; block++) {
+        for (size_t block = 0; block <= end_block; block++) {
             MICROPY_GC_HOOK_LOOP(block);
+
+            // Fast path: coalesce long runs of tail blocks (the body of a large
+            // allocation) a whole ATB word/byte at a time instead of block by
+            // block. A run of AT_TAIL never contains a HEAD/MARK, so free_tail is
+            // constant across it; an all-tail word is therefore wholly live or
+            // wholly dead. This is the dominant sweep cost for multi-block buffers.
+            #if MICROPY_GC_FAST_TABLE_SCANS
+            if ((block & (BLOCKS_PER_ATB - 1)) == 0) {
+                byte *atb = &area->gc_alloc_table_start[block / BLOCKS_PER_ATB];
+                // Coalesce the run: word-step (16 blocks) when the ATB pointer is
+                // word-aligned and a full all-tail word remains, otherwise byte-step
+                // (4 blocks). The byte steps also walk up to alignment and mop up the
+                // final partial word. The word read only runs when atb is aligned, as
+                // an unaligned access would fault on Cortex-M0+ / RISC-V. In both
+                // cases: free the run if dead (free_tail), else extend last_used_block.
+                // Coalesce both all-tail runs (allocation bodies) and all-free
+                // runs (large gaps below live data). An all-tail byte/word is freed
+                // when dead (free_tail) else extends last_used_block; an all-free
+                // byte/word needs no action - it is already free and not "used".
+                while (block + BLOCKS_PER_ATB - 1 <= end_block
+                       && (*atb == ATB_ALL_TAIL_BYTE || *atb == ATB_ALL_FREE_BYTE)) {
+                    int is_tail = (*atb == ATB_ALL_TAIL_BYTE);
+                    if (((uintptr_t)atb & 3) == 0
+                        && block + BLOCKS_PER_ATB_WORD - 1 <= end_block) {
+                        uint32_t w = gc_read_atb_word(atb);
+                        if (w == ATB_ALL_TAIL_WORD || w == ATB_ALL_FREE_WORD) {
+                            // Long runs are swept here; let ports run their GC-loop hook.
+                            MICROPY_GC_HOOK_LOOP(block);
+                            if (w == ATB_ALL_TAIL_WORD) {
+                                if (free_tail) {
+                                    memset(atb, 0, sizeof(uint32_t));
+                                } else {
+                                    last_used_block = block + BLOCKS_PER_ATB_WORD - 1;
+                                }
+                            }
+                            block += BLOCKS_PER_ATB_WORD;
+                            atb += 4;
+                            continue;
+                        }
+                        // aligned but a mixed word: fall to the byte step
+                    }
+                    if (is_tail) {
+                        if (free_tail) {
+                            *atb = 0;
+                        } else {
+                            last_used_block = block + BLOCKS_PER_ATB - 1;
+                        }
+                    }
+                    block += BLOCKS_PER_ATB;
+                    atb += 1;
+                }
+                if (block > end_block) {
+                    break;
+                }
+            }
+            #endif
+
             switch (ATB_GET_KIND(area, block)) {
                 case AT_HEAD:
                     free_tail = 1;

diff --git a/py/mpconfig.h b/py/mpconfig.h
@@ -780,6 +780,15 @@ typedef uint64_t mp_uint_t;
 #define MICROPY_GC_SPLIT_HEAP_AUTO (0)
 #endif
 
+// Whether the GC scans its per-block tables a word (16/32 blocks) at a time
+// instead of one block at a time, where a whole word is uniform. Covers the
+// sweep's allocation-table tail-block and free runs, and the finaliser-table
+// scan. Greatly speeds collection when large pure-data buffers (e.g. multi-
+// KB/MB bytearrays) or large free gaps are present; costs a little code.
+#ifndef MICROPY_GC_FAST_TABLE_SCANS
+#define MICROPY_GC_FAST_TABLE_SCANS (0)
+#endif
+
 // Hook to run code during time consuming garbage collector operations
 // *i* is the loop index variable (e.g. can be used to run every x loops)
 #ifndef MICROPY_GC_HOOK_LOOP