From dafd3a25b961c90a0e6dbfc0512c7d04087c6de9 Mon Sep 17 00:00:00 2001
From: Phil Howard <github@gadgetoid.com>
Date: Mon, 22 Jun 2026 17:54:57 +0100
Subject: [PATCH 1/4] py/gc: Add fast tail-block sweep behind
 MICROPY_GC_FAST_TABLE_SCANS.

The GC sweep walks the allocation table one 16-byte block at a time.
The interior of a large allocation is a long run of AT_TAIL entries: an
ATB byte of 0xaa is four tail blocks, and a 32-bit word of 0xaaaaaaaa is
sixteen.

Such a run never contains a HEAD/MARK, so it is wholly live or wholly
dead and can be processed a word at a time - free the whole word when
dead, or just advance the high-water mark when live - instead of
touching every block. This is the dominant sweep cost for large
pure-data buffers: on RP2040 a 96 KB bytearray collect drops from 7363
to 5747 us.

This introduces MICROPY_GC_FAST_TABLE_SCANS (default disabled), which
gates a family of word-at-a-time scans over the GC's per-block tables;
when off the behaviour is unchanged. This commit adds the first:
coalescing tail-block runs in the sweep. Following commits add free-run
coalescing and a finaliser-table skip under the same option. Enabling
this commit alone costs +112 bytes of flash and no RAM on RP2040.

The four ATB bytes are read via memcpy (with an alignment hint) rather
than casting to uint32_t*, to avoid a strict-aliasing violation. A
byte-step-to-alignment then word-step loop issues only aligned word
accesses, so it is safe on cores that fault on unaligned loads
(Cortex-M0+, Hazard3 RISC-V); validated on M0+, Cortex-M33 and RISC-V.
MICROPY_GC_HOOK_LOOP is called within the word loop.

Signed-off-by: Phil Howard <github@gadgetoid.com>
---
 py/gc.c       | 76 +++++++++++++++++++++++++++++++++++++++++++++++++--
 py/mpconfig.h |  9 ++++++
 2 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/py/gc.c b/py/gc.c
index c1a19da3efadb..2384ba2c63de0 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -78,6 +78,33 @@
 #define ATB_2_IS_FREE(a) (((a) & ATB_MASK_2) == 0)
 #define ATB_3_IS_FREE(a) (((a) & ATB_MASK_3) == 0)
 
+#if MICROPY_GC_FAST_TABLE_SCANS
+// An ATB byte/word that is entirely AT_TAIL (0b10) entries: four / sixteen
+// consecutive tail blocks, i.e. the interior of a large allocation. Used by
+// the sweep to coalesce long tail runs instead of walking them block by block.
+#define ATB_ALL_TAIL_BYTE (0xaa)
+#define ATB_ALL_TAIL_WORD (0xaaaaaaaaUL)
+#define BLOCKS_PER_ATB_WORD (BLOCKS_PER_ATB * 4)
+
+// Read four ATB bytes as a word during the sweep's tail-run fast path.
+// We deliberately avoid `*(uint32_t *)atb`: the allocation table is a byte
+// array, so casting it to uint32_t* violates strict aliasing, and py/ is built
+// with strict aliasing enabled on most ports - an optimiser would then be free
+// to assume this word read doesn't alias the byte stores to the same table and
+// reorder them. memcpy is the well-defined idiom; with the alignment hint (the
+// caller guarantees `atb` is word-aligned, via the byte-step dance) the compiler
+// lowers it to a single aligned load, so it is as fast as the cast and never
+// triggers an unaligned-access fault on Cortex-M0+ / Xtensa / RISC-V.
+static inline uint32_t gc_read_atb_word(const byte *atb) {
+    #if defined(__GNUC__)
+    atb = __builtin_assume_aligned(atb, sizeof(uint32_t)); // a promise, not a check: atb must be 4-byte aligned
+    #endif
+    uint32_t w;
+    memcpy(&w, atb, sizeof(w)); // type-pun via memcpy instead of a uint32_t* cast
+    return w;
+}
+#endif
+
 #if MICROPY_GC_SPLIT_HEAP
 #define NEXT_AREA(area) ((area)->next)
 #else
@@ -689,10 +716,55 @@ static void gc_sweep_free_blocks(void) {
 
     for (mp_state_mem_area_t *area = &MP_STATE_MEM(area); area != NULL; area = NEXT_AREA(area)) {
         size_t last_used_block = 0;
-        assert(area->gc_last_used_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB);
+        size_t end_block = area->gc_last_used_block;
+        assert(end_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB);
 
-        for (size_t block = 0; block <= area->gc_last_used_block; block++) {
+        for (size_t block = 0; block <= end_block; block++) {
             MICROPY_GC_HOOK_LOOP(block);
+
+            // Fast path: coalesce long runs of tail blocks (the body of a large
+            // allocation) a whole ATB word/byte at a time instead of block by
+            // block. A run of AT_TAIL never contains a HEAD/MARK, so free_tail is
+            // constant across it; an all-tail word is therefore wholly live or
+            // wholly dead. This is the dominant sweep cost for multi-block buffers.
+            #if MICROPY_GC_FAST_TABLE_SCANS
+            if ((block & (BLOCKS_PER_ATB - 1)) == 0) {
+                byte *atb = &area->gc_alloc_table_start[block / BLOCKS_PER_ATB];
+                // Coalesce the run: word-step (16 blocks) when the ATB pointer is
+                // word-aligned and a full all-tail word remains, otherwise byte-step
+                // (4 blocks). The byte steps also walk up to alignment and mop up the
+                // final partial word. The word read only runs when atb is aligned, as
+                // an unaligned access would fault on Cortex-M0+ / RISC-V. In both
+                // cases: free the run if dead (free_tail), else extend last_used_block.
+                while (block + BLOCKS_PER_ATB - 1 <= end_block && *atb == ATB_ALL_TAIL_BYTE) {
+                    if (((uintptr_t)atb & 3) == 0
+                        && block + BLOCKS_PER_ATB_WORD - 1 <= end_block
+                        && gc_read_atb_word(atb) == ATB_ALL_TAIL_WORD) {
+                        // Long runs are swept here; let ports run their GC-loop hook.
+                        MICROPY_GC_HOOK_LOOP(block);
+                        if (free_tail) {
+                            memset(atb, 0, sizeof(uint32_t));
+                        } else {
+                            last_used_block = block + BLOCKS_PER_ATB_WORD - 1;
+                        }
+                        block += BLOCKS_PER_ATB_WORD;
+                        atb += 4;
+                    } else {
+                        if (free_tail) {
+                            *atb = 0;
+                        } else {
+                            last_used_block = block + BLOCKS_PER_ATB - 1;
+                        }
+                        block += BLOCKS_PER_ATB;
+                        atb += 1;
+                    }
+                }
+                if (block > end_block) {
+                    break;
+                }
+            }
+            #endif
+
             switch (ATB_GET_KIND(area, block)) {
                 case AT_HEAD:
                     free_tail = 1;
diff --git a/py/mpconfig.h b/py/mpconfig.h
index 6bd179e3b8351..bb4b847aa2be6 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -780,6 +780,15 @@ typedef uint64_t mp_uint_t;
 #define MICROPY_GC_SPLIT_HEAP_AUTO (0)
 #endif
 
+// Whether the GC scans its per-block tables a word (16/32 blocks) at a time
+// instead of one block at a time, where a whole word is uniform. Covers the
+// sweep's allocation-table tail-block and free runs, and the finaliser-table
+// scan. Greatly speeds collection when large pure-data buffers (e.g.
+// multi-KB/MB bytearrays) or large free gaps are present; costs a little code.
+#ifndef MICROPY_GC_FAST_TABLE_SCANS
+#define MICROPY_GC_FAST_TABLE_SCANS (0)
+#endif
+
 // Hook to run code during time consuming garbage collector operations
 // *i* is the loop index variable (e.g. can be used to run every x loops)
 #ifndef MICROPY_GC_HOOK_LOOP

From e2b2435f3cbd2d7d62c1c78accaee35d9f92a156 Mon Sep 17 00:00:00 2001
From: Phil Howard <github@gadgetoid.com>
Date: Mon, 22 Jun 2026 17:54:57 +0100
Subject: [PATCH 2/4] py/gc: Coalesce free runs in the sweep.

Extend the sweep's fast path (MICROPY_GC_FAST_TABLE_SCANS) to skip long
runs of free blocks a word at a time, not just all-tail runs. An
all-free ATB byte/word (0x00) needs no work - the blocks are already
free and do not advance the high-water mark - so a large gap (e.g. a
freed buffer below still-live data, which keeps the high-water mark up)
is crossed 16 blocks per word instead of one at a time. It shares the
same word read and byte-step-to-alignment dance as the tail-run
coalescing.

On a Pico LiPo 2 (RP2350B, 8 MB PSRAM) a collect crossing a 2 MB free
hole below live data saves ~0.5 us/KB.

Signed-off-by: Phil Howard <github@gadgetoid.com>
---
 py/gc.c | 46 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/py/gc.c b/py/gc.c
index 2384ba2c63de0..6b5cb1d5649a1 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -84,6 +84,11 @@
 // the sweep to coalesce long tail runs instead of walking them block by block.
 #define ATB_ALL_TAIL_BYTE (0xaa)
 #define ATB_ALL_TAIL_WORD (0xaaaaaaaaUL)
+// An ATB byte/word that is entirely AT_FREE (0b00): four / sixteen consecutive
+// free blocks, i.e. a large gap (e.g. a freed buffer below still-live data).
+// The sweep coalesces these the same way it coalesces all-tail runs.
+#define ATB_ALL_FREE_BYTE (0x00)
+#define ATB_ALL_FREE_WORD (0x00000000UL)
 #define BLOCKS_PER_ATB_WORD (BLOCKS_PER_ATB * 4)
 
 // Read four ATB bytes as a word during the sweep's tail-run fast path.
@@ -736,28 +741,41 @@ static void gc_sweep_free_blocks(void) {
                 // final partial word. The word read only runs when atb is aligned, as
                 // an unaligned access would fault on Cortex-M0+ / RISC-V. In both
                 // cases: free the run if dead (free_tail), else extend last_used_block.
-                while (block + BLOCKS_PER_ATB - 1 <= end_block && *atb == ATB_ALL_TAIL_BYTE) {
+                // Coalesce both all-tail runs (allocation bodies) and all-free
+                // runs (large gaps below live data). An all-tail byte/word is freed
+                // when dead (free_tail) else extends last_used_block; an all-free
+                // byte/word needs no action - it is already free and not "used".
+                while (block + BLOCKS_PER_ATB - 1 <= end_block
+                       && (*atb == ATB_ALL_TAIL_BYTE || *atb == ATB_ALL_FREE_BYTE)) {
+                    int is_tail = (*atb == ATB_ALL_TAIL_BYTE);
                     if (((uintptr_t)atb & 3) == 0
-                        && block + BLOCKS_PER_ATB_WORD - 1 <= end_block
-                        && gc_read_atb_word(atb) == ATB_ALL_TAIL_WORD) {
-                        // Long runs are swept here; let ports run their GC-loop hook.
-                        MICROPY_GC_HOOK_LOOP(block);
-                        if (free_tail) {
-                            memset(atb, 0, sizeof(uint32_t));
-                        } else {
-                            last_used_block = block + BLOCKS_PER_ATB_WORD - 1;
+                        && block + BLOCKS_PER_ATB_WORD - 1 <= end_block) {
+                        uint32_t w = gc_read_atb_word(atb);
+                        if (w == ATB_ALL_TAIL_WORD || w == ATB_ALL_FREE_WORD) {
+                            // Long runs are swept here; let ports run their GC-loop hook.
+                            MICROPY_GC_HOOK_LOOP(block);
+                            if (w == ATB_ALL_TAIL_WORD) {
+                                if (free_tail) {
+                                    memset(atb, 0, sizeof(uint32_t));
+                                } else {
+                                    last_used_block = block + BLOCKS_PER_ATB_WORD - 1;
+                                }
+                            }
+                            block += BLOCKS_PER_ATB_WORD;
+                            atb += 4;
+                            continue;
                         }
-                        block += BLOCKS_PER_ATB_WORD;
-                        atb += 4;
-                    } else {
+                        // aligned but a mixed word: fall to the byte step
+                    }
+                    if (is_tail) {
                         if (free_tail) {
                             *atb = 0;
                         } else {
                             last_used_block = block + BLOCKS_PER_ATB - 1;
                         }
-                        block += BLOCKS_PER_ATB;
-                        atb += 1;
                     }
+                    block += BLOCKS_PER_ATB;
+                    atb += 1;
                 }
                 if (block > end_block) {
                     break;

From 22f3887df37582a9af61b7ebba66015ca73c2e84 Mon Sep 17 00:00:00 2001
From: Phil Howard <github@gadgetoid.com>
Date: Mon, 22 Jun 2026 17:54:57 +0100
Subject: [PATCH 3/4] py/gc: Skip empty finaliser-table words when running
 finalisers.

gc_sweep_run_finalisers() reads the finaliser table (FTB) byte by byte
up to the high-water mark to find blocks that may have a __del__. The
FTB over a large pure-data buffer is all zero, yet is still scanned a
byte at a time - costly when the table lives in slow PSRAM. Under
MICROPY_GC_FAST_TABLE_SCANS, skip all-zero FTB words (32 blocks) at a
time, using the same aligned-word read as the sweep. A non-zero word (a
block that may have a finaliser) always falls through to the unchanged
per-byte handler, so finaliser execution is unaffected.

Applied only when the FTB is the sole table scanned here (weakref
disabled). On a Pico LiPo 2 the finaliser scan for a live 4 MB buffer
drops from ~4.0 to ~1.8 ms.

Signed-off-by: Phil Howard <github@gadgetoid.com>
---
 py/gc.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/py/gc.c b/py/gc.c
index 6b5cb1d5649a1..86d2c76b95d87 100644
--- a/py/gc.c
+++ b/py/gc.c
@@ -657,6 +657,22 @@ static void gc_sweep_run_finalisers(void) {
         // Small speed optimisation: skip over empty FTB blocks
         size_t ftb_end = area->gc_last_used_block / BLOCKS_PER_FTB; // index is inclusive
         for (size_t ftb_idx = 0; ftb_idx <= ftb_end; ftb_idx++) {
+            #if MICROPY_GC_FAST_TABLE_SCANS && MICROPY_ENABLE_FINALISER && !MICROPY_PY_WEAKREF
+            // Sibling of the sweep's uniform-run coalescing, for the finaliser
+            // table: skip spans with no finaliser bits a word (32 blocks) at a
+            // time. The FTB over a large pure-data buffer is all zero but is
+            // otherwise scanned byte-by-byte up to the high-water mark (costly on
+            // a PSRAM heap). Aligned word loads only (byte-walk to alignment via
+            // the for-loop first); a non-zero word always falls through to the
+            // per-byte handler, so finaliser execution is unaffected. Restricted
+            // to builds where the FTB is the only table walked here (no weakref).
+            if (((uintptr_t)&area->gc_finaliser_table_start[ftb_idx] & 3) == 0
+                && ftb_idx + 3 <= ftb_end
+                && gc_read_atb_word(&area->gc_finaliser_table_start[ftb_idx]) == 0) {
+                ftb_idx += 3; // the for-loop's ++ advances a full 4-byte word
+                continue;
+            }
+            #endif
             #if MICROPY_ENABLE_FINALISER
             byte ftb = area->gc_finaliser_table_start[ftb_idx];
             size_t block = ftb_idx * BLOCKS_PER_FTB;

From 46b83a0b27a5007638af0df2cae372f58e8ef3e1 Mon Sep 17 00:00:00 2001
From: Phil Howard <github@gadgetoid.com>
Date: Mon, 22 Jun 2026 17:54:57 +0100
Subject: [PATCH 4/4] rp2: Enable fast table scans.

This is for CI and build testing only.

Signed-off-by: Phil Howard <github@gadgetoid.com>
---
 ports/rp2/mpconfigport.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ports/rp2/mpconfigport.h b/ports/rp2/mpconfigport.h
index 0bfaf6098ad4c..0514243446903 100644
--- a/ports/rp2/mpconfigport.h
+++ b/ports/rp2/mpconfigport.h
@@ -97,6 +97,7 @@
 #endif
 #define MICROPY_ALLOC_PATH_MAX                  (128)
 #define MICROPY_QSTR_BYTES_IN_HASH              (1)
+#define MICROPY_GC_FAST_TABLE_SCANS             (1)
 
 // MicroPython emitters
 #define MICROPY_PERSISTENT_CODE_LOAD            (1)