From dafd3a25b961c90a0e6dbfc0512c7d04087c6de9 Mon Sep 17 00:00:00 2001 From: Phil Howard Date: Mon, 22 Jun 2026 17:54:57 +0100 Subject: [PATCH 1/4] py/gc: Add fast tail-block sweep behind MICROPY_GC_FAST_TABLE_SCANS. The GC sweep walked the allocation table one 16-byte block at a time. The interior of a large allocation is a long run of AT_TAIL entries (an ATB byte of 0xaa is four tail blocks, a 32-bit word 0xaaaaaaaa sixteen). Such a run never contains a HEAD/MARK, so it is wholly live or wholly dead and can be processed a word at a time - free the whole word when dead, or just advance the high-water mark when live - instead of touching every block. This is the dominant sweep cost for large pure-data buffers: on RP2040 a 96 KB bytearray collect drops from 7363 to 5747 us. This introduces MICROPY_GC_FAST_TABLE_SCANS (default disabled), which gates a family of word-at-a-time scans over the GC's per-block tables; when off the behaviour is unchanged. This commit adds the first: coalescing tail-block runs in the sweep. Following commits add free-run coalescing and a finaliser-table skip under the same option. Enabling this commit alone costs +112 bytes of flash and no RAM on RP2040. The four ATB bytes are read via memcpy (with an alignment hint) rather than casting to uint32_t*, to avoid a strict-aliasing violation. A byte-step-to-alignment then word-step loop issues only aligned word accesses, so it is safe on cores that fault on unaligned loads (Cortex-M0+, Hazard3 RISC-V); validated on M0+, Cortex-M33 and RISC-V. MICROPY_GC_HOOK_LOOP is called within the word loop. 
Signed-off-by: Phil Howard --- py/gc.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++-- py/mpconfig.h | 9 ++++++ 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/py/gc.c b/py/gc.c index c1a19da3efadb..2384ba2c63de0 100644 --- a/py/gc.c +++ b/py/gc.c @@ -78,6 +78,33 @@ #define ATB_2_IS_FREE(a) (((a) & ATB_MASK_2) == 0) #define ATB_3_IS_FREE(a) (((a) & ATB_MASK_3) == 0) +#if MICROPY_GC_FAST_TABLE_SCANS +// An ATB byte/word that is entirely AT_TAIL (0b10) entries: four / sixteen +// consecutive tail blocks, i.e. the interior of a large allocation. Used by +// the sweep to coalesce long tail runs instead of walking them block by block. +#define ATB_ALL_TAIL_BYTE (0xaa) +#define ATB_ALL_TAIL_WORD (0xaaaaaaaaUL) +#define BLOCKS_PER_ATB_WORD (BLOCKS_PER_ATB * 4) + +// Read four ATB bytes as a word during the sweep's tail-run fast path. +// We deliberately avoid `*(uint32_t *)atb`: the allocation table is a byte +// array, so casting it to uint32_t* violates strict aliasing, and py/ is built +// with strict aliasing enabled on most ports - an optimiser would then be free +// to assume this word read doesn't alias the byte stores to the same table and +// reorder them. memcpy is the well-defined idiom; with the alignment hint (the +// caller guarantees `atb` is word-aligned, via the byte-step dance) the compiler +// lowers it to a single aligned load, so it is as fast as the cast and never +// triggers an unaligned-access fault on Cortex-M0+ / Xtensa / RISC-V. 
+static inline uint32_t gc_read_atb_word(const byte *atb) { + #if defined(__GNUC__) + atb = __builtin_assume_aligned(atb, sizeof(uint32_t)); + #endif + uint32_t w; + memcpy(&w, atb, sizeof(w)); + return w; +} +#endif + #if MICROPY_GC_SPLIT_HEAP #define NEXT_AREA(area) ((area)->next) #else @@ -689,10 +716,55 @@ static void gc_sweep_free_blocks(void) { for (mp_state_mem_area_t *area = &MP_STATE_MEM(area); area != NULL; area = NEXT_AREA(area)) { size_t last_used_block = 0; - assert(area->gc_last_used_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB); + size_t end_block = area->gc_last_used_block; + assert(end_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB); - for (size_t block = 0; block <= area->gc_last_used_block; block++) { + for (size_t block = 0; block <= end_block; block++) { MICROPY_GC_HOOK_LOOP(block); + + // Fast path: coalesce long runs of tail blocks (the body of a large + // allocation) a whole ATB word/byte at a time instead of block by + // block. A run of AT_TAIL never contains a HEAD/MARK, so free_tail is + // constant across it; an all-tail word is therefore wholly live or + // wholly dead. This is the dominant sweep cost for multi-block buffers. + #if MICROPY_GC_FAST_TABLE_SCANS + if ((block & (BLOCKS_PER_ATB - 1)) == 0) { + byte *atb = &area->gc_alloc_table_start[block / BLOCKS_PER_ATB]; + // Coalesce the run: word-step (16 blocks) when the ATB pointer is + // word-aligned and a full all-tail word remains, otherwise byte-step + // (4 blocks). The byte steps also walk up to alignment and mop up the + // final partial word. The word read only runs when atb is aligned, as + // an unaligned access would fault on Cortex-M0+ / RISC-V. In both + // cases: free the run if dead (free_tail), else extend last_used_block. 
+ while (block + BLOCKS_PER_ATB - 1 <= end_block && *atb == ATB_ALL_TAIL_BYTE) { + if (((uintptr_t)atb & 3) == 0 + && block + BLOCKS_PER_ATB_WORD - 1 <= end_block + && gc_read_atb_word(atb) == ATB_ALL_TAIL_WORD) { + // Long runs are swept here; let ports run their GC-loop hook. + MICROPY_GC_HOOK_LOOP(block); + if (free_tail) { + memset(atb, 0, sizeof(uint32_t)); + } else { + last_used_block = block + BLOCKS_PER_ATB_WORD - 1; + } + block += BLOCKS_PER_ATB_WORD; + atb += 4; + } else { + if (free_tail) { + *atb = 0; + } else { + last_used_block = block + BLOCKS_PER_ATB - 1; + } + block += BLOCKS_PER_ATB; + atb += 1; + } + } + if (block > end_block) { + break; + } + } + #endif + switch (ATB_GET_KIND(area, block)) { case AT_HEAD: free_tail = 1; diff --git a/py/mpconfig.h b/py/mpconfig.h index 6bd179e3b8351..bb4b847aa2be6 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -780,6 +780,15 @@ typedef uint64_t mp_uint_t; #define MICROPY_GC_SPLIT_HEAP_AUTO (0) #endif +// Whether the GC scans its per-block tables a word (16/32 blocks) at a time +// instead of one block at a time, where a whole word is uniform. Covers the +// sweep's allocation-table tail-block and free runs, and the finaliser-table +// scan. Greatly speeds collection when large pure-data buffers (e.g. multi- +// KB/MB bytearrays) or large free gaps are present; costs a little code. +#ifndef MICROPY_GC_FAST_TABLE_SCANS +#define MICROPY_GC_FAST_TABLE_SCANS (0) +#endif + // Hook to run code during time consuming garbage collector operations // *i* is the loop index variable (e.g. can be used to run every x loops) #ifndef MICROPY_GC_HOOK_LOOP From e2b2435f3cbd2d7d62c1c78accaee35d9f92a156 Mon Sep 17 00:00:00 2001 From: Phil Howard Date: Mon, 22 Jun 2026 17:54:57 +0100 Subject: [PATCH 2/4] py/gc: Coalesce free runs in the sweep. Extend the sweep's fast path (MICROPY_GC_FAST_TABLE_SCANS) to skip long runs of free blocks a word at a time, not just all-tail runs. 
An all-free ATB byte/word (0x00) needs no work - the blocks are already free and not in use - so a large gap (e.g. a freed buffer below still-live data, which keeps the high-water mark up) is crossed 16 blocks per word instead of one at a time. Shares the same word read and byte-step-to-alignment dance as the tail-run coalescing. On a Pico LiPo 2 (RP2350B, 8 MB PSRAM) a collect crossing a 2 MB free hole below live data saves ~0.5 us/KB. Signed-off-by: Phil Howard --- py/gc.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/py/gc.c b/py/gc.c index 2384ba2c63de0..6b5cb1d5649a1 100644 --- a/py/gc.c +++ b/py/gc.c @@ -84,6 +84,11 @@ // the sweep to coalesce long tail runs instead of walking them block by block. #define ATB_ALL_TAIL_BYTE (0xaa) #define ATB_ALL_TAIL_WORD (0xaaaaaaaaUL) +// An ATB byte/word that is entirely AT_FREE (0b00): four / sixteen consecutive +// free blocks, i.e. a large gap (e.g. a freed buffer below still-live data). +// The sweep coalesces these the same way it coalesces all-tail runs. +#define ATB_ALL_FREE_BYTE (0x00) +#define ATB_ALL_FREE_WORD (0x00000000UL) #define BLOCKS_PER_ATB_WORD (BLOCKS_PER_ATB * 4) // Read four ATB bytes as a word during the sweep's tail-run fast path. @@ -736,28 +741,41 @@ static void gc_sweep_free_blocks(void) { // final partial word. The word read only runs when atb is aligned, as // an unaligned access would fault on Cortex-M0+ / RISC-V. In both // cases: free the run if dead (free_tail), else extend last_used_block. - while (block + BLOCKS_PER_ATB - 1 <= end_block && *atb == ATB_ALL_TAIL_BYTE) { + // Coalesce both all-tail runs (allocation bodies) and all-free + // runs (large gaps below live data). An all-tail byte/word is freed + // when dead (free_tail) else extends last_used_block; an all-free + // byte/word needs no action - it is already free and not "used". 
+ while (block + BLOCKS_PER_ATB - 1 <= end_block + && (*atb == ATB_ALL_TAIL_BYTE || *atb == ATB_ALL_FREE_BYTE)) { + int is_tail = (*atb == ATB_ALL_TAIL_BYTE); if (((uintptr_t)atb & 3) == 0 - && block + BLOCKS_PER_ATB_WORD - 1 <= end_block - && gc_read_atb_word(atb) == ATB_ALL_TAIL_WORD) { - // Long runs are swept here; let ports run their GC-loop hook. - MICROPY_GC_HOOK_LOOP(block); - if (free_tail) { - memset(atb, 0, sizeof(uint32_t)); - } else { - last_used_block = block + BLOCKS_PER_ATB_WORD - 1; + && block + BLOCKS_PER_ATB_WORD - 1 <= end_block) { + uint32_t w = gc_read_atb_word(atb); + if (w == ATB_ALL_TAIL_WORD || w == ATB_ALL_FREE_WORD) { + // Long runs are swept here; let ports run their GC-loop hook. + MICROPY_GC_HOOK_LOOP(block); + if (w == ATB_ALL_TAIL_WORD) { + if (free_tail) { + memset(atb, 0, sizeof(uint32_t)); + } else { + last_used_block = block + BLOCKS_PER_ATB_WORD - 1; + } + } + block += BLOCKS_PER_ATB_WORD; + atb += 4; + continue; } - block += BLOCKS_PER_ATB_WORD; - atb += 4; - } else { + // aligned but a mixed word: fall to the byte step + } + if (is_tail) { if (free_tail) { *atb = 0; } else { last_used_block = block + BLOCKS_PER_ATB - 1; } - block += BLOCKS_PER_ATB; - atb += 1; } + block += BLOCKS_PER_ATB; + atb += 1; } if (block > end_block) { break; From 22f3887df37582a9af61b7ebba66015ca73c2e84 Mon Sep 17 00:00:00 2001 From: Phil Howard Date: Mon, 22 Jun 2026 17:54:57 +0100 Subject: [PATCH 3/4] py/gc: Skip empty finaliser-table words when running finalisers. gc_sweep_run_finalisers() reads the finaliser table (FTB) byte by byte up to the high-water mark to find blocks that may have a __del__. The FTB over a large pure-data buffer is all zero, yet is still scanned a byte at a time - costly when the table lives in slow PSRAM. Under MICROPY_GC_FAST_TABLE_SCANS, skip all-zero FTB words (32 blocks) at a time, using the same aligned-word read as the sweep. 
A non-zero word (a block that may have a finaliser) always falls through to the unchanged per-byte handler, so finaliser execution is unaffected. Applied only when the FTB is the sole table scanned here (weakref disabled). On a Pico LiPo 2 the finaliser scan for a live 4 MB buffer drops from ~4.0 to ~1.8 ms. Signed-off-by: Phil Howard --- py/gc.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/py/gc.c b/py/gc.c index 6b5cb1d5649a1..86d2c76b95d87 100644 --- a/py/gc.c +++ b/py/gc.c @@ -657,6 +657,22 @@ static void gc_sweep_run_finalisers(void) { // Small speed optimisation: skip over empty FTB blocks size_t ftb_end = area->gc_last_used_block / BLOCKS_PER_FTB; // index is inclusive for (size_t ftb_idx = 0; ftb_idx <= ftb_end; ftb_idx++) { + #if MICROPY_GC_FAST_TABLE_SCANS && MICROPY_ENABLE_FINALISER && !MICROPY_PY_WEAKREF + // Sibling of the sweep's uniform-run coalescing, for the finaliser + // table: skip spans with no finaliser bits a word (32 blocks) at a + // time. The FTB over a large pure-data buffer is all zero but is + // otherwise scanned byte-by-byte up to the high-water mark (costly on + // a PSRAM heap). Aligned word loads only (byte-walk to alignment via + // the for-loop first); a non-zero word always falls through to the + // per-byte handler, so finaliser execution is unaffected. Restricted + // to the FTB-only case (weakref disabled), the table walked here. + if (((uintptr_t)&area->gc_finaliser_table_start[ftb_idx] & 3) == 0 + && ftb_idx + 3 <= ftb_end + && gc_read_atb_word(&area->gc_finaliser_table_start[ftb_idx]) == 0) { + ftb_idx += 3; // the for-loop's ++ advances a full 4-byte word + continue; + } + #endif #if MICROPY_ENABLE_FINALISER byte ftb = area->gc_finaliser_table_start[ftb_idx]; size_t block = ftb_idx * BLOCKS_PER_FTB; From 46b83a0b27a5007638af0df2cae372f58e8ef3e1 Mon Sep 17 00:00:00 2001 From: Phil Howard Date: Mon, 22 Jun 2026 17:54:57 +0100 Subject: [PATCH 4/4] rp2: Enable fast table scans. For CI, build tests only. 
Signed-off-by: Phil Howard --- ports/rp2/mpconfigport.h | 1 + 1 file changed, 1 insertion(+) diff --git a/ports/rp2/mpconfigport.h b/ports/rp2/mpconfigport.h index 0bfaf6098ad4c..0514243446903 100644 --- a/ports/rp2/mpconfigport.h +++ b/ports/rp2/mpconfigport.h @@ -97,6 +97,7 @@ #endif #define MICROPY_ALLOC_PATH_MAX (128) #define MICROPY_QSTR_BYTES_IN_HASH (1) +#define MICROPY_GC_FAST_TABLE_SCANS (1) // MicroPython emitters #define MICROPY_PERSISTENT_CODE_LOAD (1)