diff --git a/ports/rp2/mpconfigport.h b/ports/rp2/mpconfigport.h index 0bfaf6098ad4c..0514243446903 100644 --- a/ports/rp2/mpconfigport.h +++ b/ports/rp2/mpconfigport.h @@ -97,6 +97,7 @@ #endif #define MICROPY_ALLOC_PATH_MAX (128) #define MICROPY_QSTR_BYTES_IN_HASH (1) +#define MICROPY_GC_FAST_TABLE_SCANS (1) // MicroPython emitters #define MICROPY_PERSISTENT_CODE_LOAD (1) diff --git a/py/gc.c b/py/gc.c index c1a19da3efadb..86d2c76b95d87 100644 --- a/py/gc.c +++ b/py/gc.c @@ -78,6 +78,38 @@ #define ATB_2_IS_FREE(a) (((a) & ATB_MASK_2) == 0) #define ATB_3_IS_FREE(a) (((a) & ATB_MASK_3) == 0) +#if MICROPY_GC_FAST_TABLE_SCANS +// An ATB byte/word that is entirely AT_TAIL (0b10) entries: four / sixteen +// consecutive tail blocks, i.e. the interior of a large allocation. Used by +// the sweep to coalesce long tail runs instead of walking them block by block. +#define ATB_ALL_TAIL_BYTE (0xaa) +#define ATB_ALL_TAIL_WORD (0xaaaaaaaaUL) +// An ATB byte/word that is entirely AT_FREE (0b00): four / sixteen consecutive +// free blocks, i.e. a large gap (e.g. a freed buffer below still-live data). +// The sweep coalesces these the same way it coalesces all-tail runs. +#define ATB_ALL_FREE_BYTE (0x00) +#define ATB_ALL_FREE_WORD (0x00000000UL) +#define BLOCKS_PER_ATB_WORD (BLOCKS_PER_ATB * 4) + +// Read four ATB bytes as a word during the sweep's tail-run fast path. +// We deliberately avoid `*(uint32_t *)atb`: the allocation table is a byte +// array, so casting it to uint32_t* violates strict aliasing, and py/ is built +// with strict aliasing enabled on most ports - an optimiser would then be free +// to assume this word read doesn't alias the byte stores to the same table and +// reorder them. memcpy is the well-defined idiom; with the alignment hint (the +// caller guarantees `atb` is word-aligned, via the byte-step dance) the compiler +// lowers it to a single aligned load, so it is as fast as the cast and never +// triggers an unaligned-access fault on Cortex-M0+ / Xtensa / RISC-V. +static inline uint32_t gc_read_atb_word(const byte *atb) { + #if defined(__GNUC__) + atb = __builtin_assume_aligned(atb, sizeof(uint32_t)); + #endif + uint32_t w; + memcpy(&w, atb, sizeof(w)); + return w; +} +#endif + #if MICROPY_GC_SPLIT_HEAP #define NEXT_AREA(area) ((area)->next) #else @@ -625,6 +657,22 @@ static void gc_sweep_run_finalisers(void) { // Small speed optimisation: skip over empty FTB blocks size_t ftb_end = area->gc_last_used_block / BLOCKS_PER_FTB; // index is inclusive for (size_t ftb_idx = 0; ftb_idx <= ftb_end; ftb_idx++) { + #if MICROPY_GC_FAST_TABLE_SCANS && MICROPY_ENABLE_FINALISER && !MICROPY_PY_WEAKREF + // Sibling of the sweep's uniform-run coalescing, for the finaliser + // table: skip spans with no finaliser bits a word (32 blocks) at a + // time. The FTB over a large pure-data buffer is all zero but is + // otherwise scanned byte-by-byte up to the high-water mark (costly on + // a PSRAM heap). Aligned word loads only (byte-walk to alignment via + // the for-loop first); a non-zero word always falls through to the + // per-byte handler, so finaliser execution is unaffected. Restricted + // to the FTB-only case (weakref disabled), the table walked here. + if (((uintptr_t)&area->gc_finaliser_table_start[ftb_idx] & 3) == 0 + && ftb_idx + 3 <= ftb_end + && gc_read_atb_word(&area->gc_finaliser_table_start[ftb_idx]) == 0) { + ftb_idx += 3; // the for-loop's ++ advances a full 4-byte word + continue; + } + #endif #if MICROPY_ENABLE_FINALISER byte ftb = area->gc_finaliser_table_start[ftb_idx]; size_t block = ftb_idx * BLOCKS_PER_FTB; @@ -689,10 +737,68 @@ static void gc_sweep_free_blocks(void) { for (mp_state_mem_area_t *area = &MP_STATE_MEM(area); area != NULL; area = NEXT_AREA(area)) { size_t last_used_block = 0; - assert(area->gc_last_used_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB); + size_t end_block = area->gc_last_used_block; + assert(end_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB); - for (size_t block = 0; block <= area->gc_last_used_block; block++) { + for (size_t block = 0; block <= end_block; block++) { MICROPY_GC_HOOK_LOOP(block); + + // Fast path: coalesce long runs of tail blocks (the body of a large + // allocation) a whole ATB word/byte at a time instead of block by + // block. A run of AT_TAIL never contains a HEAD/MARK, so free_tail is + // constant across it; an all-tail word is therefore wholly live or + // wholly dead. This is the dominant sweep cost for multi-block buffers. + #if MICROPY_GC_FAST_TABLE_SCANS + if ((block & (BLOCKS_PER_ATB - 1)) == 0) { + byte *atb = &area->gc_alloc_table_start[block / BLOCKS_PER_ATB]; + // Coalesce the run: word-step (16 blocks) when the ATB pointer is + // word-aligned and a full all-tail word remains, otherwise byte-step + // (4 blocks). The byte steps also walk up to alignment and mop up the + // final partial word. The word read only runs when atb is aligned, as + // an unaligned access would fault on Cortex-M0+ / RISC-V. In both + // cases: free the run if dead (free_tail), else extend last_used_block. + // Coalesce both all-tail runs (allocation bodies) and all-free + // runs (large gaps below live data). An all-tail byte/word is freed + // when dead (free_tail) else extends last_used_block; an all-free + // byte/word needs no action - it is already free and not "used". + while (block + BLOCKS_PER_ATB - 1 <= end_block + && (*atb == ATB_ALL_TAIL_BYTE || *atb == ATB_ALL_FREE_BYTE)) { + int is_tail = (*atb == ATB_ALL_TAIL_BYTE); + if (((uintptr_t)atb & 3) == 0 + && block + BLOCKS_PER_ATB_WORD - 1 <= end_block) { + uint32_t w = gc_read_atb_word(atb); + if (w == ATB_ALL_TAIL_WORD || w == ATB_ALL_FREE_WORD) { + // Long runs are swept here; let ports run their GC-loop hook. + MICROPY_GC_HOOK_LOOP(block); + if (w == ATB_ALL_TAIL_WORD) { + if (free_tail) { + memset(atb, 0, sizeof(uint32_t)); + } else { + last_used_block = block + BLOCKS_PER_ATB_WORD - 1; + } + } + block += BLOCKS_PER_ATB_WORD; + atb += 4; + continue; + } + // aligned but a mixed word: fall to the byte step + } + if (is_tail) { + if (free_tail) { + *atb = 0; + } else { + last_used_block = block + BLOCKS_PER_ATB - 1; + } + } + block += BLOCKS_PER_ATB; + atb += 1; + } + if (block > end_block) { + break; + } + } + #endif + switch (ATB_GET_KIND(area, block)) { case AT_HEAD: free_tail = 1; diff --git a/py/mpconfig.h b/py/mpconfig.h index 6bd179e3b8351..bb4b847aa2be6 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -780,6 +780,15 @@ typedef uint64_t mp_uint_t; #define MICROPY_GC_SPLIT_HEAP_AUTO (0) #endif +// Whether the GC scans its per-block tables a word (16/32 blocks) at a time +// instead of one block at a time, where a whole word is uniform. Covers the +// sweep's allocation-table tail-block and free runs, and the finaliser-table +// scan. Greatly speeds collection when large pure-data buffers (e.g. multi- +// KB/MB bytearrays) or large free gaps are present; costs a little code. +#ifndef MICROPY_GC_FAST_TABLE_SCANS +#define MICROPY_GC_FAST_TABLE_SCANS (0) +#endif + // Hook to run code during time consuming garbage collector operations // *i* is the loop index variable (e.g. can be used to run every x loops) #ifndef MICROPY_GC_HOOK_LOOP