Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ports/rp2/mpconfigport.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
#endif
#define MICROPY_ALLOC_PATH_MAX (128)
#define MICROPY_QSTR_BYTES_IN_HASH (1)
#define MICROPY_GC_FAST_TABLE_SCANS (1)

// MicroPython emitters
#define MICROPY_PERSISTENT_CODE_LOAD (1)
Expand Down
110 changes: 108 additions & 2 deletions py/gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,38 @@
#define ATB_2_IS_FREE(a) (((a) & ATB_MASK_2) == 0)
#define ATB_3_IS_FREE(a) (((a) & ATB_MASK_3) == 0)

#if MICROPY_GC_FAST_TABLE_SCANS
// An ATB byte/word that is entirely AT_TAIL (0b10) entries: four / sixteen
// consecutive tail blocks, i.e. the interior of a large allocation. Used by
// the sweep to coalesce long tail runs instead of walking them block by block.
#define ATB_ALL_TAIL_BYTE (0xaa)
#define ATB_ALL_TAIL_WORD (0xaaaaaaaaUL)
// An ATB byte/word that is entirely AT_FREE (0b00): four / sixteen consecutive
// free blocks, i.e. a large gap (e.g. a freed buffer below still-live data).
// The sweep coalesces these the same way it coalesces all-tail runs.
#define ATB_ALL_FREE_BYTE (0x00)
#define ATB_ALL_FREE_WORD (0x00000000UL)
#define BLOCKS_PER_ATB_WORD (BLOCKS_PER_ATB * 4)

// Read four ATB bytes as a word during the sweep's tail-run fast path.
// We deliberately avoid `*(uint32_t *)atb`: the allocation table is a byte
// array, so casting it to uint32_t* violates strict aliasing, and py/ is built
// with strict aliasing enabled on most ports - an optimiser would then be free
// to assume this word read doesn't alias the byte stores to the same table and
// reorder them. memcpy is the well-defined idiom; with the alignment hint (the
// caller guarantees `atb` is word-aligned, via the byte-step dance) the compiler
// lowers it to a single aligned load, so it is as fast as the cast and never
// triggers an unaligned-access fault on Cortex-M0+ / Xtensa / RISC-V.
static inline uint32_t gc_read_atb_word(const byte *atb) {
#if defined(__GNUC__)
atb = __builtin_assume_aligned(atb, sizeof(uint32_t));
#endif
uint32_t w;
memcpy(&w, atb, sizeof(w));
return w;
}
#endif

#if MICROPY_GC_SPLIT_HEAP
#define NEXT_AREA(area) ((area)->next)
#else
Expand Down Expand Up @@ -625,6 +657,22 @@ static void gc_sweep_run_finalisers(void) {
// Small speed optimisation: skip over empty FTB blocks
size_t ftb_end = area->gc_last_used_block / BLOCKS_PER_FTB; // index is inclusive
for (size_t ftb_idx = 0; ftb_idx <= ftb_end; ftb_idx++) {
#if MICROPY_GC_FAST_TABLE_SCANS && MICROPY_ENABLE_FINALISER && !MICROPY_PY_WEAKREF
// Sibling of the sweep's uniform-run coalescing, for the finaliser
// table: skip spans with no finaliser bits a word (32 blocks) at a
// time. The FTB over a large pure-data buffer is all zero but is
// otherwise scanned byte-by-byte up to the high-water mark (costly on
// a PSRAM heap). Aligned word loads only (byte-walk to alignment via
// the for-loop first); a non-zero word always falls through to the
// per-byte handler, so finaliser execution is unaffected. Restricted
// to the FTB-only case (weakref disabled), the table walked here.
if (((uintptr_t)&area->gc_finaliser_table_start[ftb_idx] & 3) == 0
&& ftb_idx + 3 <= ftb_end
&& gc_read_atb_word(&area->gc_finaliser_table_start[ftb_idx]) == 0) {
ftb_idx += 3; // the for-loop's ++ advances a full 4-byte word
continue;
}
#endif
#if MICROPY_ENABLE_FINALISER
byte ftb = area->gc_finaliser_table_start[ftb_idx];
size_t block = ftb_idx * BLOCKS_PER_FTB;
Expand Down Expand Up @@ -689,10 +737,68 @@ static void gc_sweep_free_blocks(void) {

for (mp_state_mem_area_t *area = &MP_STATE_MEM(area); area != NULL; area = NEXT_AREA(area)) {
size_t last_used_block = 0;
assert(area->gc_last_used_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB);
size_t end_block = area->gc_last_used_block;
assert(end_block <= area->gc_alloc_table_byte_len * BLOCKS_PER_ATB);

for (size_t block = 0; block <= area->gc_last_used_block; block++) {
for (size_t block = 0; block <= end_block; block++) {
MICROPY_GC_HOOK_LOOP(block);

// Fast path: coalesce long runs of tail blocks (the body of a large
// allocation) a whole ATB word/byte at a time instead of block by
// block. A run of AT_TAIL never contains a HEAD/MARK, so free_tail is
// constant across it; an all-tail word is therefore wholly live or
// wholly dead. This is the dominant sweep cost for multi-block buffers.
#if MICROPY_GC_FAST_TABLE_SCANS
if ((block & (BLOCKS_PER_ATB - 1)) == 0) {
byte *atb = &area->gc_alloc_table_start[block / BLOCKS_PER_ATB];
// Coalesce the run: word-step (16 blocks) when the ATB pointer is
// word-aligned and a full all-tail word remains, otherwise byte-step
// (4 blocks). The byte steps also walk up to alignment and mop up the
// final partial word. The word read only runs when atb is aligned, as
// an unaligned access would fault on Cortex-M0+ / RISC-V. In both
// cases: free the run if dead (free_tail), else extend last_used_block.
// Coalesce both all-tail runs (allocation bodies) and all-free
// runs (large gaps below live data). An all-tail byte/word is freed
// when dead (free_tail) else extends last_used_block; an all-free
// byte/word needs no action - it is already free and not "used".
while (block + BLOCKS_PER_ATB - 1 <= end_block
&& (*atb == ATB_ALL_TAIL_BYTE || *atb == ATB_ALL_FREE_BYTE)) {
int is_tail = (*atb == ATB_ALL_TAIL_BYTE);
if (((uintptr_t)atb & 3) == 0
&& block + BLOCKS_PER_ATB_WORD - 1 <= end_block) {
uint32_t w = gc_read_atb_word(atb);
if (w == ATB_ALL_TAIL_WORD || w == ATB_ALL_FREE_WORD) {
// Long runs are swept here; let ports run their GC-loop hook.
MICROPY_GC_HOOK_LOOP(block);
if (w == ATB_ALL_TAIL_WORD) {
if (free_tail) {
memset(atb, 0, sizeof(uint32_t));
} else {
last_used_block = block + BLOCKS_PER_ATB_WORD - 1;
}
}
block += BLOCKS_PER_ATB_WORD;
atb += 4;
continue;
}
// aligned but a mixed word: fall to the byte step
}
if (is_tail) {
if (free_tail) {
*atb = 0;
} else {
last_used_block = block + BLOCKS_PER_ATB - 1;
}
}
block += BLOCKS_PER_ATB;
atb += 1;
}
if (block > end_block) {
break;
}
}
#endif

switch (ATB_GET_KIND(area, block)) {
case AT_HEAD:
free_tail = 1;
Expand Down
9 changes: 9 additions & 0 deletions py/mpconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,15 @@ typedef uint64_t mp_uint_t;
#define MICROPY_GC_SPLIT_HEAP_AUTO (0)
#endif

// Whether the GC scans its per-block tables a word (16/32 blocks) at a time
// instead of one block at a time, where a whole word is uniform. Covers the
// sweep's allocation-table tail-block and free runs, and the finaliser-table
// scan. Greatly speeds collection when large pure-data buffers (e.g. multi-
// KB/MB bytearrays) or large free gaps are present; costs a little code.
#ifndef MICROPY_GC_FAST_TABLE_SCANS
#define MICROPY_GC_FAST_TABLE_SCANS (0)
#endif

// Hook to run code during time consuming garbage collector operations
// *i* is the loop index variable (e.g. can be used to run every x loops)
#ifndef MICROPY_GC_HOOK_LOOP
Expand Down
Loading