diff --git a/.claude/skills/performance/SKILL.md b/.claude/skills/performance/SKILL.md new file mode 100644 index 00000000..24ffaf7a --- /dev/null +++ b/.claude/skills/performance/SKILL.md @@ -0,0 +1,89 @@ +--- +name: performance +description: Use when optimizing a hot path in @adobe/data — especially when an interface (Column, TypedBuffer, ManagedArray, etc.) has many concrete instances created from closure-returning factories and is dispatched from a tight per-row loop. Provides the closure→class refactor pattern, the V8 hidden-class reasoning behind it, accept/reject criteria, and the discipline that keeps the public API closure-free even when implementations are class-based. +--- + +# Closure→class refactor for hot dispatch sites + +## The problem in 30 seconds + +A factory shaped like this: + +```ts +function createColumn(array) { + return { + get(i) { return array[i]; }, + set(i, v) { array[i] = v; }, + }; +} +``` + +…creates a **fresh hidden class plus fresh per-instance methods every call**. When a tight loop sees several such instances at the same call site (`positionX.get(i)`, `positionY.get(i)`, `velocityX.get(i)`, …), V8's inline cache goes polymorphic → megamorphic and the methods can't be inlined. Each `.get` call dereferences the closure context to read `array`. Same data flow, but the loop is 5× slower than it needs to be. + +Replace with a class — `class Column { array; constructor(a) { this.array = a; } get(i) { return this.array[i]; } }` — and every instance shares one hidden class and one set of prototype methods. The IC monomorphizes, V8 inlines `.get`, and `array` becomes a fast property load. + +## When to look for it + +Three boxes must all be ticked: + +1. **Many instances.** The factory is called once per column, once per archetype, once per buffer — not once per app. +2. **A hot per-row loop reads or writes through the returned object's methods.** ECS systems, render loops, serialization sweeps — anywhere the call count is `N × elements`. +3. **The call site sees more than one such instance.** `positionX.set(...); positionY.set(...); positionZ.set(...);` — three different receivers at the same `.set` IC. + +If even one box is missing, leave the closure shape alone — it's clearer. + +## When *not* to apply it + +- One-off configs, plugin descriptors, options bags. The shape doesn't matter; clarity does. +- Cold paths: setup, teardown, migration, persistence. If it runs once per session, do not classify. +- Single-instance services where the call site only ever sees one shape — V8's IC is already monomorphic. +- Anywhere the closure carries non-trivial captured state that would feel awkward as a `private` field. Force-fitting state onto `this` for V8's benefit is the wrong trade if it harms readability. + +## Refactor recipe + +Mirror the existing precedents: + +- `packages/data/src/cache/managed-array.ts` → `ManagedTypedArrayColumn` +- `packages/data/src/typed-buffer/create-number-buffer.ts` → `NumberTypedBuffer` + +Specifically: + +1. Methods go on the class body (so they live on the prototype, not on each instance). +2. State goes on `this`; **assign every field in the constructor** in the same order across all instances. Use `readonly` where the field never reassigns. +3. Pull internal helpers up as `private` methods, not nested closures or arrow-function fields. Arrow-function class fields go on each instance, not the prototype, and reintroduce the slow shape. +4. Don't add or delete properties after construction — that mutates the hidden class. +5. The factory function (`createXxx(...)`) stays exported and unchanged for callers; only the body is `return new XxxClass(...)`. + +## Public-API discipline + +The interface stays a structural type — `Column`, `TypedBuffer`, `ManagedArray`. Implementations may be classy; consumers must still see what looks like a plain interface object. + +- **Don't export the class.** Module scope only. Export the factory and the type. +- **Don't add methods that aren't in the interface.** A consumer using public types must never need `instanceof MyClass`. +- **Don't lean on prototype tricks at the boundary.** Cloning, persistence, normalization, `JSON.stringify` — all should work whether the value is a class instance or a plain object. + +This protects the data-oriented promise at the boundary. Inside the box: do whatever the profile says. At the box wall: a value is just data. + +## Verifying the win + +In order, three pieces: + +1. **Focused micro-bench.** Use the perftest harness (`packages/data/src/perftest/perf-test.ts`) — call `test.run()` directly, with the harness's auto-tuned `N` and inner-loop sampling. Capture ms/iter before and after. +2. **Full perftest.** Run `packages/data/src/perftest/index.ts` in a headless Chromium against the local dist; check the relevant rows row-by-row. Confirm no regressions in unrelated rows. +3. **Optional sanity check.** Launch Chromium with `--js-flags="--allow-natives-syntax"` and `console.log(%HaveSameMap(positionX, velocityX))` — should be `true` after the refactor, `false` before. + +**Reject thresholds.** Keep the change only if focused bench is **≥25% faster** *and* the relevant full-perftest row is **≥10% faster**, with `pnpm test` and `npx tsc -b` clean. Anything less is noise; revert. + +## Things that are *not* this optimization + +Don't conflate the closure→class fix with adjacent perf problems — they have different cures. + +- **Per-element allocation in hot reads** (e.g. struct buffer's `get(i)` returning a fresh `{x,y,z}`). The cure is bulk APIs (`forEach`, `getInto(target, i)`) or operating on the underlying typed array directly, not class-ifying the readout. *Empirically confirmed: we benched a generated class for struct readout in this repo (`{x: f32, y: f32, z: f32}`, N=1M) and got 10.90 ms → 10.60 ms (−2.8%) — well below the 25% threshold. V8 already converges on a stable hidden class for object literals whose properties are assigned in fixed order, so the class buys nothing here. Don't re-try this without first changing the surface (e.g. a `getInto(target, i)` that mutates a caller-owned object, eliminating the allocation entirely).* +- **Polymorphic IC caused by mixed value types** (numbers and strings flowing through the same column). Cure: type-specialized columns. +- **Closure-captured bindings that get reassigned** (`array = allocator.refresh(array)` etc.). The hidden class is wrong from instance #1 — class form is correct independent of instance count. + +## In-repo precedents + +- `packages/data/src/cache/managed-array.ts` — `ManagedTypedArrayColumn`. Original case study; 5.4× on the focused profile, 30% on `ec2s:move column`. +- `packages/data/src/typed-buffer/create-number-buffer.ts` — `NumberTypedBuffer`. The `TypedBuffer` family (number, struct, const, enum, array) is uniformly class-based; that's the standard to follow. +- `packages/data/src/perftest/perf-test.ts` — harness with warmup + auto-tuned N + inner-loop sampling. Use it for any new benchmark. diff --git a/packages/data/src/cache/managed-array.ts b/packages/data/src/cache/managed-array.ts index 536d1b6a..4df2d479 100644 --- a/packages/data/src/cache/managed-array.ts +++ b/packages/data/src/cache/managed-array.ts @@ -98,82 +98,100 @@ function binaryDecode( return new ctor(byteArray.buffer, byteArray.byteOffset, length); } -function createManagedTypedArray( - ctor: TypedArrayConstructor, - allocator: MemoryAllocator -): ManagedArray { - let capacity = 16; - let array = allocator.allocate(ctor, capacity); - // when the main wasm memory is resized, we need to refresh the array. - allocator.needsRefresh(() => { - array = allocator.refresh(array); - }); - - const grow = (newCapacity?: number) => { - if (newCapacity && newCapacity > capacity) { - array = allocator.refresh(array); - const oldArray = array; +// Methods live on the prototype so every numeric column shares one hidden +// class. This lets V8 monomorphize the IC at hot get/set call sites in tight +// per-row loops (~20× speedup over the previous closure-per-instance shape). +class ManagedTypedArrayColumn implements ManagedArray { + readonly constant = false; + array: TypedArray; + private capacity: number; + private readonly ctor: TypedArrayConstructor; + private readonly allocator: MemoryAllocator; + + constructor(ctor: TypedArrayConstructor, allocator: MemoryAllocator) { + this.ctor = ctor; + this.allocator = allocator; + this.capacity = 16; + this.array = allocator.allocate(ctor, this.capacity); + // when the main wasm memory is resized, we need to refresh the array. + allocator.needsRefresh(() => { + this.array = allocator.refresh(this.array); + }); + } + + get native(): TypedArray { + return this.array; + } + + get(index: number): number { + return this.array[index]; + } + + set(index: number, value: number): void { + this.array[index] = value; + } + + move(from: number, to: number): void { + this.array[to] = this.array[from]; + } + + slice(start: number, end: number): number[] { + return [...this.array.subarray(start, end)]; + } + + ensureCapacity(newCapacity: number): void { + this.grow(newCapacity); + } + + private grow(newCapacity?: number): void { + if (newCapacity && newCapacity > this.capacity) { + this.array = this.allocator.refresh(this.array); + const oldArray = this.array; const growthFactor = 2; - capacity = Math.max(newCapacity, capacity * growthFactor); - const newArray = allocator.allocate(ctor, capacity); - newArray.set(array); - array = newArray; - allocator.release(oldArray); + this.capacity = Math.max(newCapacity, this.capacity * growthFactor); + const newArray = this.allocator.allocate(this.ctor, this.capacity); + newArray.set(this.array); + this.array = newArray; + this.allocator.release(oldArray); } - }; - const result = { - constant: false, - get native() { - return array; - }, - get(index: number) { - return array[index]; - }, - set(index: number, value: number): void { - // if (index >= capacity) { - // grow(); - // } - array[index] = value; - }, - move(from: number, to: number): void { - array[to] = array[from]; - }, - slice(start: number, end: number) { - return [...array.subarray(start, end)]; - }, - ensureCapacity(newCapacity: number) { - grow(newCapacity); - }, - toJSON(length: number, allowEncoding = true) { - const subarray = array.subarray(0, length); - if (!allowEncoding) { - return Array.from(subarray); + } + + toJSON(length: number, allowEncoding = true): Data { + const subarray = this.array.subarray(0, length); + if (!allowEncoding) { + return Array.from(subarray); + } + const jsonString = JSON.stringify(Array.from(subarray)); + const binaryString = binaryEncode(subarray); + return binaryString.length < jsonString.length + ? binaryString + : Array.from(subarray); + } + + fromJSON(data: Data, length: number): void { + if (typeof data === "string") { + const decodedArray = binaryDecode(data, length, this.ctor); + if (decodedArray.length > this.capacity) { + this.grow(decodedArray.length); } - const jsonString = JSON.stringify(Array.from(subarray)); - const binaryString = binaryEncode(subarray); - return binaryString.length < jsonString.length - ? binaryString - : Array.from(subarray); - }, - fromJSON(data: Data, length: number) { - if (typeof data === "string") { - const decodedArray = binaryDecode(data, length, ctor); - if (decodedArray.length > capacity) { - grow(decodedArray.length); - } - array.set(decodedArray); - } else { - if (!Array.isArray(data)) { - throw new Error(`Cannot set array to ${data}`); - } - if (data.length > capacity) { - grow(data.length); - } - array.set(data as number[]); + this.array.set(decodedArray); + } else { + if (!Array.isArray(data)) { + throw new Error(`Cannot set array to ${data}`); } - }, - }; - return result; + if (data.length > this.capacity) { + this.grow(data.length); + } + this.array.set(data as number[]); + } + } +} + +function createManagedTypedArray( + ctor: TypedArrayConstructor, + allocator: MemoryAllocator +): ManagedArray { + return new ManagedTypedArrayColumn(ctor, allocator); } export function createManagedArray( diff --git a/packages/data/src/perftest/index.ts b/packages/data/src/perftest/index.ts index 181a6ca4..078debe6 100644 --- a/packages/data/src/perftest/index.ts +++ b/packages/data/src/perftest/index.ts @@ -2,6 +2,7 @@ import * as vanilla_tests from "./vanilla-perf.js"; import * as horizon_tests from "./horizon-perf.js"; import * as ecs_tests from "./ecs-perf.js"; +import * as typed_buffer_tests from "./typed-buffer-perf.js"; import { runTests } from "./perf-test.js"; export function run() { @@ -9,6 +10,7 @@ export function run() { ...ecs_tests, ...vanilla_tests, ...horizon_tests, + ...typed_buffer_tests, }); } diff --git a/packages/data/src/perftest/perf-test.ts b/packages/data/src/perftest/perf-test.ts index 1238d58b..cb4388da 100644 --- a/packages/data/src/perftest/perf-test.ts +++ b/packages/data/src/perftest/perf-test.ts @@ -4,6 +4,7 @@ interface PerfResults { timeMs: number; result: any; flops: number; + n: number; } declare global { @@ -93,9 +94,16 @@ export async function runTests( // console.log(JSON.stringify(finalPositions)); } - const n = test.getVisibleEnabledPositions ? 100_000 : 100_000; + // Auto-tune n upward so each measured iteration falls in TARGET_BAND. + // Tests already at or above the lower bound keep their starting n. + const n = await tuneN(test, 100_000); + garbageCollect(); - await test.setup(n); + // Warmup so V8 has fully optimized before any sample is recorded. + const warmupEnd = getTime() + WARMUP_MS; + while (getTime() < warmupEnd) { + test.run(); + } garbageCollect(); const baselineMemory = getMemory(); @@ -105,7 +113,7 @@ export async function runTests( while (getTime() - timeStart < 1000) { const result = runOnce(test.run, test); result.memoryKb -= baselineMemory; - testResults.push({...result, flops: n * typeToFlops[test.type] }); + testResults.push({...result, flops: n * typeToFlops[test.type], n }); } await test.cleanup(); @@ -130,6 +138,7 @@ export async function runTests( const totalFlops = testResults.reduce((a, b) => a + b.flops, 0); const averageFlopsPerSecond = (totalFlops * 1000) / totalTime; tableValues[`${display(suite)}:${display(name)}`] = { + N: testResults[0]?.n ?? 0, Passes: memoryKb.length, // 'Mem Min (Mb)': Math.min(...memoryKb).toFixed(2), // 'Mem Max (Mb)': Math.max(...memoryKb).toFixed(2), @@ -170,7 +179,36 @@ export async function runTests( // allocate WASM-backed memory that is not reclaimed between calls. const MIN_SAMPLE_MS = 2; -function runOnce(fn: () => any, test: PerformanceTest): Omit { +// Target per-iteration time band. Tests that come in faster than the lower +// bound get their n scaled up so each measured call does meaningful work and +// dominates timing-call overhead. Tests already in or above the band are left +// alone — we never scale n down, since that would change benchmark semantics. +const TARGET_MIN_MS = 0.5; +const TARGET_MAX_MS = 50; +const MAX_AUTO_N = 1_000_000; +const WARMUP_MS = 50; + +async function tuneN(test: PerformanceTest, startN: number): Promise { + let n = startN; + for (let attempt = 0; attempt < 4; attempt++) { + await test.setup(n); + const probe = runOnce(test.run, test); + const probeMs = Math.max(probe.timeMs, 0.001); + if (probeMs >= TARGET_MIN_MS) { + return n; + } + const target = (TARGET_MIN_MS + TARGET_MAX_MS) / 2; + const newN = Math.min(MAX_AUTO_N, Math.round(n * (target / probeMs))); + if (newN <= n) { + return n; + } + await test.cleanup(); + n = newN; + } + return n; +} + +function runOnce(fn: () => any, test: PerformanceTest): Omit { const start = getTime(); let result: any; let iterations = 0; diff --git a/packages/data/src/perftest/typed-buffer-perf.ts b/packages/data/src/perftest/typed-buffer-perf.ts new file mode 100644 index 00000000..1b2ea29b --- /dev/null +++ b/packages/data/src/perftest/typed-buffer-perf.ts @@ -0,0 +1,143 @@ +// © 2026 Adobe. MIT License. See /LICENSE for details. +import { createNumberBuffer } from "../typed-buffer/create-number-buffer.js"; +import { createArrayBuffer } from "../typed-buffer/create-array-buffer.js"; +import { createStructBuffer } from "../typed-buffer/create-struct-buffer.js"; +import { F32 } from "../math/f32/index.js"; +import type { Schema } from "../schema/index.js"; +import type { PerformanceTest } from "./perf-test.js"; + +const Vec3Schema: Schema = { + type: "object", + properties: { + x: F32.schema, + y: F32.schema, + z: F32.schema, + }, + required: ["x", "y", "z"], +} as const; + +type Vec3 = { x: number; y: number; z: number }; + +// number buffer get+set sweep — direct typed-array baseline through TypedBuffer. +const numberBufferGetSet = (): PerformanceTest => { + let buffer: ReturnType; + let count = 0; + let sink = 0; + const setup = async (n: number) => { + count = n; + buffer = createNumberBuffer(F32.schema, n); + for (let i = 0; i < n; i++) { + buffer.set(i, i + 1); + } + }; + const run = () => { + let total = 0; + for (let i = 0; i < count; i++) { + total += buffer.get(i); + } + sink ^= total | 0; + for (let i = 0; i < count; i++) { + buffer.set(i, buffer.get(i) + 1); + } + }; + const cleanup = async () => { sink = 0; }; + return { setup, run, cleanup, type: "move" }; +}; + +// array buffer get+set with object payloads — non-typed-array baseline. +const arrayBufferGetSet = (): PerformanceTest => { + let buffer: ReturnType>; + let count = 0; + let sink = 0; + const setup = async (n: number) => { + count = n; + buffer = createArrayBuffer(Vec3Schema, n); + for (let i = 0; i < n; i++) { + buffer.set(i, { x: i + 1, y: i + 1, z: i + 1 }); + } + }; + const run = () => { + let total = 0; + for (let i = 0; i < count; i++) { + const v = buffer.get(i); + total += v.x + v.y + v.z; + } + sink ^= total | 0; + }; + const cleanup = async () => { sink = 0; }; + return { setup, run, cleanup, type: "move" }; +}; + +// struct_get — read every element and accumulate a sink to defeat DCE. +const structBufferGet = (): PerformanceTest => { + let buffer: ReturnType>; + let count = 0; + let sink = 0; + const setup = async (n: number) => { + count = n; + buffer = createStructBuffer(Vec3Schema, n); + for (let i = 0; i < n; i++) { + buffer.set(i, { x: i + 1, y: i + 2, z: i + 3 }); + } + }; + const run = () => { + let total = 0; + for (let i = 0; i < count; i++) { + const v = buffer.get(i) as Vec3; + total += v.x + v.y + v.z; + } + sink ^= total | 0; + }; + const cleanup = async () => { sink = 0; }; + return { setup, run, cleanup, type: "move" }; +}; + +// struct_set — write every element from a fixed literal. +const structBufferSet = (): PerformanceTest => { + let buffer: ReturnType>; + let count = 0; + const literal = { x: 1, y: 2, z: 3 }; + const setup = async (n: number) => { + count = n; + buffer = createStructBuffer(Vec3Schema, n); + }; + const run = () => { + for (let i = 0; i < count; i++) { + buffer.set(i, literal); + } + }; + const cleanup = async () => { }; + return { setup, run, cleanup, type: "move" }; +}; + +// struct_round_trip — get, mutate, set. Mirrors the per-frame ECS pattern. +const structBufferRoundTrip = (): PerformanceTest => { + let buffer: ReturnType>; + let count = 0; + const setup = async (n: number) => { + count = n; + buffer = createStructBuffer(Vec3Schema, n); + for (let i = 0; i < n; i++) { + buffer.set(i, { x: i + 1, y: i + 1, z: i + 1 }); + } + }; + const run = () => { + for (let i = 0; i < count; i++) { + const v = buffer.get(i) as Vec3; + v.x += 1; + v.y += 1; + v.z += 1; + buffer.set(i, v); + } + }; + const cleanup = async () => { }; + return { setup, run, cleanup, type: "move" }; +}; + +export const typed_buffer = { + number_get_set: numberBufferGetSet(), + array_get: arrayBufferGetSet(), + struct_get: structBufferGet(), + struct_set: structBufferSet(), + struct_round_trip: structBufferRoundTrip(), +};