From b94d6ac3f2f484aa756544f3cbd7fd0d25b24f99 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Fri, 6 Sep 2024 16:50:26 +0100 Subject: [PATCH 01/15] gh-119726: generate and patch AArch64 trampolines AArch64 trampolines are now generated at runtime at the end of every trace. --- Python/jit.c | 79 ++++++++++++++++++++++++++++++++++++++++-- Tools/jit/_stencils.py | 69 ++++++++++++++++-------------------- Tools/jit/_writer.py | 8 +++++ 3 files changed, 114 insertions(+), 42 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 33320761621c4c..d0fe7cd98a417a 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -3,6 +3,7 @@ #include "Python.h" #include "pycore_abstract.h" +#include "pycore_bitutils.h" #include "pycore_call.h" #include "pycore_ceval.h" #include "pycore_critical_section.h" @@ -390,8 +391,70 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value) patch_32r(location, value); } +void patch_aarch64_trampoline(unsigned char *location, int ordinal); + #include "jit_stencils.h" +typedef struct { + void *mem; + SymbolMask mask; + size_t size; +} TrampolineState; + +//TODO: remove as global variable +TrampolineState trampoline_state; + +#if defined(__aarch64__) || defined(_M_ARM64) + #define TRAMPOLINE_SIZE 16 +#else + #define TRAMPOLINE_SIZE 0 +#endif + +// Generate and patch AArch64 trampolines. The symbols to jump to are stored +// in the jit_stencils.h in the symbols_map. +void +patch_aarch64_trampoline(unsigned char *location, int ordinal) +{ + // Masking is done modulo 32 as the mask is stored as an array of uint32_t + const uint32_t symbol_mask = 1 << (ordinal % 32); + const uint32_t trampoline_mask = trampoline_state.mask[ordinal / 32]; + assert(symbol_mask & trampoline_mask); + + // Count the number of set bits in the trampoline mask lower than ordinal, + // this gives the index into the array of trampolines. 
+ int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); + for (int i = 0; i < ordinal / 32; i++) { + index += _Py_popcount32(trampoline_state.mask[i]); + } + + uint32_t *p = trampoline_state.mem + index * TRAMPOLINE_SIZE; + assert((size_t)index * TRAMPOLINE_SIZE < trampoline_state.size); + + uintptr_t value = (uintptr_t)symbols_map[ordinal]; + + /* Generate the trampoline + 0: 58000048 ldr x8, 8 + 4: d61f0100 br x8 + 8: 00000000 // The next two words contain the 64-bit address to jump to. + c: 00000000 + */ + p[0] = 0x58000048; + p[1] = 0xD61F0100; + p[2] = value & 0xffffffff; + p[3] = value >> 32; + + patch_aarch64_26r(location, (uintptr_t)p); +} + +static void +combine_symbol_mask(const SymbolMask src, SymbolMask dest, size_t size) +{ + // Calculate the union of the trampolines required by each StencilGroup + for (size_t i = 0; i < size; i++) { + dest[i] |= src[i]; + } +} + // Compiles executor in-place. Don't forget to call _PyJIT_Free later! int _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length) @@ -401,6 +464,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; size_t code_size = 0; size_t data_size = 0; + trampoline_state = (TrampolineState){}; group = &trampoline; code_size += group->code_size; data_size += group->data_size; @@ -410,15 +474,25 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz instruction_starts[i] = code_size; code_size += group->code_size; data_size += group->data_size; + combine_symbol_mask(group->trampoline_mask, + trampoline_state.mask, + Py_ARRAY_LENGTH(trampoline_state.mask)); } group = &stencil_groups[_FATAL_ERROR]; code_size += group->code_size; data_size += group->data_size; + combine_symbol_mask(group->trampoline_mask, + trampoline_state.mask, + Py_ARRAY_LENGTH(trampoline_state.mask)); + // Calculate the size of the trampolines required by the whole trace + for 
(size_t i = 0; i < Py_ARRAY_LENGTH(trampoline_state.mask); i++) { + trampoline_state.size += _Py_popcount32(trampoline_state.mask[i]) * TRAMPOLINE_SIZE; + } // Round up to the nearest page: size_t page_size = get_page_size(); assert((page_size & (page_size - 1)) == 0); - size_t padding = page_size - ((code_size + data_size) & (page_size - 1)); - size_t total_size = code_size + data_size + padding; + size_t padding = page_size - ((code_size + data_size + trampoline_state.size) & (page_size - 1)); + size_t total_size = code_size + data_size + trampoline_state.size + padding; unsigned char *memory = jit_alloc(total_size); if (memory == NULL) { return -1; @@ -430,6 +504,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz // Loop again to emit the code: unsigned char *code = memory; unsigned char *data = memory + code_size; + trampoline_state.mem = memory + code_size + data_size; // Compile the trampoline, which handles converting between the native // calling convention and the calling convention used by jitted code // (which may be different for efficiency reasons). 
On platforms where diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 1c6a9edb39840d..33e8c0e95cfc4c 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -2,11 +2,15 @@ import dataclasses import enum -import sys import typing import _schema +# Number of 32-bit words needed to store the bit mask of external symbols +SYMBOL_MASK_SIZE: int = 4 + +known_symbols: dict[str | None, int] = {} + @enum.unique class HoleValue(enum.Enum): @@ -157,7 +161,7 @@ def as_c(self, where: str) -> str: if value: value += " + " value += f"(uintptr_t)&{self.symbol}" - if _signed(self.addend): + if _signed(self.addend) or not value: if value: value += " + " value += f"{_signed(self.addend):#x}" @@ -175,7 +179,6 @@ class Stencil: body: bytearray = dataclasses.field(default_factory=bytearray, init=False) holes: list[Hole] = dataclasses.field(default_factory=list, init=False) disassembly: list[str] = dataclasses.field(default_factory=list, init=False) - trampolines: dict[str, int] = dataclasses.field(default_factory=dict, init=False) def pad(self, alignment: int) -> None: """Pad the stencil to the given alignment.""" @@ -184,39 +187,6 @@ def pad(self, alignment: int) -> None: self.disassembly.append(f"{offset:x}: {' '.join(['00'] * padding)}") self.body.extend([0] * padding) - def emit_aarch64_trampoline(self, hole: Hole, alignment: int) -> Hole: - """Even with the large code model, AArch64 Linux insists on 28-bit jumps.""" - assert hole.symbol is not None - reuse_trampoline = hole.symbol in self.trampolines - if reuse_trampoline: - # Re-use the base address of the previously created trampoline - base = self.trampolines[hole.symbol] - else: - self.pad(alignment) - base = len(self.body) - new_hole = hole.replace(addend=base, symbol=None, value=HoleValue.DATA) - - if reuse_trampoline: - return new_hole - - self.disassembly += [ - f"{base + 4 * 0:x}: 58000048 ldr x8, 8", - f"{base + 4 * 1:x}: d61f0100 br x8", - f"{base + 4 * 2:x}: 00000000", - f"{base + 4 * 
2:016x}: R_AARCH64_ABS64 {hole.symbol}", - f"{base + 4 * 3:x}: 00000000", - ] - for code in [ - 0x58000048.to_bytes(4, sys.byteorder), - 0xD61F0100.to_bytes(4, sys.byteorder), - 0x00000000.to_bytes(4, sys.byteorder), - 0x00000000.to_bytes(4, sys.byteorder), - ]: - self.body.extend(code) - self.holes.append(hole.replace(offset=base + 8, kind="R_AARCH64_ABS64")) - self.trampolines[hole.symbol] = base - return new_hole - def remove_jump(self, *, alignment: int = 1) -> None: """Remove a zero-length continuation jump, if it exists.""" hole = max(self.holes, key=lambda hole: hole.offset) @@ -282,6 +252,7 @@ class StencilGroup: default_factory=dict, init=False ) _got: dict[str, int] = dataclasses.field(default_factory=dict, init=False) + trampolines: set[int] = dataclasses.field(default_factory=set, init=False) def process_relocations(self, *, alignment: int = 1) -> None: """Fix up all GOT and internal relocations for this stencil group.""" @@ -291,9 +262,15 @@ def process_relocations(self, *, alignment: int = 1) -> None: in {"R_AARCH64_CALL26", "R_AARCH64_JUMP26", "ARM64_RELOC_BRANCH26"} and hole.value is HoleValue.ZERO ): - new_hole = self.data.emit_aarch64_trampoline(hole, alignment) - self.code.holes.remove(hole) - self.code.holes.append(new_hole) + hole.func = "patch_aarch64_trampoline" + if hole.symbol in known_symbols: + ordinal = known_symbols[hole.symbol] + else: + ordinal = len(known_symbols) + known_symbols[hole.symbol] = ordinal + self.trampolines.add(ordinal) + hole.addend = ordinal + hole.symbol = None self.code.remove_jump(alignment=alignment) self.code.pad(alignment) self.data.pad(8) @@ -348,9 +325,21 @@ def _emit_global_offset_table(self) -> None: ) self.data.body.extend([0] * 8) + def _get_trampoline_mask(self) -> str: + bitmask: int = 0 + trampoline_mask: list[str] = [] + for ordinal in self.trampolines: + bitmask |= 1 << ordinal + if bitmask: + trampoline_mask = [ + f"0x{(bitmask >> i*32) & ((1 << 32) - 1):x}" + for i in range(0, SYMBOL_MASK_SIZE) + ] 
+ return ", ".join(trampoline_mask) + def as_c(self, opname: str) -> str: """Dump this hole as a StencilGroup initializer.""" - return f"{{emit_{opname}, {len(self.code.body)}, {len(self.data.body)}}}" + return f"{{emit_{opname}, {len(self.code.body)}, {len(self.data.body)}, {{{self._get_trampoline_mask()}}}}}" def symbol_to_value(symbol: str) -> tuple[HoleValue, str | None]: diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 9d11094f85c7ff..d39a3d19e0ba89 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -7,12 +7,15 @@ def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: + yield f"typedef uint32_t SymbolMask[{_stencils.SYMBOL_MASK_SIZE}];" + yield "" yield "typedef struct {" yield " void (*emit)(" yield " unsigned char *code, unsigned char *data, _PyExecutorObject *executor," yield " const _PyUOpInstruction *instruction, uintptr_t instruction_starts[]);" yield " size_t code_size;" yield " size_t data_size;" + yield " SymbolMask trampoline_mask;" yield "} StencilGroup;" yield "" yield f"static const StencilGroup trampoline = {groups['trampoline'].as_c('trampoline')};" @@ -23,6 +26,11 @@ def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[s continue yield f" [{opname}] = {group.as_c(opname)}," yield "};" + yield "" + yield "static const void * const symbols_map[] = {" + for symbol, ordinal in _stencils.known_symbols.items(): + yield f" [{ordinal}] = &{symbol}," + yield "};" def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator[str]: From ef8c591ae85687194e1ab6319c5156fcfed5f155 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Tue, 10 Sep 2024 10:17:17 +0100 Subject: [PATCH 02/15] Fix Windows builds. 
--- Python/jit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index d0fe7cd98a417a..6e69ff34e8114d 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -396,7 +396,7 @@ void patch_aarch64_trampoline(unsigned char *location, int ordinal); #include "jit_stencils.h" typedef struct { - void *mem; + unsigned char *mem; SymbolMask mask; size_t size; } TrampolineState; @@ -427,7 +427,7 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal) index += _Py_popcount32(trampoline_state.mask[i]); } - uint32_t *p = trampoline_state.mem + index * TRAMPOLINE_SIZE; + uint32_t *p = (uint32_t*)(trampoline_state.mem + index * TRAMPOLINE_SIZE); assert((size_t)index * TRAMPOLINE_SIZE < trampoline_state.size); uintptr_t value = (uintptr_t)symbols_map[ordinal]; From ce3744f938e69ef802a78c5ff7bc65b88564dcdc Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Tue, 10 Sep 2024 11:30:18 +0100 Subject: [PATCH 03/15] fix pointer --- Python/jit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/jit.c b/Python/jit.c index 6e69ff34e8114d..5cebd190ca66d3 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -430,7 +430,7 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal) uint32_t *p = (uint32_t*)(trampoline_state.mem + index * TRAMPOLINE_SIZE); assert((size_t)index * TRAMPOLINE_SIZE < trampoline_state.size); - uintptr_t value = (uintptr_t)symbols_map[ordinal]; + uint64_t value = (uintptr_t)symbols_map[ordinal]; /* Generate the trampoline 0: 58000048 ldr x8, 8 From 33f433bc55b3f72364fcdf56222b61942f7400b6 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Tue, 10 Sep 2024 16:34:51 +0100 Subject: [PATCH 04/15] Fix internal compiler error on Windows --- Tools/jit/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index d39a3d19e0ba89..486c7518ee223f 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -27,7 +27,7 @@ def 
_dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[s yield f" [{opname}] = {group.as_c(opname)}," yield "};" yield "" - yield "static const void * const symbols_map[] = {" + yield f"static const void * const symbols_map[{max(len(_stencils.known_symbols), 1)}] = {{" for symbol, ordinal in _stencils.known_symbols.items(): yield f" [{ordinal}] = &{symbol}," yield "};" From 3693c000f9e4cd2083946156b1e300796b4ef562 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Thu, 19 Sep 2024 17:28:10 +0100 Subject: [PATCH 05/15] Address Brandt's feedback. --- Tools/jit/_stencils.py | 33 ++++++++++++++++----------------- Tools/jit/_targets.py | 7 +++++-- Tools/jit/_writer.py | 18 ++++++++++++------ 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 33e8c0e95cfc4c..3354a4d2eb938b 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -6,11 +6,6 @@ import _schema -# Number of 32-bit words needed to store the bit mask of external symbols -SYMBOL_MASK_SIZE: int = 4 - -known_symbols: dict[str | None, int] = {} - @enum.unique class HoleValue(enum.Enum): @@ -252,9 +247,14 @@ class StencilGroup: default_factory=dict, init=False ) _got: dict[str, int] = dataclasses.field(default_factory=dict, init=False) - trampolines: set[int] = dataclasses.field(default_factory=set, init=False) - - def process_relocations(self, *, alignment: int = 1) -> None: + _trampolines: set[int] = dataclasses.field(default_factory=set, init=False) + + def process_relocations( + self, + known_symbols: dict[str | None, int], + *, + alignment: int = 1, + ) -> None: """Fix up all GOT and internal relocations for this stencil group.""" for hole in self.code.holes.copy(): if ( @@ -268,7 +268,7 @@ def process_relocations(self, *, alignment: int = 1) -> None: else: ordinal = len(known_symbols) known_symbols[hole.symbol] = ordinal - self.trampolines.add(ordinal) + self._trampolines.add(ordinal) hole.addend = ordinal 
hole.symbol = None self.code.remove_jump(alignment=alignment) @@ -328,18 +328,17 @@ def _emit_global_offset_table(self) -> None: def _get_trampoline_mask(self) -> str: bitmask: int = 0 trampoline_mask: list[str] = [] - for ordinal in self.trampolines: + for ordinal in self._trampolines: bitmask |= 1 << ordinal - if bitmask: - trampoline_mask = [ - f"0x{(bitmask >> i*32) & ((1 << 32) - 1):x}" - for i in range(0, SYMBOL_MASK_SIZE) - ] - return ", ".join(trampoline_mask) + while bitmask: + word = bitmask & ((1 << 32) - 1) + trampoline_mask.append(f"{word:#04x}") + bitmask >>= 32 + return "{" + ", ".join(trampoline_mask) + "}" def as_c(self, opname: str) -> str: """Dump this hole as a StencilGroup initializer.""" - return f"{{emit_{opname}, {len(self.code.body)}, {len(self.data.body)}, {{{self._get_trampoline_mask()}}}}}" + return f"{{emit_{opname}, {len(self.code.body)}, {len(self.data.body)}, {self._get_trampoline_mask()}}}" def symbol_to_value(symbol: str) -> tuple[HoleValue, str | None]: diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index b6c0e79e72fb3e..767676c68e6c0f 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -44,6 +44,7 @@ class _Target(typing.Generic[_S, _R]): stable: bool = False debug: bool = False verbose: bool = False + known_symbols: dict[str | None, int] = dataclasses.field(default_factory=dict) def _compute_digest(self, out: pathlib.Path) -> str: hasher = hashlib.sha256() @@ -95,7 +96,9 @@ async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup: if group.data.body: line = f"0: {str(bytes(group.data.body)).removeprefix('b')}" group.data.disassembly.append(line) - group.process_relocations(alignment=self.alignment) + group.process_relocations( + known_symbols=self.known_symbols, alignment=self.alignment + ) return group def _handle_section(self, section: _S, group: _stencils.StencilGroup) -> None: @@ -231,7 +234,7 @@ def build( if comment: file.write(f"// {comment}\n") file.write("\n") - for line in 
_writer.dump(stencil_groups): + for line in _writer.dump(stencil_groups, self.known_symbols): file.write(f"{line}\n") try: jit_stencils_new.replace(jit_stencils) diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 486c7518ee223f..8c28933918ff25 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -2,12 +2,16 @@ import itertools import typing +import math import _stencils -def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: - yield f"typedef uint32_t SymbolMask[{_stencils.SYMBOL_MASK_SIZE}];" +def _dump_footer( + groups: dict[str, _stencils.StencilGroup], symbols: dict[str | None, int] +) -> typing.Iterator[str]: + symbol_mask_size = math.ceil(len(symbols) / 32) + yield f"typedef uint32_t SymbolMask[{symbol_mask_size}];" yield "" yield "typedef struct {" yield " void (*emit)(" @@ -27,8 +31,8 @@ def _dump_footer(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[s yield f" [{opname}] = {group.as_c(opname)}," yield "};" yield "" - yield f"static const void * const symbols_map[{max(len(_stencils.known_symbols), 1)}] = {{" - for symbol, ordinal in _stencils.known_symbols.items(): + yield f"static const void * const symbols_map[{max(len(symbols), 1)}] = {{" + for symbol, ordinal in symbols.items(): yield f" [{ordinal}] = &{symbol}," yield "};" @@ -66,8 +70,10 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator yield "" -def dump(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]: +def dump( + groups: dict[str, _stencils.StencilGroup], symbols: dict[str | None, int] +) -> typing.Iterator[str]: """Yield a JIT compiler line-by-line as a C header file.""" for opname, group in sorted(groups.items()): yield from _dump_stencil(opname, group) - yield from _dump_footer(groups) + yield from _dump_footer(groups, symbols) From cc328509ae7a2bd1986702df6d29f34854173b56 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: 
Thu, 19 Sep 2024 16:57:35 +0000 Subject: [PATCH 06/15] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2024-09-19-16-57-34.gh-issue-119726.DseseK.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-09-19-16-57-34.gh-issue-119726.DseseK.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-09-19-16-57-34.gh-issue-119726.DseseK.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-19-16-57-34.gh-issue-119726.DseseK.rst new file mode 100644 index 00000000000000..662cdce13e9a28 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-19-16-57-34.gh-issue-119726.DseseK.rst @@ -0,0 +1 @@ +Emit runtime machine code for calls to C functions from the JIT on AArch64. Patch by Diego Russo. From 9c68b8c4b3df08f068277259f2a515c7cc17abff Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Fri, 20 Sep 2024 00:16:18 +0100 Subject: [PATCH 07/15] Fix windows builds (again) --- Tools/jit/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 8c28933918ff25..70f0b121fc843d 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -10,7 +10,7 @@ def _dump_footer( groups: dict[str, _stencils.StencilGroup], symbols: dict[str | None, int] ) -> typing.Iterator[str]: - symbol_mask_size = math.ceil(len(symbols) / 32) + symbol_mask_size = max(math.ceil(len(symbols) / 32), 1) yield f"typedef uint32_t SymbolMask[{symbol_mask_size}];" yield "" yield "typedef struct {" From 473cb7d937ea4540b56aea6040e9e312432bee55 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Thu, 26 Sep 2024 14:34:34 +0100 Subject: [PATCH 08/15] Remove trampoline_state as global variable --- Python/jit.c | 69 ++++++++++++++++++++++-------------------- Tools/jit/_stencils.py | 8 +++-- Tools/jit/_writer.py | 6 ++-- 3 files changed, 45 insertions(+), 38 deletions(-) diff --git 
a/Python/jit.c b/Python/jit.c index 5cebd190ca66d3..1713d3fa3d1a1e 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -114,6 +114,21 @@ mark_executable(unsigned char *memory, size_t size) // JIT compiler stuff: ///////////////////////////////////////////////////////// +#define SYMBOL_MASK_WORDS 4 + +typedef uint32_t SymbolMask[SYMBOL_MASK_WORDS]; + +typedef struct { + unsigned char *mem; + SymbolMask mask; + size_t size; +} TrampolineState; + +typedef struct { + TrampolineState trampolines; + uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; +} CompileState; + // Warning! AArch64 requires you to get your hands dirty. These are your gloves: // value[value_start : value_start + len] @@ -391,19 +406,10 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value) patch_32r(location, value); } -void patch_aarch64_trampoline(unsigned char *location, int ordinal); +void patch_aarch64_trampoline(unsigned char *location, int ordinal, CompileState *state); #include "jit_stencils.h" -typedef struct { - unsigned char *mem; - SymbolMask mask; - size_t size; -} TrampolineState; - -//TODO: remove as global variable -TrampolineState trampoline_state; - #if defined(__aarch64__) || defined(_M_ARM64) #define TRAMPOLINE_SIZE 16 #else @@ -413,22 +419,22 @@ TrampolineState trampoline_state; // Generate and patch AArch64 trampolines. The symbols to jump to are stored // in the jit_stencils.h in the symbols_map. 
void -patch_aarch64_trampoline(unsigned char *location, int ordinal) +patch_aarch64_trampoline(unsigned char *location, int ordinal, CompileState *state) { // Masking is done modulo 32 as the mask is stored as an array of uint32_t const uint32_t symbol_mask = 1 << (ordinal % 32); - const uint32_t trampoline_mask = trampoline_state.mask[ordinal / 32]; + const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32]; assert(symbol_mask & trampoline_mask); // Count the number of set bits in the trampoline mask lower than ordinal, // this gives the index into the array of trampolines. int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); for (int i = 0; i < ordinal / 32; i++) { - index += _Py_popcount32(trampoline_state.mask[i]); + index += _Py_popcount32(state->trampolines.mask[i]); } - uint32_t *p = (uint32_t*)(trampoline_state.mem + index * TRAMPOLINE_SIZE); - assert((size_t)index * TRAMPOLINE_SIZE < trampoline_state.size); + uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); + assert((size_t)index * TRAMPOLINE_SIZE < state->trampolines.size); uint64_t value = (uintptr_t)symbols_map[ordinal]; @@ -447,10 +453,10 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal) } static void -combine_symbol_mask(const SymbolMask src, SymbolMask dest, size_t size) +combine_symbol_mask(const SymbolMask src, SymbolMask dest) { // Calculate the union of the trampolines required by each StencilGroup - for (size_t i = 0; i < size; i++) { + for (size_t i = 0; i < SYMBOL_MASK_WORDS; i++) { dest[i] |= src[i]; } } @@ -461,70 +467,67 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz { const StencilGroup *group; // Loop once to find the total compiled size: - uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; size_t code_size = 0; size_t data_size = 0; - trampoline_state = (TrampolineState){}; + CompileState state = {}; group = &trampoline; code_size += group->code_size; data_size += group->data_size; 
for (size_t i = 0; i < length; i++) { const _PyUOpInstruction *instruction = &trace[i]; group = &stencil_groups[instruction->opcode]; - instruction_starts[i] = code_size; + state.instruction_starts[i] = code_size; code_size += group->code_size; data_size += group->data_size; combine_symbol_mask(group->trampoline_mask, - trampoline_state.mask, - Py_ARRAY_LENGTH(trampoline_state.mask)); + state.trampolines.mask); } group = &stencil_groups[_FATAL_ERROR]; code_size += group->code_size; data_size += group->data_size; combine_symbol_mask(group->trampoline_mask, - trampoline_state.mask, - Py_ARRAY_LENGTH(trampoline_state.mask)); + state.trampolines.mask); // Calculate the size of the trampolines required by the whole trace - for (size_t i = 0; i < Py_ARRAY_LENGTH(trampoline_state.mask); i++) { - trampoline_state.size += _Py_popcount32(trampoline_state.mask[i]) * TRAMPOLINE_SIZE; + for (size_t i = 0; i < Py_ARRAY_LENGTH(state.trampolines.mask); i++) { + state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE; } // Round up to the nearest page: size_t page_size = get_page_size(); assert((page_size & (page_size - 1)) == 0); - size_t padding = page_size - ((code_size + data_size + trampoline_state.size) & (page_size - 1)); - size_t total_size = code_size + data_size + trampoline_state.size + padding; + size_t padding = page_size - ((code_size + data_size + state.trampolines.size) & (page_size - 1)); + size_t total_size = code_size + data_size + state.trampolines.size + padding; unsigned char *memory = jit_alloc(total_size); if (memory == NULL) { return -1; } // Update the offsets of each instruction: for (size_t i = 0; i < length; i++) { - instruction_starts[i] += (uintptr_t)memory; + state.instruction_starts[i] += (uintptr_t)memory; } // Loop again to emit the code: unsigned char *code = memory; unsigned char *data = memory + code_size; - trampoline_state.mem = memory + code_size + data_size; + state.trampolines.mem = memory + code_size + 
data_size; // Compile the trampoline, which handles converting between the native // calling convention and the calling convention used by jitted code // (which may be different for efficiency reasons). On platforms where // we don't change calling conventions, the trampoline is empty and // nothing is emitted here: group = &trampoline; - group->emit(code, data, executor, NULL, instruction_starts); + group->emit(code, data, executor, NULL, &state); code += group->code_size; data += group->data_size; assert(trace[0].opcode == _START_EXECUTOR); for (size_t i = 0; i < length; i++) { const _PyUOpInstruction *instruction = &trace[i]; group = &stencil_groups[instruction->opcode]; - group->emit(code, data, executor, instruction, instruction_starts); + group->emit(code, data, executor, instruction, &state); code += group->code_size; data += group->data_size; } // Protect against accidental buffer overrun into data: group = &stencil_groups[_FATAL_ERROR]; - group->emit(code, data, executor, NULL, instruction_starts); + group->emit(code, data, executor, NULL, &state); code += group->code_size; data += group->data_size; assert(code == memory + code_size); diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index 3354a4d2eb938b..cb579d4f5417ca 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -102,8 +102,8 @@ class HoleValue(enum.Enum): HoleValue.OPERAND_HI: "(instruction->operand >> 32)", HoleValue.OPERAND_LO: "(instruction->operand & UINT32_MAX)", HoleValue.TARGET: "instruction->target", - HoleValue.JUMP_TARGET: "instruction_starts[instruction->jump_target]", - HoleValue.ERROR_TARGET: "instruction_starts[instruction->error_target]", + HoleValue.JUMP_TARGET: "state->instruction_starts[instruction->jump_target]", + HoleValue.ERROR_TARGET: "state->instruction_starts[instruction->error_target]", HoleValue.ZERO: "", } @@ -124,6 +124,7 @@ class Hole: symbol: str | None # ...plus this addend: addend: int + need_state: bool = False func: str = 
dataclasses.field(init=False) # Convenience method: replace = dataclasses.replace @@ -160,6 +161,8 @@ def as_c(self, where: str) -> str: if value: value += " + " value += f"{_signed(self.addend):#x}" + if self.need_state: + return f"{self.func}({location}, {value}, state);" return f"{self.func}({location}, {value});" @@ -263,6 +266,7 @@ def process_relocations( and hole.value is HoleValue.ZERO ): hole.func = "patch_aarch64_trampoline" + hole.need_state = True if hole.symbol in known_symbols: ordinal = known_symbols[hole.symbol] else: diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 70f0b121fc843d..d07345385e42ce 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -11,12 +11,12 @@ def _dump_footer( groups: dict[str, _stencils.StencilGroup], symbols: dict[str | None, int] ) -> typing.Iterator[str]: symbol_mask_size = max(math.ceil(len(symbols) / 32), 1) - yield f"typedef uint32_t SymbolMask[{symbol_mask_size}];" + yield f"static_assert(SYMBOL_MASK_WORDS >= {symbol_mask_size}, \"SYMBOL_MASK_WORDS too small\");" yield "" yield "typedef struct {" yield " void (*emit)(" yield " unsigned char *code, unsigned char *data, _PyExecutorObject *executor," - yield " const _PyUOpInstruction *instruction, uintptr_t instruction_starts[]);" + yield " const _PyUOpInstruction *instruction, CompileState *state);" yield " size_t code_size;" yield " size_t data_size;" yield " SymbolMask trampoline_mask;" @@ -41,7 +41,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator yield "void" yield f"emit_{opname}(" yield " unsigned char *code, unsigned char *data, _PyExecutorObject *executor," - yield " const _PyUOpInstruction *instruction, uintptr_t instruction_starts[])" + yield " const _PyUOpInstruction *instruction, CompileState *state)" yield "{" for part, stencil in [("code", group.code), ("data", group.data)]: for line in stencil.disassembly: From e252608abb52facc3e613a1b92118e859f7bb025 Mon Sep 17 00:00:00 2001 From: Diego Russo 
Date: Thu, 26 Sep 2024 14:43:44 +0100 Subject: [PATCH 09/15] Fix linter error --- Tools/jit/_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index d07345385e42ce..9a052e8e027378 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -11,7 +11,7 @@ def _dump_footer( groups: dict[str, _stencils.StencilGroup], symbols: dict[str | None, int] ) -> typing.Iterator[str]: symbol_mask_size = max(math.ceil(len(symbols) / 32), 1) - yield f"static_assert(SYMBOL_MASK_WORDS >= {symbol_mask_size}, \"SYMBOL_MASK_WORDS too small\");" + yield f'static_assert(SYMBOL_MASK_WORDS >= {symbol_mask_size}, "SYMBOL_MASK_WORDS too small");' yield "" yield "typedef struct {" yield " void (*emit)(" From 92c415112ce2450e25efe2b1b2605cb46c1ff024 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Fri, 27 Sep 2024 10:51:23 +0100 Subject: [PATCH 10/15] Update news file --- .../2024-09-19-16-57-34.gh-issue-119726.DseseK.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-09-19-16-57-34.gh-issue-119726.DseseK.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-19-16-57-34.gh-issue-119726.DseseK.rst index 662cdce13e9a28..c01eeff952534f 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2024-09-19-16-57-34.gh-issue-119726.DseseK.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-09-19-16-57-34.gh-issue-119726.DseseK.rst @@ -1 +1,2 @@ -Emit runtime machine code for calls to C functions from the JIT on AArch64. Patch by Diego Russo. +The JIT now generates more efficient code for calls to C functions resulting +in up to 0.8% memory savings and 1.5% speed improvement on AArch64. Patch by Diego Russo. 
From 645157cadc208cc853a7bc0e8c1dc9c884c9a1c7 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Mon, 30 Sep 2024 13:37:32 +0100 Subject: [PATCH 11/15] Address Brandt's feedback --- Python/jit.c | 36 +++++++++++++++--------------------- Tools/jit/_writer.py | 6 +++--- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 1713d3fa3d1a1e..8a701655c71394 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -116,18 +116,18 @@ mark_executable(unsigned char *memory, size_t size) #define SYMBOL_MASK_WORDS 4 -typedef uint32_t SymbolMask[SYMBOL_MASK_WORDS]; +typedef uint32_t symbol_mask[SYMBOL_MASK_WORDS]; typedef struct { unsigned char *mem; - SymbolMask mask; + symbol_mask mask; size_t size; -} TrampolineState; +} trampoline_state; typedef struct { - TrampolineState trampolines; + trampoline_state trampolines; uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; -} CompileState; +} jit_state; // Warning! AArch64 requires you to get your hands dirty. These are your gloves: @@ -406,20 +406,16 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value) patch_32r(location, value); } -void patch_aarch64_trampoline(unsigned char *location, int ordinal, CompileState *state); +void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state); #include "jit_stencils.h" -#if defined(__aarch64__) || defined(_M_ARM64) - #define TRAMPOLINE_SIZE 16 -#else - #define TRAMPOLINE_SIZE 0 -#endif +#define TRAMPOLINE_SIZE_AARCH64 16 // Generate and patch AArch64 trampolines. The symbols to jump to are stored // in the jit_stencils.h in the symbols_map. 
void -patch_aarch64_trampoline(unsigned char *location, int ordinal, CompileState *state) +patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) { // Masking is done modulo 32 as the mask is stored as an array of uint32_t const uint32_t symbol_mask = 1 << (ordinal % 32); @@ -433,8 +429,8 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, CompileState *sta index += _Py_popcount32(state->trampolines.mask[i]); } - uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); - assert((size_t)index * TRAMPOLINE_SIZE < state->trampolines.size); + uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE_AARCH64); + assert((size_t)(index + 1) * TRAMPOLINE_SIZE_AARCH64 < state->trampolines.size); uint64_t value = (uintptr_t)symbols_map[ordinal]; @@ -453,7 +449,7 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, CompileState *sta } static void -combine_symbol_mask(const SymbolMask src, SymbolMask dest) +combine_symbol_mask(const symbol_mask src, symbol_mask dest) { // Calculate the union of the trampolines required by each StencilGroup for (size_t i = 0; i < SYMBOL_MASK_WORDS; i++) { @@ -469,7 +465,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz // Loop once to find the total compiled size: size_t code_size = 0; size_t data_size = 0; - CompileState state = {}; + jit_state state = {}; group = &trampoline; code_size += group->code_size; data_size += group->data_size; @@ -479,17 +475,15 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz state.instruction_starts[i] = code_size; code_size += group->code_size; data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, - state.trampolines.mask); + combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); } group = &stencil_groups[_FATAL_ERROR]; code_size += group->code_size; data_size += group->data_size; - 
combine_symbol_mask(group->trampoline_mask, - state.trampolines.mask); + combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); // Calculate the size of the trampolines required by the whole trace for (size_t i = 0; i < Py_ARRAY_LENGTH(state.trampolines.mask); i++) { - state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE; + state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE_AARCH64; } // Round up to the nearest page: size_t page_size = get_page_size(); diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 9a052e8e027378..6abef65cb0c1df 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -16,10 +16,10 @@ def _dump_footer( yield "typedef struct {" yield " void (*emit)(" yield " unsigned char *code, unsigned char *data, _PyExecutorObject *executor," - yield " const _PyUOpInstruction *instruction, CompileState *state);" + yield " const _PyUOpInstruction *instruction, jit_state *state);" yield " size_t code_size;" yield " size_t data_size;" - yield " SymbolMask trampoline_mask;" + yield " symbol_mask trampoline_mask;" yield "} StencilGroup;" yield "" yield f"static const StencilGroup trampoline = {groups['trampoline'].as_c('trampoline')};" @@ -41,7 +41,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator yield "void" yield f"emit_{opname}(" yield " unsigned char *code, unsigned char *data, _PyExecutorObject *executor," - yield " const _PyUOpInstruction *instruction, CompileState *state)" + yield " const _PyUOpInstruction *instruction, jit_state *state)" yield "{" for part, stencil in [("code", group.code), ("data", group.data)]: for line in stencil.disassembly: From 345ba2ee14be6fca879433d8adb3146638e6f612 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Wed, 2 Oct 2024 01:24:59 +0100 Subject: [PATCH 12/15] Use TRAMPOLINE_SIZE instead of TRAMPOLINE_SIZE_AARCH64 --- Python/jit.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 
deletions(-) diff --git a/Python/jit.c b/Python/jit.c index 8a701655c71394..366e03967dfb56 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -410,7 +410,11 @@ void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *s #include "jit_stencils.h" -#define TRAMPOLINE_SIZE_AARCH64 16 +#if defined(__aarch64__) || defined(_M_ARM64) + #define TRAMPOLINE_SIZE 16 +#else + #define TRAMPOLINE_SIZE 0 +#endif // Generate and patch AArch64 trampolines. The symbols to jump to are stored // in the jit_stencils.h in the symbols_map. @@ -429,8 +433,8 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) index += _Py_popcount32(state->trampolines.mask[i]); } - uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE_AARCH64); - assert((size_t)(index + 1) * TRAMPOLINE_SIZE_AARCH64 < state->trampolines.size); + uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); + assert((size_t)(index + 1) * TRAMPOLINE_SIZE < state->trampolines.size); uint64_t value = (uintptr_t)symbols_map[ordinal]; @@ -483,7 +487,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); // Calculate the size of the trampolines required by the whole trace for (size_t i = 0; i < Py_ARRAY_LENGTH(state.trampolines.mask); i++) { - state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE_AARCH64; + state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE; } // Round up to the nearest page: size_t page_size = get_page_size(); From 835ebf4d3f02ed5f9fce371e977e9f528ace5873 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Wed, 2 Oct 2024 01:28:30 +0100 Subject: [PATCH 13/15] Change type hint for known_symbols data structure.
--- Tools/jit/_stencils.py | 3 ++- Tools/jit/_targets.py | 2 +- Tools/jit/_writer.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py index cb579d4f5417ca..bbb52f391f4b01 100644 --- a/Tools/jit/_stencils.py +++ b/Tools/jit/_stencils.py @@ -254,7 +254,7 @@ class StencilGroup: def process_relocations( self, - known_symbols: dict[str | None, int], + known_symbols: dict[str, int], *, alignment: int = 1, ) -> None: @@ -267,6 +267,7 @@ def process_relocations( ): hole.func = "patch_aarch64_trampoline" hole.need_state = True + assert hole.symbol is not None if hole.symbol in known_symbols: ordinal = known_symbols[hole.symbol] else: diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 767676c68e6c0f..5eb316e782fda8 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -44,7 +44,7 @@ class _Target(typing.Generic[_S, _R]): stable: bool = False debug: bool = False verbose: bool = False - known_symbols: dict[str | None, int] = dataclasses.field(default_factory=dict) + known_symbols: dict[str, int] = dataclasses.field(default_factory=dict) def _compute_digest(self, out: pathlib.Path) -> str: hasher = hashlib.sha256() diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py index 6abef65cb0c1df..7b99d10310a645 100644 --- a/Tools/jit/_writer.py +++ b/Tools/jit/_writer.py @@ -8,7 +8,7 @@ def _dump_footer( - groups: dict[str, _stencils.StencilGroup], symbols: dict[str | None, int] + groups: dict[str, _stencils.StencilGroup], symbols: dict[str, int] ) -> typing.Iterator[str]: symbol_mask_size = max(math.ceil(len(symbols) / 32), 1) yield f'static_assert(SYMBOL_MASK_WORDS >= {symbol_mask_size}, "SYMBOL_MASK_WORDS too small");' @@ -71,7 +71,7 @@ def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator def dump( - groups: dict[str, _stencils.StencilGroup], symbols: dict[str | None, int] + groups: dict[str, _stencils.StencilGroup], symbols: dict[str, int] ) -> 
typing.Iterator[str]: """Yield a JIT compiler line-by-line as a C header file.""" for opname, group in sorted(groups.items()): From 6723f87fb8d457030b5e7d38da467f807b6d7ac4 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Wed, 2 Oct 2024 09:59:49 +0100 Subject: [PATCH 14/15] Fix issue in debug builds. --- Python/jit. | 558 +++++++++++++++++++++++++++++++++++++++++++++++++++ Python/jit.c | 2 +- 2 files changed, 559 insertions(+), 1 deletion(-) create mode 100644 Python/jit. diff --git a/Python/jit. b/Python/jit. new file mode 100644 index 00000000000000..234fc7dda83231 --- /dev/null +++ b/Python/jit. @@ -0,0 +1,558 @@ +#ifdef _Py_JIT + +#include "Python.h" + +#include "pycore_abstract.h" +#include "pycore_bitutils.h" +#include "pycore_call.h" +#include "pycore_ceval.h" +#include "pycore_critical_section.h" +#include "pycore_dict.h" +#include "pycore_intrinsics.h" +#include "pycore_long.h" +#include "pycore_opcode_metadata.h" +#include "pycore_opcode_utils.h" +#include "pycore_optimizer.h" +#include "pycore_pyerrors.h" +#include "pycore_setobject.h" +#include "pycore_sliceobject.h" +#include "pycore_jit.h" + +// Memory management stuff: //////////////////////////////////////////////////// + +#ifndef MS_WINDOWS + #include +#endif + +static size_t +get_page_size(void) +{ +#ifdef MS_WINDOWS + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +#else + return sysconf(_SC_PAGESIZE); +#endif +} + +static void +jit_error(const char *message) +{ +#ifdef MS_WINDOWS + int hint = GetLastError(); +#else + int hint = errno; +#endif + PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); +} + +static unsigned char * +jit_alloc(size_t size) +{ + assert(size); + assert(size % get_page_size() == 0); +#ifdef MS_WINDOWS + int flags = MEM_COMMIT | MEM_RESERVE; + unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE); + int failed = memory == NULL; +#else + int flags = MAP_ANONYMOUS | MAP_PRIVATE; + unsigned char *memory = mmap(NULL, size, 
PROT_READ | PROT_WRITE, flags, -1, 0); + int failed = memory == MAP_FAILED; +#endif + if (failed) { + jit_error("unable to allocate memory"); + return NULL; + } + return memory; +} + +static int +jit_free(unsigned char *memory, size_t size) +{ + assert(size); + assert(size % get_page_size() == 0); +#ifdef MS_WINDOWS + int failed = !VirtualFree(memory, 0, MEM_RELEASE); +#else + int failed = munmap(memory, size); +#endif + if (failed) { + jit_error("unable to free memory"); + return -1; + } + return 0; +} + +static int +mark_executable(unsigned char *memory, size_t size) +{ + if (size == 0) { + return 0; + } + assert(size % get_page_size() == 0); + // Do NOT ever leave the memory writable! Also, don't forget to flush the + // i-cache (I cannot begin to tell you how horrible that is to debug): +#ifdef MS_WINDOWS + if (!FlushInstructionCache(GetCurrentProcess(), memory, size)) { + jit_error("unable to flush instruction cache"); + return -1; + } + int old; + int failed = !VirtualProtect(memory, size, PAGE_EXECUTE_READ, &old); +#else + __builtin___clear_cache((char *)memory, (char *)memory + size); + int failed = mprotect(memory, size, PROT_EXEC | PROT_READ); +#endif + if (failed) { + jit_error("unable to protect executable memory"); + return -1; + } + return 0; +} + +// JIT compiler stuff: ///////////////////////////////////////////////////////// + +#define SYMBOL_MASK_WORDS 4 + +typedef uint32_t symbol_mask[SYMBOL_MASK_WORDS]; + +typedef struct { + unsigned char *mem; + symbol_mask mask; + size_t size; +} trampoline_state; + +typedef struct { + trampoline_state trampolines; + uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; +} jit_state; + +// Warning! AArch64 requires you to get your hands dirty. 
These are your gloves: + +// value[value_start : value_start + len] +static uint32_t +get_bits(uint64_t value, uint8_t value_start, uint8_t width) +{ + assert(width <= 32); + return (value >> value_start) & ((1ULL << width) - 1); +} + +// *loc[loc_start : loc_start + width] = value[value_start : value_start + width] +static void +set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, + uint8_t width) +{ + assert(loc_start + width <= 32); + // Clear the bits we're about to patch: + *loc &= ~(((1ULL << width) - 1) << loc_start); + assert(get_bits(*loc, loc_start, width) == 0); + // Patch the bits: + *loc |= get_bits(value, value_start, width) << loc_start; + assert(get_bits(*loc, loc_start, width) == get_bits(value, value_start, width)); +} + +// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions +// for instruction encodings: +#define IS_AARCH64_ADD_OR_SUB(I) (((I) & 0x11C00000) == 0x11000000) +#define IS_AARCH64_ADRP(I) (((I) & 0x9F000000) == 0x90000000) +#define IS_AARCH64_BRANCH(I) (((I) & 0x7C000000) == 0x14000000) +#define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000) +#define IS_AARCH64_MOV(I) (((I) & 0x9F800000) == 0x92800000) + +// LLD is a great reference for performing relocations... just keep in +// mind that Tools/jit/build.py does filtering and preprocessing for us! 
+// Here's a good place to start for each platform: +// - aarch64-apple-darwin: +// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp +// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp +// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h +// - aarch64-pc-windows-msvc: +// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp +// - aarch64-unknown-linux-gnu: +// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/AArch64.cpp +// - i686-pc-windows-msvc: +// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp +// - x86_64-apple-darwin: +// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/X86_64.cpp +// - x86_64-pc-windows-msvc: +// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp +// - x86_64-unknown-linux-gnu: +// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp + +// Many of these patches are "relaxing", meaning that they can rewrite the +// code they're patching to be more efficient (like turning a 64-bit memory +// load into a 32-bit immediate load). These patches have an "x" in their name. +// Relative patches have an "r" in their name. + +// 32-bit absolute address. +void +patch_32(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + // Check that we're not out of range of 32 unsigned bits: + assert(value < (1ULL << 32)); + *loc32 = (uint32_t)value; +} + +// 32-bit relative address. +void +patch_32r(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + value -= (uintptr_t)location; + // Check that we're not out of range of 32 signed bits: + assert((int64_t)value >= -(1LL << 31)); + assert((int64_t)value < (1LL << 31)); + *loc32 = (uint32_t)value; +} + +// 64-bit absolute address. 
+void +patch_64(unsigned char *location, uint64_t value) +{ + uint64_t *loc64 = (uint64_t *)location; + *loc64 = value; +} + +// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r +// (below). +void +patch_aarch64_12(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); + // There might be an implicit shift encoded in the instruction: + uint8_t shift = 0; + if (IS_AARCH64_LDR_OR_STR(*loc32)) { + shift = (uint8_t)get_bits(*loc32, 30, 2); + // If both of these are set, the shift is supposed to be 4. + // That's pretty weird, and it's never actually been observed... + assert(get_bits(*loc32, 23, 1) == 0 || get_bits(*loc32, 26, 1) == 0); + } + value = get_bits(value, 0, 12); + assert(get_bits(value, 0, shift) == 0); + set_bits(loc32, 10, value, shift, 12); +} + +// Relaxable 12-bit low part of an absolute address. Pairs nicely with +// patch_aarch64_21rx (below). +void +patch_aarch64_12x(unsigned char *location, uint64_t value) +{ + // This can *only* be relaxed if it occurs immediately before a matching + // patch_aarch64_21rx. If that happens, the JIT build step will replace both + // calls with a single call to patch_aarch64_33rx. Otherwise, we end up + // here, and the instruction is patched normally: + patch_aarch64_12(location, value); +} + +// 16-bit low part of an absolute address. +void +patch_aarch64_16a(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_MOV(*loc32)); + // Check the implicit shift (this is "part 0 of 3"): + assert(get_bits(*loc32, 21, 2) == 0); + set_bits(loc32, 5, value, 0, 16); +} + +// 16-bit middle-low part of an absolute address. 
+void +patch_aarch64_16b(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_MOV(*loc32)); + // Check the implicit shift (this is "part 1 of 3"): + assert(get_bits(*loc32, 21, 2) == 1); + set_bits(loc32, 5, value, 16, 16); +} + +// 16-bit middle-high part of an absolute address. +void +patch_aarch64_16c(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_MOV(*loc32)); + // Check the implicit shift (this is "part 2 of 3"): + assert(get_bits(*loc32, 21, 2) == 2); + set_bits(loc32, 5, value, 32, 16); +} + +// 16-bit high part of an absolute address. +void +patch_aarch64_16d(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_MOV(*loc32)); + // Check the implicit shift (this is "part 3 of 3"): + assert(get_bits(*loc32, 21, 2) == 3); + set_bits(loc32, 5, value, 48, 16); +} + +// 21-bit count of pages between this page and an absolute address's page... I +// know, I know, it's weird. Pairs nicely with patch_aarch64_12 (above). +void +patch_aarch64_21r(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + value = (value >> 12) - ((uintptr_t)location >> 12); + // Check that we're not out of range of 21 signed bits: + assert((int64_t)value >= -(1 << 20)); + assert((int64_t)value < (1 << 20)); + // value[0:2] goes in loc[29:31]: + set_bits(loc32, 29, value, 0, 2); + // value[2:21] goes in loc[5:26]: + set_bits(loc32, 5, value, 2, 19); +} + +// Relaxable 21-bit count of pages between this page and an absolute address's +// page. Pairs nicely with patch_aarch64_12x (above). +void +patch_aarch64_21rx(unsigned char *location, uint64_t value) +{ + // This can *only* be relaxed if it occurs immediately before a matching + // patch_aarch64_12x. If that happens, the JIT build step will replace both + // calls with a single call to patch_aarch64_33rx. 
Otherwise, we end up + // here, and the instruction is patched normally: + patch_aarch64_21r(location, value); +} + +// 28-bit relative branch. +void +patch_aarch64_26r(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + assert(IS_AARCH64_BRANCH(*loc32)); + value -= (uintptr_t)location; + // Check that we're not out of range of 28 signed bits: + assert((int64_t)value >= -(1 << 27)); + assert((int64_t)value < (1 << 27)); + // Since instructions are 4-byte aligned, only use 26 bits: + assert(get_bits(value, 0, 2) == 0); + set_bits(loc32, 0, value, 2, 26); +} + +// A pair of patch_aarch64_21rx and patch_aarch64_12x. +void +patch_aarch64_33rx(unsigned char *location, uint64_t value) +{ + uint32_t *loc32 = (uint32_t *)location; + // Try to relax the pair of GOT loads into an immediate value: + assert(IS_AARCH64_ADRP(*loc32)); + unsigned char reg = get_bits(loc32[0], 0, 5); + assert(IS_AARCH64_LDR_OR_STR(loc32[1])); + // There should be only one register involved: + assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. + assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. + uint64_t relaxed = *(uint64_t *)value; + if (relaxed < (1UL << 16)) { + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; + loc32[1] = 0xD503201F; + return; + } + if (relaxed < (1ULL << 32)) { + // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY + loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; + loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg; + return; + } + relaxed = value - (uintptr_t)location; + if ((relaxed & 0x3) == 0 && + (int64_t)relaxed >= -(1L << 19) && + (int64_t)relaxed < (1L << 19)) + { + // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop + loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg; + loc32[1] = 0xD503201F; + return; + } + // Couldn't do it. 
Just patch the two instructions normally: + patch_aarch64_21rx(location, value); + patch_aarch64_12x(location + 4, value); +} + +// Relaxable 32-bit relative address. +void +patch_x86_64_32rx(unsigned char *location, uint64_t value) +{ + uint8_t *loc8 = (uint8_t *)location; + // Try to relax the GOT load into an immediate value: + uint64_t relaxed = *(uint64_t *)(value + 4) - 4; + if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && + (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) + { + if (loc8[-2] == 0x8B) { + // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX] + loc8[-2] = 0x8D; + value = relaxed; + } + else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { + // call qword ptr [rip + AAA] -> nop; call XXX + loc8[-2] = 0x90; + loc8[-1] = 0xE8; + value = relaxed; + } + else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { + // jmp qword ptr [rip + AAA] -> nop; jmp XXX + loc8[-2] = 0x90; + loc8[-1] = 0xE9; + value = relaxed; + } + } + patch_32r(location, value); +} + +void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state); + +#include "jit_stencils.h" + +#if defined(__aarch64__) || defined(_M_ARM64) + #define TRAMPOLINE_SIZE 16 +#else + #define TRAMPOLINE_SIZE 0 +#endif + +// Generate and patch AArch64 trampolines. The symbols to jump to are stored +// in the jit_stencils.h in the symbols_map. +void +patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) +{ + // Masking is done modulo 32 as the mask is stored as an array of uint32_t + const uint32_t symbol_mask = 1 << (ordinal % 32); + const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32]; + assert(symbol_mask & trampoline_mask); + + // Count the number of set bits in the trampoline mask lower than ordinal, + // this gives the index into the array of trampolines. 
+ int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); + for (int i = 0; i < ordinal / 32; i++) { + index += _Py_popcount32(state->trampolines.mask[i]); + } + + uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); + assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size); + + uint64_t value = (uintptr_t)symbols_map[ordinal]; + + /* Generate the trampoline + 0: 58000048 ldr x8, 8 + 4: d61f0100 br x8 + 8: 00000000 // The next two words contain the 64-bit address to jump to. + c: 00000000 + */ + p[0] = 0x58000048; + p[1] = 0xD61F0100; + p[2] = value & 0xffffffff; + p[3] = value >> 32; + + patch_aarch64_26r(location, (uintptr_t)p); +} + +static void +combine_symbol_mask(const symbol_mask src, symbol_mask dest) +{ + // Calculate the union of the trampolines required by each StencilGroup + for (size_t i = 0; i < SYMBOL_MASK_WORDS; i++) { + dest[i] |= src[i]; + } +} + +// Compiles executor in-place. Don't forget to call _PyJIT_Free later! +int +_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length) +{ + const StencilGroup *group; + // Loop once to find the total compiled size: + size_t code_size = 0; + size_t data_size = 0; + jit_state state = {}; + group = &trampoline; + code_size += group->code_size; + data_size += group->data_size; + for (size_t i = 0; i < length; i++) { + const _PyUOpInstruction *instruction = &trace[i]; + group = &stencil_groups[instruction->opcode]; + state.instruction_starts[i] = code_size; + code_size += group->code_size; + data_size += group->data_size; + combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); + } + group = &stencil_groups[_FATAL_ERROR]; + code_size += group->code_size; + data_size += group->data_size; + combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); + // Calculate the size of the trampolines required by the whole trace + for (size_t i = 0; i < Py_ARRAY_LENGTH(state.trampolines.mask); i++) { + 
state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE; + } + // Round up to the nearest page: + size_t page_size = get_page_size(); + assert((page_size & (page_size - 1)) == 0); + size_t padding = page_size - ((code_size + data_size + state.trampolines.size) & (page_size - 1)); + size_t total_size = code_size + data_size + state.trampolines.size + padding; + unsigned char *memory = jit_alloc(total_size); + if (memory == NULL) { + return -1; + } + // Update the offsets of each instruction: + for (size_t i = 0; i < length; i++) { + state.instruction_starts[i] += (uintptr_t)memory; + } + // Loop again to emit the code: + unsigned char *code = memory; + unsigned char *data = memory + code_size; + state.trampolines.mem = memory + code_size + data_size; + // Compile the trampoline, which handles converting between the native + // calling convention and the calling convention used by jitted code + // (which may be different for efficiency reasons). On platforms where + // we don't change calling conventions, the trampoline is empty and + // nothing is emitted here: + group = &trampoline; + group->emit(code, data, executor, NULL, &state); + code += group->code_size; + data += group->data_size; + assert(trace[0].opcode == _START_EXECUTOR); + for (size_t i = 0; i < length; i++) { + const _PyUOpInstruction *instruction = &trace[i]; + group = &stencil_groups[instruction->opcode]; + group->emit(code, data, executor, instruction, &state); + code += group->code_size; + data += group->data_size; + } + // Protect against accidental buffer overrun into data: + group = &stencil_groups[_FATAL_ERROR]; + group->emit(code, data, executor, NULL, &state); + code += group->code_size; + data += group->data_size; + assert(code == memory + code_size); + assert(data == memory + code_size + data_size); + if (mark_executable(memory, total_size)) { + jit_free(memory, total_size); + return -1; + } + executor->jit_code = memory; + executor->jit_side_entry = memory + 
trampoline.code_size; + executor->jit_size = total_size; + return 0; +} + +void +_PyJIT_Free(_PyExecutorObject *executor) +{ + unsigned char *memory = (unsigned char *)executor->jit_code; + size_t size = executor->jit_size; + if (memory) { + executor->jit_code = NULL; + executor->jit_side_entry = NULL; + executor->jit_size = 0; + if (jit_free(memory, size)) { + PyErr_WriteUnraisable(NULL); + } + } +} + +#endif // _Py_JIT diff --git a/Python/jit.c b/Python/jit.c index 366e03967dfb56..234fc7dda83231 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -434,7 +434,7 @@ patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) } uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); - assert((size_t)(index + 1) * TRAMPOLINE_SIZE < state->trampolines.size); + assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size); uint64_t value = (uintptr_t)symbols_map[ordinal]; From 27436f5101ad709091489e750cdbe1f751a94ef2 Mon Sep 17 00:00:00 2001 From: Diego Russo Date: Wed, 2 Oct 2024 11:24:49 +0100 Subject: [PATCH 15/15] Remove unused file. --- Python/jit. | 558 ---------------------------------------------------- 1 file changed, 558 deletions(-) delete mode 100644 Python/jit. diff --git a/Python/jit. b/Python/jit. deleted file mode 100644 index 234fc7dda83231..00000000000000 --- a/Python/jit. 
+++ /dev/null @@ -1,558 +0,0 @@ -#ifdef _Py_JIT - -#include "Python.h" - -#include "pycore_abstract.h" -#include "pycore_bitutils.h" -#include "pycore_call.h" -#include "pycore_ceval.h" -#include "pycore_critical_section.h" -#include "pycore_dict.h" -#include "pycore_intrinsics.h" -#include "pycore_long.h" -#include "pycore_opcode_metadata.h" -#include "pycore_opcode_utils.h" -#include "pycore_optimizer.h" -#include "pycore_pyerrors.h" -#include "pycore_setobject.h" -#include "pycore_sliceobject.h" -#include "pycore_jit.h" - -// Memory management stuff: //////////////////////////////////////////////////// - -#ifndef MS_WINDOWS - #include -#endif - -static size_t -get_page_size(void) -{ -#ifdef MS_WINDOWS - SYSTEM_INFO si; - GetSystemInfo(&si); - return si.dwPageSize; -#else - return sysconf(_SC_PAGESIZE); -#endif -} - -static void -jit_error(const char *message) -{ -#ifdef MS_WINDOWS - int hint = GetLastError(); -#else - int hint = errno; -#endif - PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); -} - -static unsigned char * -jit_alloc(size_t size) -{ - assert(size); - assert(size % get_page_size() == 0); -#ifdef MS_WINDOWS - int flags = MEM_COMMIT | MEM_RESERVE; - unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE); - int failed = memory == NULL; -#else - int flags = MAP_ANONYMOUS | MAP_PRIVATE; - unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); - int failed = memory == MAP_FAILED; -#endif - if (failed) { - jit_error("unable to allocate memory"); - return NULL; - } - return memory; -} - -static int -jit_free(unsigned char *memory, size_t size) -{ - assert(size); - assert(size % get_page_size() == 0); -#ifdef MS_WINDOWS - int failed = !VirtualFree(memory, 0, MEM_RELEASE); -#else - int failed = munmap(memory, size); -#endif - if (failed) { - jit_error("unable to free memory"); - return -1; - } - return 0; -} - -static int -mark_executable(unsigned char *memory, size_t size) -{ - if (size == 0) { - 
return 0; - } - assert(size % get_page_size() == 0); - // Do NOT ever leave the memory writable! Also, don't forget to flush the - // i-cache (I cannot begin to tell you how horrible that is to debug): -#ifdef MS_WINDOWS - if (!FlushInstructionCache(GetCurrentProcess(), memory, size)) { - jit_error("unable to flush instruction cache"); - return -1; - } - int old; - int failed = !VirtualProtect(memory, size, PAGE_EXECUTE_READ, &old); -#else - __builtin___clear_cache((char *)memory, (char *)memory + size); - int failed = mprotect(memory, size, PROT_EXEC | PROT_READ); -#endif - if (failed) { - jit_error("unable to protect executable memory"); - return -1; - } - return 0; -} - -// JIT compiler stuff: ///////////////////////////////////////////////////////// - -#define SYMBOL_MASK_WORDS 4 - -typedef uint32_t symbol_mask[SYMBOL_MASK_WORDS]; - -typedef struct { - unsigned char *mem; - symbol_mask mask; - size_t size; -} trampoline_state; - -typedef struct { - trampoline_state trampolines; - uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH]; -} jit_state; - -// Warning! AArch64 requires you to get your hands dirty. 
These are your gloves: - -// value[value_start : value_start + len] -static uint32_t -get_bits(uint64_t value, uint8_t value_start, uint8_t width) -{ - assert(width <= 32); - return (value >> value_start) & ((1ULL << width) - 1); -} - -// *loc[loc_start : loc_start + width] = value[value_start : value_start + width] -static void -set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, - uint8_t width) -{ - assert(loc_start + width <= 32); - // Clear the bits we're about to patch: - *loc &= ~(((1ULL << width) - 1) << loc_start); - assert(get_bits(*loc, loc_start, width) == 0); - // Patch the bits: - *loc |= get_bits(value, value_start, width) << loc_start; - assert(get_bits(*loc, loc_start, width) == get_bits(value, value_start, width)); -} - -// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions -// for instruction encodings: -#define IS_AARCH64_ADD_OR_SUB(I) (((I) & 0x11C00000) == 0x11000000) -#define IS_AARCH64_ADRP(I) (((I) & 0x9F000000) == 0x90000000) -#define IS_AARCH64_BRANCH(I) (((I) & 0x7C000000) == 0x14000000) -#define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000) -#define IS_AARCH64_MOV(I) (((I) & 0x9F800000) == 0x92800000) - -// LLD is a great reference for performing relocations... just keep in -// mind that Tools/jit/build.py does filtering and preprocessing for us! 
-// Here's a good place to start for each platform: -// - aarch64-apple-darwin: -// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp -// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp -// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h -// - aarch64-pc-windows-msvc: -// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp -// - aarch64-unknown-linux-gnu: -// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/AArch64.cpp -// - i686-pc-windows-msvc: -// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp -// - x86_64-apple-darwin: -// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/X86_64.cpp -// - x86_64-pc-windows-msvc: -// - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp -// - x86_64-unknown-linux-gnu: -// - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp - -// Many of these patches are "relaxing", meaning that they can rewrite the -// code they're patching to be more efficient (like turning a 64-bit memory -// load into a 32-bit immediate load). These patches have an "x" in their name. -// Relative patches have an "r" in their name. - -// 32-bit absolute address. -void -patch_32(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - // Check that we're not out of range of 32 unsigned bits: - assert(value < (1ULL << 32)); - *loc32 = (uint32_t)value; -} - -// 32-bit relative address. -void -patch_32r(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - value -= (uintptr_t)location; - // Check that we're not out of range of 32 signed bits: - assert((int64_t)value >= -(1LL << 31)); - assert((int64_t)value < (1LL << 31)); - *loc32 = (uint32_t)value; -} - -// 64-bit absolute address. 
-void -patch_64(unsigned char *location, uint64_t value) -{ - uint64_t *loc64 = (uint64_t *)location; - *loc64 = value; -} - -// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r -// (below). -void -patch_aarch64_12(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); - // There might be an implicit shift encoded in the instruction: - uint8_t shift = 0; - if (IS_AARCH64_LDR_OR_STR(*loc32)) { - shift = (uint8_t)get_bits(*loc32, 30, 2); - // If both of these are set, the shift is supposed to be 4. - // That's pretty weird, and it's never actually been observed... - assert(get_bits(*loc32, 23, 1) == 0 || get_bits(*loc32, 26, 1) == 0); - } - value = get_bits(value, 0, 12); - assert(get_bits(value, 0, shift) == 0); - set_bits(loc32, 10, value, shift, 12); -} - -// Relaxable 12-bit low part of an absolute address. Pairs nicely with -// patch_aarch64_21rx (below). -void -patch_aarch64_12x(unsigned char *location, uint64_t value) -{ - // This can *only* be relaxed if it occurs immediately before a matching - // patch_aarch64_21rx. If that happens, the JIT build step will replace both - // calls with a single call to patch_aarch64_33rx. Otherwise, we end up - // here, and the instruction is patched normally: - patch_aarch64_12(location, value); -} - -// 16-bit low part of an absolute address. -void -patch_aarch64_16a(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 0 of 3"): - assert(get_bits(*loc32, 21, 2) == 0); - set_bits(loc32, 5, value, 0, 16); -} - -// 16-bit middle-low part of an absolute address. 
-void -patch_aarch64_16b(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 1 of 3"): - assert(get_bits(*loc32, 21, 2) == 1); - set_bits(loc32, 5, value, 16, 16); -} - -// 16-bit middle-high part of an absolute address. -void -patch_aarch64_16c(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 2 of 3"): - assert(get_bits(*loc32, 21, 2) == 2); - set_bits(loc32, 5, value, 32, 16); -} - -// 16-bit high part of an absolute address. -void -patch_aarch64_16d(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_MOV(*loc32)); - // Check the implicit shift (this is "part 3 of 3"): - assert(get_bits(*loc32, 21, 2) == 3); - set_bits(loc32, 5, value, 48, 16); -} - -// 21-bit count of pages between this page and an absolute address's page... I -// know, I know, it's weird. Pairs nicely with patch_aarch64_12 (above). -void -patch_aarch64_21r(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - value = (value >> 12) - ((uintptr_t)location >> 12); - // Check that we're not out of range of 21 signed bits: - assert((int64_t)value >= -(1 << 20)); - assert((int64_t)value < (1 << 20)); - // value[0:2] goes in loc[29:31]: - set_bits(loc32, 29, value, 0, 2); - // value[2:21] goes in loc[5:26]: - set_bits(loc32, 5, value, 2, 19); -} - -// Relaxable 21-bit count of pages between this page and an absolute address's -// page. Pairs nicely with patch_aarch64_12x (above). -void -patch_aarch64_21rx(unsigned char *location, uint64_t value) -{ - // This can *only* be relaxed if it occurs immediately before a matching - // patch_aarch64_12x. If that happens, the JIT build step will replace both - // calls with a single call to patch_aarch64_33rx. 
Otherwise, we end up - // here, and the instruction is patched normally: - patch_aarch64_21r(location, value); -} - -// 28-bit relative branch. -void -patch_aarch64_26r(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - assert(IS_AARCH64_BRANCH(*loc32)); - value -= (uintptr_t)location; - // Check that we're not out of range of 28 signed bits: - assert((int64_t)value >= -(1 << 27)); - assert((int64_t)value < (1 << 27)); - // Since instructions are 4-byte aligned, only use 26 bits: - assert(get_bits(value, 0, 2) == 0); - set_bits(loc32, 0, value, 2, 26); -} - -// A pair of patch_aarch64_21rx and patch_aarch64_12x. -void -patch_aarch64_33rx(unsigned char *location, uint64_t value) -{ - uint32_t *loc32 = (uint32_t *)location; - // Try to relax the pair of GOT loads into an immediate value: - assert(IS_AARCH64_ADRP(*loc32)); - unsigned char reg = get_bits(loc32[0], 0, 5); - assert(IS_AARCH64_LDR_OR_STR(loc32[1])); - // There should be only one register involved: - assert(reg == get_bits(loc32[1], 0, 5)); // ldr's output register. - assert(reg == get_bits(loc32[1], 5, 5)); // ldr's input register. - uint64_t relaxed = *(uint64_t *)value; - if (relaxed < (1UL << 16)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop - loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; - loc32[1] = 0xD503201F; - return; - } - if (relaxed < (1ULL << 32)) { - // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY - loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | reg; - loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | reg; - return; - } - relaxed = value - (uintptr_t)location; - if ((relaxed & 0x3) == 0 && - (int64_t)relaxed >= -(1L << 19) && - (int64_t)relaxed < (1L << 19)) - { - // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop - loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | reg; - loc32[1] = 0xD503201F; - return; - } - // Couldn't do it. 
Just patch the two instructions normally: - patch_aarch64_21rx(location, value); - patch_aarch64_12x(location + 4, value); -} - -// Relaxable 32-bit relative address. -void -patch_x86_64_32rx(unsigned char *location, uint64_t value) -{ - uint8_t *loc8 = (uint8_t *)location; - // Try to relax the GOT load into an immediate value: - uint64_t relaxed = *(uint64_t *)(value + 4) - 4; - if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && - (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) - { - if (loc8[-2] == 0x8B) { - // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX] - loc8[-2] = 0x8D; - value = relaxed; - } - else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { - // call qword ptr [rip + AAA] -> nop; call XXX - loc8[-2] = 0x90; - loc8[-1] = 0xE8; - value = relaxed; - } - else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { - // jmp qword ptr [rip + AAA] -> nop; jmp XXX - loc8[-2] = 0x90; - loc8[-1] = 0xE9; - value = relaxed; - } - } - patch_32r(location, value); -} - -void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state); - -#include "jit_stencils.h" - -#if defined(__aarch64__) || defined(_M_ARM64) - #define TRAMPOLINE_SIZE 16 -#else - #define TRAMPOLINE_SIZE 0 -#endif - -// Generate and patch AArch64 trampolines. The symbols to jump to are stored -// in the jit_stencils.h in the symbols_map. -void -patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state) -{ - // Masking is done modulo 32 as the mask is stored as an array of uint32_t - const uint32_t symbol_mask = 1 << (ordinal % 32); - const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32]; - assert(symbol_mask & trampoline_mask); - - // Count the number of set bits in the trampoline mask lower than ordinal, - // this gives the index into the array of trampolines. 
- int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1)); - for (int i = 0; i < ordinal / 32; i++) { - index += _Py_popcount32(state->trampolines.mask[i]); - } - - uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE); - assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size); - - uint64_t value = (uintptr_t)symbols_map[ordinal]; - - /* Generate the trampoline - 0: 58000048 ldr x8, 8 - 4: d61f0100 br x8 - 8: 00000000 // The next two words contain the 64-bit address to jump to. - c: 00000000 - */ - p[0] = 0x58000048; - p[1] = 0xD61F0100; - p[2] = value & 0xffffffff; - p[3] = value >> 32; - - patch_aarch64_26r(location, (uintptr_t)p); -} - -static void -combine_symbol_mask(const symbol_mask src, symbol_mask dest) -{ - // Calculate the union of the trampolines required by each StencilGroup - for (size_t i = 0; i < SYMBOL_MASK_WORDS; i++) { - dest[i] |= src[i]; - } -} - -// Compiles executor in-place. Don't forget to call _PyJIT_Free later! -int -_PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length) -{ - const StencilGroup *group; - // Loop once to find the total compiled size: - size_t code_size = 0; - size_t data_size = 0; - jit_state state = {}; - group = &trampoline; - code_size += group->code_size; - data_size += group->data_size; - for (size_t i = 0; i < length; i++) { - const _PyUOpInstruction *instruction = &trace[i]; - group = &stencil_groups[instruction->opcode]; - state.instruction_starts[i] = code_size; - code_size += group->code_size; - data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); - } - group = &stencil_groups[_FATAL_ERROR]; - code_size += group->code_size; - data_size += group->data_size; - combine_symbol_mask(group->trampoline_mask, state.trampolines.mask); - // Calculate the size of the trampolines required by the whole trace - for (size_t i = 0; i < Py_ARRAY_LENGTH(state.trampolines.mask); i++) { - 
state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE; - } - // Round up to the nearest page: - size_t page_size = get_page_size(); - assert((page_size & (page_size - 1)) == 0); - size_t padding = page_size - ((code_size + data_size + state.trampolines.size) & (page_size - 1)); - size_t total_size = code_size + data_size + state.trampolines.size + padding; - unsigned char *memory = jit_alloc(total_size); - if (memory == NULL) { - return -1; - } - // Update the offsets of each instruction: - for (size_t i = 0; i < length; i++) { - state.instruction_starts[i] += (uintptr_t)memory; - } - // Loop again to emit the code: - unsigned char *code = memory; - unsigned char *data = memory + code_size; - state.trampolines.mem = memory + code_size + data_size; - // Compile the trampoline, which handles converting between the native - // calling convention and the calling convention used by jitted code - // (which may be different for efficiency reasons). On platforms where - // we don't change calling conventions, the trampoline is empty and - // nothing is emitted here: - group = &trampoline; - group->emit(code, data, executor, NULL, &state); - code += group->code_size; - data += group->data_size; - assert(trace[0].opcode == _START_EXECUTOR); - for (size_t i = 0; i < length; i++) { - const _PyUOpInstruction *instruction = &trace[i]; - group = &stencil_groups[instruction->opcode]; - group->emit(code, data, executor, instruction, &state); - code += group->code_size; - data += group->data_size; - } - // Protect against accidental buffer overrun into data: - group = &stencil_groups[_FATAL_ERROR]; - group->emit(code, data, executor, NULL, &state); - code += group->code_size; - data += group->data_size; - assert(code == memory + code_size); - assert(data == memory + code_size + data_size); - if (mark_executable(memory, total_size)) { - jit_free(memory, total_size); - return -1; - } - executor->jit_code = memory; - executor->jit_side_entry = memory + 
trampoline.code_size; - executor->jit_size = total_size; - return 0; -} - -void -_PyJIT_Free(_PyExecutorObject *executor) -{ - unsigned char *memory = (unsigned char *)executor->jit_code; - size_t size = executor->jit_size; - if (memory) { - executor->jit_code = NULL; - executor->jit_side_entry = NULL; - executor->jit_size = 0; - if (jit_free(memory, size)) { - PyErr_WriteUnraisable(NULL); - } - } -} - -#endif // _Py_JIT