# Patch summary (commentary only; it precedes the first "diff --git" header
# and adds no lines inside any hunk, so the hunk line counts are unchanged).
#
# Lib/test/test_capi/test_opt.py
#   Adds a helper, assert_kw_call_optimized(), and three tests.  Each test
#   runs a keyword call in a loop under the optimizer and inspects the uop
#   names of the resulting executor:
#     - "_PY_FRAME_KW" must be absent;
#     - a "_CHECK_STACK_SPACE_OPERAND" and a "_POP_TOP" must both occur before
#       the first "_INIT_CALL_PY_EXACT_ARGS*" uop, the stack check first;
#     - no "_CHECK_FUNCTION_EXACT_ARGS" may sit between that "_POP_TOP" and
#       the init uop.
#
# Python/optimizer_bytecodes.c
#   Replaces the one-line _PY_FRAME_KW handler.  The new handler takes its
#   fast path only when all of the following hold:
#     - callable and kwnames are known constants (a function object and an
#       exact tuple);
#     - self_or_null is known to be null or known to be non-null;
#     - oparg <= 256, which keeps every index inside the fixed-size arrays
#       (desired/current: 256 entries, frame_args/source_for_local: 257);
#     - the code object has CO_OPTIMIZED set and CO_VARARGS / CO_VARKEYWORDS
#       clear, has no keyword-only parameters, and its co_argcount equals the
#       number of supplied arguments (oparg, plus one when self is present);
#     - every keyword is an exact str that names a parameter which is not
#       positional-only and has not been filled yet.
#   source_for_local[] maps each parameter slot to the index in args[] that
#   feeds it (-1 = not filled, -2 = filled by self).  On success the handler
#   emits _CHECK_STACK_SPACE_OPERAND, _POP_TOP, whatever _SWAP uops are needed
#   to permute the stacked arguments into parameter order (each move goes
#   through the top-of-stack slot), and finally _INIT_CALL_PY_EXACT_ARGS.
#   Otherwise it falls back to the previous statement,
#   frame_new_from_symbol(ctx, callable, NULL, 0).
#   NOTE(review): the emitted _POP_TOP presumably removes kwnames, the last
#   stack input in the op signature -- confirm.
#
# Python/optimizer_cases.c.h
#   Carries the same handler body (without the blank lines) plus the loads of
#   kwnames, args and self_or_null from stack_pointer.
diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py
index 2248920c266aef5..08bd4eec3c50ba7 100644
--- a/Lib/test/test_capi/test_opt.py
+++ b/Lib/test/test_capi/test_opt.py
@@ -715,6 +715,89 @@ def dummy(x):
         self.assertNotIn("_GUARD_CODE_VERSION__PUSH_FRAME", uops)
         self.assertNotIn("_GUARD_IP__PUSH_FRAME", uops)
 
+    def assert_kw_call_optimized(self, ex):  # shared checks; returns (uops, pop_index, init_index)
+        uops = get_opnames(ex)
+        self.assertNotIn("_PY_FRAME_KW", uops)  # the kw frame uop must not remain
+        init_index = next(  # index of the first _INIT_CALL_PY_EXACT_ARGS* uop, else None
+            (i for i, opname in enumerate(uops)
+             if opname.startswith("_INIT_CALL_PY_EXACT_ARGS")),
+            None,
+        )
+        self.assertIsNotNone(init_index, uops)
+        pop_index = max(  # index of the last _POP_TOP before init_index, else None
+            (i for i, opname in enumerate(uops[:init_index])
+             if opname == "_POP_TOP"),
+            default=None,
+        )
+        self.assertIsNotNone(pop_index, uops)
+        stack_check_index = max(  # last _CHECK_STACK_SPACE_OPERAND before init_index, else None
+            (i for i, opname in enumerate(uops[:init_index])
+             if opname == "_CHECK_STACK_SPACE_OPERAND"),
+            default=None,
+        )
+        self.assertIsNotNone(stack_check_index, uops)
+        self.assertLess(stack_check_index, pop_index, uops)  # stack check comes before the pop
+        self.assertNotIn("_CHECK_FUNCTION_EXACT_ARGS", uops[pop_index:init_index])
+        return uops, pop_index, init_index
+
+    def test_call_kw_py_exact_args(self):
+        def callee(x, a, b):
+            return x + a + b
+
+        def testfunc(n):
+            total = 0
+            for i in range(n):
+                total += callee(i, b=2, a=1)  # keywords given out of parameter order
+            return total
+
+        res, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD)
+        self.assertEqual(res, TIER2_THRESHOLD * (TIER2_THRESHOLD - 1) // 2 + 3 * TIER2_THRESHOLD)
+        self.assertIsNotNone(ex)
+        uops, pop_index, init_index = self.assert_kw_call_optimized(ex)
+        self.assertTrue(  # some _SWAP* uop must sit between the pop and the init uop
+            any(opname.startswith("_SWAP") for opname in uops[pop_index:init_index]),
+            uops,
+        )
+        self.assertIn("_BINARY_OP_ADD_INT", uops)
+
+    def test_call_kw_py_exact_args_no_reorder(self):
+        def callee(x, a, b):
+            return x + a + b
+
+        def testfunc(n):
+            total = 0
+            for i in range(n):
+                total += callee(i, a=1, b=2)  # keywords already in parameter order
+            return total
+
+        res, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD)
+        self.assertEqual(res, TIER2_THRESHOLD * (TIER2_THRESHOLD - 1) // 2 + 3 * TIER2_THRESHOLD)
+        self.assertIsNotNone(ex)
+        uops, pop_index, init_index = self.assert_kw_call_optimized(ex)
+        self.assertFalse(  # no _SWAP* uop may sit between the pop and the init uop
+            any(opname.startswith("_SWAP") for opname in uops[pop_index:init_index]),
+            uops,
+        )
+
+    def test_call_kw_bound_method_exact_args(self):
+        class C:
+            def callee(self, x, a, b):
+                return x + a + b
+
+        obj = C()
+
+        def testfunc(n):
+            total = 0
+            for i in range(n):
+                total += obj.callee(i, b=2, a=1)  # same call shape, made through a bound method
+            return total
+
+        res, ex = self._run_with_optimizer(testfunc, TIER2_THRESHOLD)
+        self.assertEqual(res, TIER2_THRESHOLD * (TIER2_THRESHOLD - 1) // 2 + 3 * TIER2_THRESHOLD)
+        self.assertIsNotNone(ex)
+        uops, _, _ = self.assert_kw_call_optimized(ex)
+        self.assertIn("_BINARY_OP_ADD_INT", uops)
+
     def test_int_type_propagate_through_range(self):
 
         def testfunc(n):
diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c
index edb4c644bccbf6f..98836212c18d880 100644
--- a/Python/optimizer_bytecodes.c
+++ b/Python/optimizer_bytecodes.c
@@ -1286,7 +1286,117 @@ dummy_func(void) {
     }
 
     op(_PY_FRAME_KW, (callable, self_or_null, args[oparg], kwnames -- new_frame)) {
-        new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0));
+        bool valid = false;
+        PyObject *func_o = sym_get_const(ctx, callable);
+        PyObject *kwnames_o = sym_get_const(ctx, kwnames);
+        bool has_self = sym_is_not_null(self_or_null);
+        PyCodeObject *co = NULL;
+        Py_ssize_t total_args = 0;
+        int desired[256];
+        JitOptRef frame_args[257];
+
+        if ((has_self || sym_is_null(self_or_null)) &&
+            func_o != NULL && PyFunction_Check(func_o) &&
+            kwnames_o != NULL && PyTuple_CheckExact(kwnames_o) &&
+            oparg <= 256)
+        {
+            PyFunctionObject *func = (PyFunctionObject *)func_o;
+            co = (PyCodeObject *)func->func_code;
+            Py_ssize_t kwcount = PyTuple_GET_SIZE(kwnames_o);
+            total_args = oparg + has_self;
+            Py_ssize_t positional_args = total_args - kwcount;
+            Py_ssize_t positional_stack_args = positional_args - has_self;
+
+            if ((co->co_flags & (CO_OPTIMIZED | CO_VARARGS | CO_VARKEYWORDS)) == CO_OPTIMIZED &&
+                co->co_kwonlyargcount == 0 &&
+                co->co_argcount == total_args &&
+                positional_args >= has_self)
+            {
+                int source_for_local[257];
+                for (int i = 0; i < total_args; i++) {
+                    source_for_local[i] = -1;
+                }
+                if (has_self) {
+                    source_for_local[0] = -2;
+                }
+                for (int i = 0; i < positional_stack_args; i++) {
+                    source_for_local[has_self + i] = i;
+                }
+
+                valid = true;
+                for (Py_ssize_t i = 0; valid && i < kwcount; i++) {
+                    PyObject *keyword = PyTuple_GET_ITEM(kwnames_o, i);
+                    if (!PyUnicode_CheckExact(keyword)) {
+                        valid = false;
+                        break;
+                    }
+                    int target = -1;
+                    for (int j = co->co_posonlyargcount; j < co->co_argcount; j++) {
+                        PyObject *varname = PyTuple_GET_ITEM(co->co_localsplusnames, j);
+                        if (keyword == varname || PyUnicode_Equal(keyword, varname)) {
+                            target = j;
+                            break;
+                        }
+                    }
+                    if (target < has_self || target < 0 || source_for_local[target] != -1) {
+                        valid = false;
+                        break;
+                    }
+                    source_for_local[target] = (int)(positional_stack_args + i);
+                }
+
+                if (has_self) {
+                    frame_args[0] = self_or_null;
+                }
+                for (int local = 0; valid && local < co->co_argcount; local++) {
+                    if (source_for_local[local] == -1) {
+                        valid = false;
+                        break;
+                    }
+                    if (local >= has_self) {
+                        int source = source_for_local[local];
+                        desired[local - has_self] = source;
+                        frame_args[local] = args[source];
+                    }
+                }
+            }
+        }
+
+        if (!valid) {
+            new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0));
+        }
+        else {
+            int current[256];
+            for (int i = 0; i < oparg; i++) {
+                current[i] = i;
+            }
+
+            ADD_OP(_CHECK_STACK_SPACE_OPERAND, 0, co->co_framesize);
+            ADD_OP(_POP_TOP, 0, 0);
+            for (int pos = 0; pos < oparg - 1; pos++) {
+                int source = desired[pos];
+                int source_pos = pos;
+                while (current[source_pos] != source) {
+                    source_pos++;
+                }
+                if (source_pos != pos) {
+                    int top = oparg - 1;
+                    if (source_pos != top) {
+                        ADD_OP(_SWAP, oparg - source_pos, 0);
+                        int temp = current[source_pos];
+                        current[source_pos] = current[top];
+                        current[top] = temp;
+                    }
+                    ADD_OP(_SWAP, oparg - pos, 0);
+                    int temp = current[pos];
+                    current[pos] = current[top];
+                    current[top] = temp;
+                }
+            }
+
+            ADD_OP(_INIT_CALL_PY_EXACT_ARGS, oparg, 0);
+            new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, frame_args, (int)total_args));
+        }
     }
 
     op(_PY_FRAME_EX, (func_st, null, callargs_st, kwargs_st -- ex_frame)) {
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
index 8895e02d47b1693..8fb695919b94cb8 100644
--- a/Python/optimizer_cases.c.h
+++ b/Python/optimizer_cases.c.h
@@ -5054,10 +5054,119 @@
         /* _DO_CALL_KW is not a viable micro-op for tier 2 */
 
         case _PY_FRAME_KW: {
+            JitOptRef kwnames;
+            JitOptRef *args;
+            JitOptRef self_or_null;
             JitOptRef callable;
             JitOptRef new_frame;
+            kwnames = stack_pointer[-1];
+            args = &stack_pointer[-1 - oparg];
+            self_or_null = stack_pointer[-2 - oparg];
             callable = stack_pointer[-3 - oparg];
-            new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0));
+            bool valid = false;
+            PyObject *func_o = sym_get_const(ctx, callable);
+            PyObject *kwnames_o = sym_get_const(ctx, kwnames);
+            bool has_self = sym_is_not_null(self_or_null);
+            PyCodeObject *co = NULL;
+            Py_ssize_t total_args = 0;
+            int desired[256];
+            JitOptRef frame_args[257];
+            if ((has_self || sym_is_null(self_or_null)) &&
+                func_o != NULL && PyFunction_Check(func_o) &&
+                kwnames_o != NULL && PyTuple_CheckExact(kwnames_o) &&
+                oparg <= 256)
+            {
+                PyFunctionObject *func = (PyFunctionObject *)func_o;
+                co = (PyCodeObject *)func->func_code;
+                Py_ssize_t kwcount = PyTuple_GET_SIZE(kwnames_o);
+                total_args = oparg + has_self;
+                Py_ssize_t positional_args = total_args - kwcount;
+                Py_ssize_t positional_stack_args = positional_args - has_self;
+                if ((co->co_flags & (CO_OPTIMIZED | CO_VARARGS | CO_VARKEYWORDS)) == CO_OPTIMIZED &&
+                    co->co_kwonlyargcount == 0 &&
+                    co->co_argcount == total_args &&
+                    positional_args >= has_self)
+                {
+                    int source_for_local[257];
+                    for (int i = 0; i < total_args; i++) {
+                        source_for_local[i] = -1;
+                    }
+                    if (has_self) {
+                        source_for_local[0] = -2;
+                    }
+                    for (int i = 0; i < positional_stack_args; i++) {
+                        source_for_local[has_self + i] = i;
+                    }
+                    valid = true;
+                    for (Py_ssize_t i = 0; valid && i < kwcount; i++) {
+                        PyObject *keyword = PyTuple_GET_ITEM(kwnames_o, i);
+                        if (!PyUnicode_CheckExact(keyword)) {
+                            valid = false;
+                            break;
+                        }
+                        int target = -1;
+                        for (int j = co->co_posonlyargcount; j < co->co_argcount; j++) {
+                            PyObject *varname = PyTuple_GET_ITEM(co->co_localsplusnames, j);
+                            if (keyword == varname || PyUnicode_Equal(keyword, varname)) {
+                                target = j;
+                                break;
+                            }
+                        }
+                        if (target < has_self || target < 0 || source_for_local[target] != -1) {
+                            valid = false;
+                            break;
+                        }
+                        source_for_local[target] = (int)(positional_stack_args + i);
+                    }
+                    if (has_self) {
+                        frame_args[0] = self_or_null;
+                    }
+                    for (int local = 0; valid && local < co->co_argcount; local++) {
+                        if (source_for_local[local] == -1) {
+                            valid = false;
+                            break;
+                        }
+                        if (local >= has_self) {
+                            int source = source_for_local[local];
+                            desired[local - has_self] = source;
+                            frame_args[local] = args[source];
+                        }
+                    }
+                }
+            }
+            if (!valid) {
+                new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, NULL, 0));
+            }
+            else {
+                int current[256];
+                for (int i = 0; i < oparg; i++) {
+                    current[i] = i;
+                }
+                ADD_OP(_CHECK_STACK_SPACE_OPERAND, 0, co->co_framesize);
+                ADD_OP(_POP_TOP, 0, 0);
+                for (int pos = 0; pos < oparg - 1; pos++) {
+                    int source = desired[pos];
+                    int source_pos = pos;
+                    while (current[source_pos] != source) {
+                        source_pos++;
+                    }
+                    if (source_pos != pos) {
+                        int top = oparg - 1;
+                        if (source_pos != top) {
+                            ADD_OP(_SWAP, oparg - source_pos, 0);
+                            int temp = current[source_pos];
+                            current[source_pos] = current[top];
+                            current[top] = temp;
+                        }
+                        ADD_OP(_SWAP, oparg - pos, 0);
+                        int temp = current[pos];
+                        current[pos] = current[top];
+                        current[top] = temp;
+                    }
+                }
+                ADD_OP(_INIT_CALL_PY_EXACT_ARGS, oparg, 0);
+                new_frame = PyJitRef_WrapInvalid(frame_new_from_symbol(ctx, callable, frame_args, (int)total_args));
+            }
             CHECK_STACK_BOUNDS(-2 - oparg);
             stack_pointer[-3 - oparg] = new_frame;
             stack_pointer += -2 - oparg;