"""Combined pyperf benchmark for the PyObject_CallMethod optimization.

PyObject_CallMethod is a C-API entry point with no direct Python equivalent
(`obj.m()` compiles to LOAD_METHOD/CALL), so the per-call cost is measured two
ways:

* Isolated: via the _testinternalcapi.bench_callmethod harness, which loops
  the raw C-API call. Each measured op is exactly one CallMethod-family call.
  (The harness is a local build artifact; these cases are skipped if absent.)
* End-to-end: io's IOBase.readline() calls _PyObject_CallMethod(self, read, 1)
  once per byte, so reading a line is a pure-Python path that hits it ~2000x.

Run:

    python bench_callmethod_pyperf.py -o new.json
    python bench_callmethod_pyperf.py --compare-to old.json
"""
import datetime
import io
import pyperf
# The isolated cases need the bench_callmethod harness, which only exists in a
# locally built _testinternalcapi. Fall back to None (module missing, or module
# present without the harness) so those cases can be skipped.
try:
    import _testinternalcapi
except ImportError:
    _bench = None
else:
    _bench = getattr(_testinternalcapi, "bench_callmethod", None)
# --- fixture objects shared by the isolated cases ---
# A fixed-offset (+05:30) tzinfo and an aware datetime that carries it.
tz = datetime.timezone(datetime.timedelta(minutes=5 * 60 + 30))
dt = datetime.datetime(
    year=2024, month=6, day=1, hour=12, minute=0, tzinfo=tz,
)
class PyObj:
    """Minimal pure-Python receiver used by the "py method" isolated case."""

    def m(self, x):
        """Return *x* unchanged."""
        return x
# Isolated cases, one tuple per benchmark: (label, obj, name, arg, mode).
# `mode` selects the C-API entry point the harness drives:
#   0 = PyObject_CallMethod with an "O" format
#   2 = PyObject_CallMethod with NULL (the "no args" case)
#   3 = _PyObject_CallMethod
ISOLATED = [
    (
        "CallMethod / C method tzinfo.utcoffset(dt)",
        tz,
        "utcoffset",
        dt,
        0,
    ),
    (
        "CallMethod / C method {}.get(k)",
        {},
        "get",
        "k",
        0,
    ),
    (
        "CallMethod / C method '..'.count(x)",
        "hello world",
        "count",
        "l",
        0,
    ),
    (
        "CallMethod / py method obj.m(x)",
        PyObj(),
        "m",
        5,
        0,
    ),
    (
        "CallMethod / no args (10**6).bit_length()",
        10 ** 6,
        "bit_length",
        0,
        2,
    ),
    (
        "_PyObject_CallMethod tzinfo.utcoffset(dt)",
        tz,
        "utcoffset",
        dt,
        3,
    ),
]
def make_iso(obj, name, arg, mode):
    """Build a pyperf time function for one isolated harness case.

    The returned callable accepts pyperf's ``loops`` count, passes it to the
    ``_bench`` harness together with *obj*, *name*, *arg* and *mode*, and
    returns the elapsed time of that single harness call as measured by
    ``pyperf.perf_counter()``.
    """
    def run(loops):
        start = pyperf.perf_counter()
        _bench(obj, name, arg, loops, mode)
        return pyperf.perf_counter() - start

    return run
# --- end-to-end io case (pure Python, no harness needed) ---
# Payload length of the benchmark line; bench_io appends one newline to it,
# and the driver passes this value as inner_loops.
LINE_LEN = 2000
class Raw(io.RawIOBase):
    """Unbuffered, read-only in-memory stream over a bytes-like object.

    Only ``readable()`` and ``readinto()`` are defined here; ``read()``,
    ``readline()`` and the rest come from ``io.RawIOBase``. ``pos`` is a
    plain attribute, so a caller can rewind the stream by assigning to it.
    """

    def __init__(self, data):
        super().__init__()
        self.data = data
        self.pos = 0

    def readable(self):
        """Report that the stream supports reading."""
        return True

    def readinto(self, b):
        """Copy up to ``len(b)`` bytes from the current position into *b*.

        Returns the number of bytes copied and advances ``pos`` by that
        amount; returns 0 once the data is exhausted.
        """
        remaining = len(self.data) - self.pos
        if remaining <= 0:
            return 0
        count = min(len(b), remaining)
        b[:count] = self.data[self.pos:self.pos + count]
        self.pos += count
        return count
def bench_io(loops):
    """Time *loops* calls of ``readline()`` on a ``Raw`` stream.

    The stream holds a single line of ``LINE_LEN`` bytes plus a newline and
    is rewound before every read. The payload and stream are built before
    the timer starts, so only the read loop is measured.

    Returns the elapsed time as measured by ``pyperf.perf_counter()``.
    """
    payload = b"a" * LINE_LEN + b"\n"
    stream = Raw(payload)
    start = pyperf.perf_counter()
    for _ in range(loops):
        stream.pos = 0
        stream.readline()
    return pyperf.perf_counter() - start
# A second, different pure-Python path: bisect.insort on a list *subclass* goes
# through _PyObject_CallMethod(a, "insert", "nO", index, x) -- exact lists take a
# PyList_Insert fast path that bypasses the call. (Distinct "nO" / 2-arg path.)
import bisect
class SubList(list):
    """``list`` subclass with no overrides.

    It exists only so that instances are not *exact* lists: bisect.insort
    takes a PyList_Insert fast path for exact lists, whereas a subclass goes
    through the ``insert`` method call that this benchmark wants to measure.
    """
def bench_insort(loops):
    """Time *loops* insert-then-remove round trips on a small ``SubList``.

    Each iteration inserts 15 with ``bisect.insort_right`` and then deletes
    it again, so the list has the same contents at the start of every
    iteration. The deletion is inside the timed region.

    Returns the elapsed time as measured by ``pyperf.perf_counter()``.
    """
    a = SubList(range(0, 32, 2))  # small even list, keeps the insert shift cheap
    start = pyperf.perf_counter()
    for _ in range(loops):
        bisect.insort_right(a, 15)  # -> _PyObject_CallMethod(a, "insert", ...)
        del a[bisect.bisect_left(a, 15)]  # remove it again to keep 'a' stable
    return pyperf.perf_counter() - start
if __name__ == "__main__":
    runner = pyperf.Runner()

    # The isolated cases need the optional C harness; skip them without it.
    if _bench is not None:
        for label, obj, name, arg, mode in ISOLATED:
            runner.bench_time_func(label, make_iso(obj, name, arg, mode))

    # The two end-to-end cases are pure Python and always run.
    # inner_loops=LINE_LEN scales the io result towards a per-call figure.
    # NOTE(review): the benchmark line is LINE_LEN bytes plus a newline, so
    # if readline() really issues one call per byte this under-counts by one
    # call per line; kept as-is so results stay comparable with older runs.
    runner.bench_time_func(
        "io.readline (per _PyObject_CallMethod)",
        bench_io,
        inner_loops=LINE_LEN,
    )
    runner.bench_time_func(
        "bisect.insort (list subclass, _PyObject_CallMethod)",
        bench_insort,
    )
Feature or enhancement
Proposal:
`PyObject_CallMethod()` (and its siblings `_PyObject_CallMethod()`, `PyEval_CallMethod()`, `_PyObject_CallMethodId()` and `_PyObject_CallMethod_SizeT()`) look the method up with `PyObject_GetAttr()`, which materialises a temporary bound-method object (`PyMethodObject`) on every call, then call it and throw it away.

The interpreter itself avoids this for `obj.meth(...)` by using `_PyObject_GetMethod()` (the `LOAD_METHOD` lookup), which returns the unbound function plus a flag; the interpreter then calls that function with `self` prepended.

In `PyObject_CallMethod()` we can resolve the method with `_PyObject_GetMethod()` and call it directly (`_PyObject_VectorcallPrepend` when unbound). The now-unused `callmethod()` and `_PyObject_CallMethodFormat()` helpers are removed.

Benchmarked cases (only the row labels of the results table are present in this text; the timing columns are missing):

- `CallMethod` / C method `tzinfo.utcoffset(dt)`
- `CallMethod` / C method `{}.get(k)`
- `CallMethod` / C method `"..".count(x)`
- `CallMethod` / py method `obj.m(x)`
- `CallMethod` / no args `(10**6).bit_length()`
- `_PyObject_CallMethod` (interned name) `tzinfo.utcoffset(dt)`
- `io.readline`
- `bisect.insort` (list subclass)

The `CallMethod` rows are isolated per-call measurements via a small `_testinternalcapi` harness; `io.readline` and `bisect.insort` are pure-Python paths that reach the C-API indirectly. The performance can also impact C-extensions that use the `PyObject_CallMethod` interface.

Benchmark script (pyperf)
Has this already been discussed elsewhere?
No response given
Links to previous discussion of this feature:
No response
Linked PRs