Skip to content

Commit bc33e57

Browse files
committed
Issue #24155: Optimize heapify for better cache utilization.
1 parent a33df31 commit bc33e57

2 files changed

Lines changed: 75 additions & 0 deletions

File tree

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ Library
4141
- Issue #21795: smtpd now supports the 8BITMIME extension whenever
4242
the new *decode_data* constructor argument is set to False.
4343

44+
- Issue #24155: optimize heapq.heapify() for better cache performance
45+
when heapifying large lists.
46+
4447
- Issue #21800: imaplib now supports RFC 5161 (enable), RFC 6855
4548
(utf8/internationalized email) and automatically encodes non-ASCII
4649
usernames and passwords to UTF8.

Modules/_heapqmodule.c

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,71 @@ PyDoc_STRVAR(heappushpop_doc,
250250
from the heap. The combined action runs more efficiently than\n\
251251
heappush() followed by a separate call to heappop().");
252252

253+
static Py_ssize_t
254+
keep_top_bit(Py_ssize_t n)
255+
{
256+
int i = 0;
257+
258+
while (n > 1) {
259+
i += 1;
260+
n >>= 1;
261+
}
262+
return n << i;
263+
}
264+
265+
/* Cache friendly version of heapify()
266+
-----------------------------------
267+
268+
Build up a heap in O(n) time by performing siftup() operations
269+
on nodes whose children are already heaps.
270+
271+
The simplest way is to sift the nodes in reverse order from
272+
n//2-1 to 0 inclusive. The downside is that children may be
273+
out of cache by the time their parent is reached.
274+
275+
A better way is to not wait for the children to go out of cache.
276+
Once a sibling pair of child nodes have been sifted, immediately
277+
sift their parent node (while the children are still in cache).
278+
279+
Both ways build child heaps before their parents, so both ways
280+
do the exact same number of comparisons and produce exactly
281+
the same heap. The only difference is that the traversal
282+
order is optimized for cache efficiency.
283+
*/
284+
285+
static PyObject *
286+
cache_friendly_heapify(PyObject *heap, int siftup_func(PyListObject *, Py_ssize_t))
287+
{
288+
Py_ssize_t i, j, m, mhalf, leftmost;
289+
290+
m = PyList_GET_SIZE(heap) >> 1; /* index of first childless node */
291+
leftmost = keep_top_bit(m + 1) - 1; /* leftmost node in row of m */
292+
mhalf = m >> 1; /* parent of first childless node */
293+
294+
for (i = leftmost - 1 ; i >= mhalf ; i--) {
295+
j = i;
296+
while (1) {
297+
if (siftup_func((PyListObject *)heap, j))
298+
return NULL;
299+
if (!(j & 1))
300+
break;
301+
j >>= 1;
302+
}
303+
}
304+
305+
for (i = m - 1 ; i >= leftmost ; i--) {
306+
j = i;
307+
while (1) {
308+
if (siftup_func((PyListObject *)heap, j))
309+
return NULL;
310+
if (!(j & 1))
311+
break;
312+
j >>= 1;
313+
}
314+
}
315+
Py_RETURN_NONE;
316+
}
317+
253318
static PyObject *
254319
heapify_internal(PyObject *heap, int siftup_func(PyListObject *, Py_ssize_t))
255320
{
@@ -260,7 +325,14 @@ heapify_internal(PyObject *heap, int siftup_func(PyListObject *, Py_ssize_t))
260325
return NULL;
261326
}
262327

328+
/* For heaps likely to be bigger than L1 cache, we use the cache
329+
friendly heapify function. For smaller heaps that fit entirely
330+
in cache, we prefer the simpler algorithm with less branching.
331+
*/
263332
n = PyList_GET_SIZE(heap);
333+
if (n > 10000)
334+
return cache_friendly_heapify(heap, siftup_func);
335+
264336
/* Transform bottom-up. The largest index there's any point to
265337
looking at is the largest with a child index in-range, so must
266338
have 2*i + 1 < n, or i < (n-1)/2. If n is even = 2*j, this is

0 commit comments

Comments
 (0)