@@ -250,6 +250,71 @@ PyDoc_STRVAR(heappushpop_doc,
250250from the heap. The combined action runs more efficiently than\n\
251251heappush() followed by a separate call to heappop()." );
252252
253+ static Py_ssize_t
254+ keep_top_bit (Py_ssize_t n )
255+ {
256+ int i = 0 ;
257+
258+ while (n > 1 ) {
259+ i += 1 ;
260+ n >>= 1 ;
261+ }
262+ return n << i ;
263+ }
264+
265+ /* Cache friendly version of heapify()
266+ -----------------------------------
267+
268+ Build-up a heap in O(n) time by performing siftup() operations
269+ on nodes whose children are already heaps.
270+
271+ The simplest way is to sift the nodes in reverse order from
272+ n//2-1 to 0 inclusive. The downside is that children may be
273+ out of cache by the time their parent is reached.
274+
275+ A better way is to not wait for the children to go out of cache.
276+ Once a sibling pair of child nodes have been sifted, immediately
277+ sift their parent node (while the children are still in cache).
278+
279+ Both ways build child heaps before their parents, so both ways
280+ do the exact same number of comparisons and produce exactly
281+ the same heap. The only difference is that the traversal
282+ order is optimized for cache efficiency.
283+ */
284+
285+ static PyObject *
286+ cache_friendly_heapify (PyObject * heap , int siftup_func (PyListObject * , Py_ssize_t ))
287+ {
288+ Py_ssize_t i , j , m , mhalf , leftmost ;
289+
290+ m = PyList_GET_SIZE (heap ) >> 1 ; /* index of first childless node */
291+ leftmost = keep_top_bit (m + 1 ) - 1 ; /* leftmost node in row of m */
292+ mhalf = m >> 1 ; /* parent of first childless node */
293+
294+ for (i = leftmost - 1 ; i >= mhalf ; i -- ) {
295+ j = i ;
296+ while (1 ) {
297+ if (siftup_func ((PyListObject * )heap , j ))
298+ return NULL ;
299+ if (!(j & 1 ))
300+ break ;
301+ j >>= 1 ;
302+ }
303+ }
304+
305+ for (i = m - 1 ; i >= leftmost ; i -- ) {
306+ j = i ;
307+ while (1 ) {
308+ if (siftup_func ((PyListObject * )heap , j ))
309+ return NULL ;
310+ if (!(j & 1 ))
311+ break ;
312+ j >>= 1 ;
313+ }
314+ }
315+ Py_RETURN_NONE ;
316+ }
317+
253318static PyObject *
254319heapify_internal (PyObject * heap , int siftup_func (PyListObject * , Py_ssize_t ))
255320{
@@ -260,7 +325,14 @@ heapify_internal(PyObject *heap, int siftup_func(PyListObject *, Py_ssize_t))
260325 return NULL ;
261326 }
262327
328+ /* For heaps likely to be bigger than L1 cache, we use the cache
329+ friendly heapify function. For smaller heaps that fit entirely
330+ in cache, we prefer the simpler algorithm with less branching.
331+ */
263332 n = PyList_GET_SIZE (heap );
333+ if (n > 10000 )
334+ return cache_friendly_heapify (heap , siftup_func );
335+
264336 /* Transform bottom-up. The largest index there's any point to
265337 looking at is the largest with a child index in-range, so must
266338 have 2*i + 1 < n, or i < (n-1)/2. If n is even = 2*j, this is
0 commit comments