Skip to content

Commit a8082fe

Browse files
committed
ENH: Use Highway's VQSort on AArch64
Introduces Highway, a SIMD library from Google. Highway has it's own dispatch mechanism and SIMD flavour (which adds padding) which may be a good replacement for NumPy intrinsics. Highway also has its own set of vectorised Math routines we could bolt on. For this patch, I've integrated the VQSort algorithm only on AArch64 so as not to impact other architectures. The VQSort algorithms performance is comparable to the AVX512 algorithms, according to this report: https://github.com/Voultapher/sort-research-rs/blob/main/writeup/intel_avx512/text.md Performance improvements in the NumPy benchmark suite are as follows: ``` before after ratio [f4b52d53] [f08058d4] <main-setuppy> <highway-sort> + 53.0±0.08μs 221±0.4μs 4.18 bench_function_base.Sort.time_sort('quick', 'int64', ('ordered',)) + 84.8±0.3μs 222±0.3μs 2.62 bench_function_base.Sort.time_sort('quick', 'int64', ('reversed',)) + 89.0±0.2μs 197±0.2μs 2.21 bench_function_base.Sort.time_sort('quick', 'float64', ('ordered',)) + 51.5±0.01μs 80.0±0.1μs 1.55 bench_function_base.Sort.time_sort('quick', 'uint32', ('ordered',)) + 52.0±0.07μs 80.3±0.2μs 1.55 bench_function_base.Sort.time_sort('quick', 'int32', ('ordered',)) + 148±0.3μs 197±0.2μs 1.33 bench_function_base.Sort.time_sort('quick', 'float64', ('reversed',)) + 471±0.2μs 598±0.5μs 1.27 bench_function_base.Sort.time_argsort('heap', 'float64', ('ordered',)) + 536±0.7μs 644±0.8μs 1.20 bench_function_base.Sort.time_argsort('heap', 'float64', ('reversed',)) + 39.8±0.02μs 47.4±0.05μs 1.19 bench_function_base.Sort.time_argsort('merge', 'int32', ('sorted_block', 1000)) + 39.8±0.01μs 47.4±0.02μs 1.19 bench_function_base.Sort.time_argsort('merge', 'uint32', ('sorted_block', 1000)) + 40.1±0.01μs 47.6±0.04μs 1.19 bench_function_base.Sort.time_argsort('merge', 'int64', ('sorted_block', 1000)) + 27.9±0.03μs 32.6±0.03μs 1.17 bench_function_base.Sort.time_argsort('heap', 'uint32', ('uniform',)) + 685±0.5μs 796±0.8μs 1.16 bench_function_base.Sort.time_argsort('heap', 'float64', ('sorted_block', 10)) + 68.7±0.05μs 78.4±0.04μs 1.14 bench_function_base.Sort.time_argsort('merge', 'uint32', ('sorted_block', 100)) + 68.7±0.05μs 78.3±0.1μs 1.14 bench_function_base.Sort.time_argsort('merge', 'int32', ('sorted_block', 100)) + 69.4±0.03μs 78.8±0.05μs 1.14 bench_function_base.Sort.time_argsort('merge', 'int64', ('sorted_block', 100)) + 22.0±0.01μs 24.7±0.01μs 1.12 bench_function_base.Sort.time_sort('heap', 'uint32', ('uniform',)) + 674±0.7μs 743±0.4μs 1.10 bench_function_base.Sort.time_argsort('heap', 'float64', ('sorted_block', 1000)) + 730±0.6μs 794±0.8μs 1.09 bench_function_base.Sort.time_argsort('heap', 'float64', ('sorted_block', 100)) + 464±2μs 492±10μs 1.06 bench_function_base.Sort.time_argsort('heap', 'int32', ('reversed',)) + 95.5±0.1μs 101±0.1μs 1.06 bench_function_base.Sort.time_argsort('quick', 'float64', ('ordered',)) + 839±1μs 886±2μs 1.06 bench_function_base.Sort.time_argsort('heap', 'float64', ('random',)) + 95.7±0.1μs 101±0.05μs 1.06 bench_function_base.Sort.time_argsort('quick', 'float32', ('ordered',)) - 516±0.6μs 488±0.9μs 0.95 bench_function_base.Sort.time_sort('heap', 'float32', ('ordered',)) - 532±1μs 499±5μs 0.94 bench_function_base.Sort.time_argsort('heap', 'int16', ('reversed',)) - 34.5±0.05μs 32.2±0.03μs 0.93 bench_function_base.Sort.time_sort('merge', 'int32', ('sorted_block', 1000)) - 715±1μs 666±0.4μs 0.93 bench_function_base.Sort.time_sort('heap', 'float64', ('random',)) - 254±2μs 231±0.2μs 0.91 bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 1000)) - 655±0.5μs 592±0.3μs 0.90 bench_function_base.Sort.time_sort('heap', 'float64', ('sorted_block', 100)) - 24.7±0.02μs 22.0±0.07μs 0.89 bench_function_base.Sort.time_sort('heap', 'int32', ('uniform',)) - 621±0.7μs 549±0.3μs 0.88 bench_function_base.Sort.time_sort('heap', 'float64', ('sorted_block', 1000)) - 659±0.9μs 555±0.5μs 0.84 bench_function_base.Sort.time_sort('heap', 'float64', ('sorted_block', 10)) - 545±0.4μs 444±0.5μs 0.81 bench_function_base.Sort.time_sort('heap', 'float64', ('reversed',)) - 511±0.5μs 396±0.4μs 0.78 bench_function_base.Sort.time_sort('heap', 'float64', ('ordered',)) - 316±0.6μs 225±0.2μs 0.71 bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 10)) - 333±0.4μs 233±0.3μs 0.70 bench_function_base.Sort.time_sort('quick', 'int64', ('sorted_block', 100)) - 323±0.2μs 206±0.2μs 0.64 bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 1000)) - 134±0.9μs 85.0±0.07μs 0.64 bench_function_base.Sort.time_sort('quick', 'float32', ('reversed',)) - 376±0.9μs 228±0.3μs 0.61 bench_function_base.Sort.time_sort('quick', 'int64', ('random',)) - 418±0.4μs 207±0.4μs 0.50 bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 100)) - 413±2μs 199±0.2μs 0.48 bench_function_base.Sort.time_sort('quick', 'float64', ('sorted_block', 10)) - 64.4±0.3ms 28.6±0.3ms 0.44 bench_function_base.Sort.time_sort_worst - 501±2μs 204±0.3μs 0.41 bench_function_base.Sort.time_sort('quick', 'float64', ('random',)) - 248±1μs 86.9±0.2μs 0.35 bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 1000)) - 251±0.7μs 87.0±0.2μs 0.35 bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 1000)) - 329±0.9μs 91.4±0.1μs 0.28 bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 1000)) - 329±1μs 85.2±0.06μs 0.26 bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 100)) - 329±0.7μs 85.0±0.06μs 0.26 bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 100)) - 315±0.5μs 80.4±0.05μs 0.26 bench_function_base.Sort.time_sort('quick', 'int32', ('sorted_block', 10)) - 320±1μs 80.3±0.09μs 0.25 bench_function_base.Sort.time_sort('quick', 'uint32', ('sorted_block', 10)) - 372±0.4μs 82.1±0.06μs 0.22 bench_function_base.Sort.time_sort('quick', 'int32', ('random',)) - 414±0.4μs 89.6±0.1μs 0.22 bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 100)) - 384±1μs 81.8±0.02μs 0.21 bench_function_base.Sort.time_sort('quick', 'uint32', ('random',)) - 415±0.3μs 84.8±0.2μs 0.20 bench_function_base.Sort.time_sort('quick', 'float32', ('sorted_block', 10)) - 491±2μs 86.3±0.1μs 0.18 bench_function_base.Sort.time_sort('quick', 'float32', ('random',)) - 111±0.4μs 13.0±0.01μs 0.12 bench_function_base.Sort.time_sort('quick', 'float64', ('uniform',)) - 50.9±0.03μs 4.93±0.01μs 0.10 bench_function_base.Sort.time_sort('quick', 'int64', ('uniform',)) - 108±0.3μs 7.25±0.02μs 0.07 bench_function_base.Sort.time_sort('quick', 'float32', ('uniform',)) - 51.7±0.05μs 3.43±0.01μs 0.07 bench_function_base.Sort.time_sort('quick', 'int32', ('uniform',)) - 51.3±0.07μs 3.35±0.06μs 0.07 bench_function_base.Sort.time_sort('quick', 'uint32', ('uniform',)) ```
1 parent 7e77d36 commit a8082fe

8 files changed

Lines changed: 76 additions & 19 deletions

File tree

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@
1010
[submodule "vendored-meson/meson"]
1111
path = vendored-meson/meson
1212
url = https://github.com/numpy/meson.git
13+
[submodule "numpy/_core/src/highway"]
14+
path = numpy/_core/src/highway
15+
url = https://github.com/google/highway.git

numpy/_core/src/highway

Submodule highway added at 65d30ea

numpy/_core/src/npysort/quicksort.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ inline bool quicksort_dispatch(T *start, npy_intp num)
8484
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
8585
}
8686
else if (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint64_t)) {
87-
#ifndef NPY_DISABLE_OPTIMIZATION
87+
#if !defined(NPY_DISABLE_OPTIMIZATION) && !(defined(NPY_HAVE_ASIMD) && !defined(NPY_CAN_LINK_HIGHWAY))
8888
#include "simd_qsort.dispatch.h"
8989
#endif
9090
NPY_CPU_DISPATCH_CALL_XB(dispfunc = np::qsort_simd::template QSort, <TF>);
@@ -105,7 +105,7 @@ inline bool aquicksort_dispatch(T *start, npy_intp* arg, npy_intp num)
105105
using TF = typename np::meta::FixedWidth<T>::Type;
106106
void (*dispfunc)(TF*, npy_intp*, npy_intp) = nullptr;
107107
#ifndef NPY_DISABLE_OPTIMIZATION
108-
#include "simd_qsort.dispatch.h"
108+
#include "simd_argsort.dispatch.h"
109109
#endif
110110
/* x86-simd-sort uses 8-byte int to store arg values, npy_intp is 4 bytes
111111
* in 32-bit*/
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*@targets
2+
* $maxopt $keep_baseline
3+
* avx512_skx
4+
*/
5+
// policy $keep_baseline is used to avoid skip building avx512_skx
6+
// when its part of baseline features (--cpu-baseline), since
7+
// 'baseline' option isn't specified within targets.
8+
9+
#include "simd_qsort.hpp"
10+
11+
#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
12+
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
13+
#endif
14+
15+
namespace np { namespace qsort_simd {
16+
17+
#if defined(NPY_HAVE_AVX512_SKX) && !defined(_MSC_VER)
18+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
19+
{
20+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
21+
}
22+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
23+
{
24+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
25+
}
26+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
27+
{
28+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
29+
}
30+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
31+
{
32+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
33+
}
34+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
35+
{
36+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
37+
}
38+
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
39+
{
40+
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
41+
}
42+
#endif
43+
44+
}} // namespace np::simd

numpy/_core/src/npysort/simd_qsort.dispatch.cpp

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
/*@targets
2-
* $maxopt $keep_baseline avx512_skx
2+
* $maxopt $keep_baseline
3+
* avx512_skx
4+
* asimd
35
*/
46
// policy $keep_baseline is used to avoid skip building avx512_skx
57
// when its part of baseline features (--cpu-baseline), since
@@ -11,7 +13,8 @@
1113
#if defined(NPY_HAVE_AVX512_SKX)
1214
#include "x86-simd-sort/src/avx512-32bit-qsort.hpp"
1315
#include "x86-simd-sort/src/avx512-64bit-qsort.hpp"
14-
#include "x86-simd-sort/src/avx512-64bit-argsort.hpp"
16+
#elif defined(NPY_HAVE_ASIMD)
17+
#include "hwy/contrib/sort/vqsort.h"
1518
#endif
1619

1720
namespace np { namespace qsort_simd {
@@ -89,31 +92,32 @@ template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
8992
{
9093
avx512_qsort(arr, size);
9194
}
92-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int32_t *arr, npy_intp *arg, npy_intp size)
95+
#elif defined(NPY_HAVE_ASIMD) && defined(NPY_CAN_LINK_HIGHWAY)
96+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int32_t *arr, intptr_t size)
9397
{
94-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
98+
hwy::VQSort(arr, size, hwy::SortAscending());
9599
}
96-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint32_t *arr, npy_intp *arg, npy_intp size)
100+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint32_t *arr, intptr_t size)
97101
{
98-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
102+
hwy::VQSort(arr, size, hwy::SortAscending());
99103
}
100-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(int64_t *arr, npy_intp *arg, npy_intp size)
104+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(int64_t *arr, intptr_t size)
101105
{
102-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
106+
hwy::VQSort(arr, size, hwy::SortAscending());
103107
}
104-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(uint64_t *arr, npy_intp *arg, npy_intp size)
108+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(uint64_t *arr, intptr_t size)
105109
{
106-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
110+
hwy::VQSort(arr, size, hwy::SortAscending());
107111
}
108-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(float *arr, npy_intp *arg, npy_intp size)
112+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(float *arr, intptr_t size)
109113
{
110-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
114+
hwy::VQSort(arr, size, hwy::SortAscending());
111115
}
112-
template<> void NPY_CPU_DISPATCH_CURFX(ArgQSort)(double *arr, npy_intp *arg, npy_intp size)
116+
template<> void NPY_CPU_DISPATCH_CURFX(QSort)(double *arr, intptr_t size)
113117
{
114-
avx512_argsort(arr, reinterpret_cast<int64_t*>(arg), size);
118+
hwy::VQSort(arr, size, hwy::SortAscending());
115119
}
116-
#endif // NPY_HAVE_AVX512_SKX
120+
#endif
117121

118122
}} // namespace np::simd
119123

numpy/_core/src/npysort/simd_qsort.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ namespace np { namespace qsort_simd {
1010
#endif
1111
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSort, (T *arr, intptr_t size))
1212
NPY_CPU_DISPATCH_DECLARE(template <typename T> void QSelect, (T* arr, npy_intp num, npy_intp kth))
13+
14+
#ifndef NPY_DISABLE_OPTIMIZATION
15+
#include "simd_argsort.dispatch.h"
16+
#endif
1317
NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSort, (T *arr, npy_intp* arg, npy_intp size))
1418
NPY_CPU_DISPATCH_DECLARE(template <typename T> void ArgQSelect, (T *arr, npy_intp* arg, npy_intp kth, npy_intp size))
1519

numpy/_core/src/npysort/simd_qsort_16bit.dispatch.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*@targets
2-
* $maxopt $keep_baseline avx512_icl avx512_spr
2+
* $maxopt $keep_baseline
3+
* avx512_icl avx512_spr
34
*/
45
// policy $keep_baseline is used to avoid skip building avx512_skx
56
// when its part of baseline features (--cpu-baseline), since

0 commit comments

Comments
 (0)