Don't send commas to stage 2, avoid clmul in most cases #2049
Don't send commas to stage 2, avoid clmul in most cases #2049
Conversation
| borrow_out = result >= value1; | ||
| return result; | ||
| #else | ||
| return __builtin_subcll(value1, value2, borrow, &borrow); |
There was a problem hiding this comment.
At a glance, it looks like __builtin_subcll is LLVM specific?
It might be worth guarding its usage:
https://gcc.gnu.org/onlinedocs/cpp/_005f_005fhas_005fbuiltin.html
It might be worth examining alternatives:
https://godbolt.org/z/1WT9nPv6M
#include <cstdint>
using borrow_t = unsigned long long;

// __builtin_subcll is a compiler extension that is not available everywhere,
// so detect it instead of assuming it exists.
#if defined(__has_builtin)
#if __has_builtin(__builtin_subcll)
#define SUBTRACT_BORROW_HAVE_SUBCLL 1
#endif
#endif

/// Portable subtract-with-borrow: returns value1 - value2 - borrow (mod 2^64).
///
/// @param value1 minuend
/// @param value2 subtrahend
/// @param borrow in: incoming borrow; out: 1 if the subtraction wrapped, else 0
/// @return the low 64 bits of the difference
uint64_t subtract_borrow_manual(const uint64_t value1, const uint64_t value2, borrow_t& borrow) noexcept {
  // Subtract in two steps so each wrap-around can be detected on its own.
  // A single `result >= value1` test is wrong when nothing is subtracted
  // (value2 == 0, borrow == 0), where result == value1 but no borrow occurred.
  const uint64_t partial = value1 - value2;
  const uint64_t result = partial - borrow;
  const bool wrapped = (value1 < value2) || (partial < borrow);
  borrow = wrapped ? 1 : 0;
  return result;
}

/// Subtract-with-borrow: returns value1 - value2 - borrow (mod 2^64).
/// Uses __builtin_subcll when the compiler provides it and the portable
/// implementation otherwise.
///
/// @param value1 minuend
/// @param value2 subtrahend
/// @param borrow in: incoming borrow; out: outgoing borrow
/// @return the low 64 bits of the difference
uint64_t subtract_borrow(const uint64_t value1, const uint64_t value2, borrow_t& borrow) noexcept {
#ifdef SUBTRACT_BORROW_HAVE_SUBCLL
  return __builtin_subcll(value1, value2, borrow, &borrow);
#else
  return subtract_borrow_manual(value1, value2, borrow);
#endif
}
#if defined(_M_X64) || defined(__amd64__)
#include <x86intrin.h>
// visual studio has _subborrow_u64 in <intrin.h>
// https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
//
/// Subtract-with-borrow via the x86-64 intrinsic: returns
/// value1 - value2 - borrow (mod 2^64).
///
/// @param value1 minuend
/// @param value2 subtrahend
/// @param borrow in: incoming borrow; out: borrow flag reported by the intrinsic
/// @return the low 64 bits of the difference
uint64_t subtract_borrow_intel(const uint64_t value1, const uint64_t value2, uint8_t& borrow) {
  // Declared as unsigned long long so its address can be passed without a cast.
  unsigned long long result = 0;
  // _subborrow_u64(b_in, a, b, out) stores a - b - b_in, so the minuend
  // (value1) has to be the first operand after the incoming borrow.
  borrow = _subborrow_u64(borrow, value1, value2, &result);
  return result;
}
#endif
There was a problem hiding this comment.
It's entirely possible; I did construct these to be analogues of the add_overflow() implementation for the given architecture (i.e. pulling from the same libraries and using the same #ifdefs)
|
@lemire I made some more variants in this Godbolt. Just based on manual inspection of the assembly, if I had to choose a
/// Subtract-with-borrow built on the overflow-checking builtin: returns
/// value1 - value2 - borrow (mod 2^64).
///
/// @param value1 minuend
/// @param value2 subtrahend
/// @param borrow in: incoming borrow; out: true if the subtraction wrapped
/// @return the low 64 bits of the difference
uint64_t subtract_borrow_using_overflow_bool(const uint64_t value1, const uint64_t value2, bool& borrow) {
  // Subtract value2 and the incoming borrow separately. Folding them into
  // `value2 + borrow` first wraps to 0 when value2 == UINT64_MAX and borrow is
  // set, which silently drops the outgoing borrow.
  unsigned long long partial = 0;
  const bool wrapped_on_value = __builtin_usubll_overflow(value1, value2, &partial);
  unsigned long long result = 0;
  const bool wrapped_on_borrow = __builtin_usubll_overflow(partial, borrow, &result);
  borrow = wrapped_on_value || wrapped_on_borrow;
  return result;
}
/// Subtract-with-borrow via the x86-64 intrinsic, with the borrow held in a bool.
/// Passes value1 and value2, in that order, to _subborrow_u64 together with the
/// incoming borrow, and stores the flag it returns back into `borrow`.
///
/// @param value1 first operand handed to the intrinsic
/// @param value2 second operand handed to the intrinsic
/// @param borrow in: incoming borrow; out: flag returned by the intrinsic
/// @return the 64-bit value the intrinsic wrote
uint64_t subtract_borrow_intel_bool(const uint64_t value1, const uint64_t value2, bool& borrow) {
  // Already the pointee type the intrinsic expects, so no cast is needed.
  unsigned long long difference;
  const unsigned char borrow_out = _subborrow_u64(borrow, value1, value2, &difference);
  borrow = borrow_out;
  return difference;
}
// Hand-written subtract-with-borrow: returns value1 - value2 - borrow using
// wrapping unsigned arithmetic and stores `result >= value1` as the outgoing
// borrow.
// NOTE(review): `result >= value1` is also true when nothing is subtracted
// (value2 == 0 and borrow == false give result == value1), so a borrow is
// reported in that case — confirm whether callers can hit it.
uint64_t subtract_borrow_manual_bool(const uint64_t value1, const uint64_t value2, bool& borrow) noexcept {
uint64_t result = value1 - value2 - borrow; // unsigned, so underflow wraps modulo 2^64
borrow = result >= value1; // outgoing borrow; see NOTE(review) above
return result;
}subtract_borrow_using_overflow_bool(unsigned long, unsigned long, bool&):
# result = value1 - value2 - overflow
mov rax, rdi # value1
movzx ecx, byte ptr [rdx] # overflow
add rcx, rsi # overflow + value2
sub rax, rcx # value1 - (overflow + value2)
# overflow = result >= value1
setb byte ptr [rdx]
subtract_borrow_intel_bool(unsigned long, unsigned long, bool&):
# result = value1 - value2 - overflow
mov rax, rdi # value1
movzx ecx, byte ptr [rdx] # overflow
add cl, -1 # cl = overflow - 1; sets CF = (overflow != 0), i.e. loads the incoming borrow into the carry flag for sbb
sbb rax, rsi # value1 - value2 - overflow
# overflow = result >= value1
setb byte ptr [rdx]
subtract_borrow_manual_bool(unsigned long, unsigned long, bool&):
# result = value1 - (value2 + overflow)
movzx ecx, byte ptr [rdx] # overflow
add rcx, rsi # overflow + value2
sub rax, rcx # value1 - (value2 + overflow)
# overflow = (result >= value1)
cmp rax, rdi
setae byte ptr [rdx] |
|
Added some comments in the assembly for easier following. Bottom line on Ice Lake, once you use
Of course, running these in a performance test is the only way to know for sure, since the processor does some minor JIT-ish activities :) |
|
On ARM, it looks like |
|
And on GCC 13.2 Intel, everything pretty much looks the same as each other. |
|
Changing |
The algorithm detects all missing/extra separator errors in stage 1, and then doesn't send commas.