diff --git a/README.md b/README.md index 30adaa8..d342dc5 100644 --- a/README.md +++ b/README.md @@ -9,29 +9,31 @@ Being small and portable, the goal is to be able to use ChibiHash as a good Some key features: -* Small: ~60 loc in C +* Small: ~65 loc in C * Fast: See benchmark table below * Portable: Doesn't use hardware specific instructions (e.g SSE) -* Good Quality: Passes [smhasher][], so should be good quality (I think) +* Good Quality: Passes [smhasher][] and [smhasher3][], so should be good quality (I think) * Unencumbered: Released into the public domain * Free of undefined behavior and gives same result regardless of host system's endianness. * Non-cryptographic -Here's some benchmark against other similar hash functions: +Here's some benchmark (made via smhasher3) against other similar themed hash functions: -| Name | Large input (GiB/sec) | Small input (Cycles/Hash) | -| :--- | :-------------------------: | :------------------------: | -| chibihash64 | **18.08** | 49 | -| xxhash64 | 12.59 | 50 | -| city64 | 14.95 | **35** | -| spooky64 | 13.83 | 59 | +| Name | Large input (GiB/sec) | Small input (Cycles/Hash) | +| :--- | :-------------------------: | :------------------------: | +| chibihash64 | **24.20** | 34 | +| xxhash64 | 15.10 | 50 | +| city64 | 18.30 | 47 | +| spooky64 | 16.68 | 70 | +| rapidhash.protected 1 | 21.50 | **32** | +| polymur-hash 1, 2 | 13.82 | 43 | -It's the fastest of the bunch for large string throughput. -For small string (< 32 bytes), cityhash beats it - worth noting that cityhash -has [hardcoded special cases][city-small] for input below or equal 32 bytes. +1. Requires compiler/cpu support for retrieving the full 128 bit result of a + 64x64 bit multiply. +2. Universal, but has a complicated seeding step. 
[smhasher]: https://github.com/aappleby/smhasher -[city-small]: https://github.com/google/cityhash/blob/f5dc54147fcce12cefd16548c8e760d68ac04226/src/city.cc#L367-L375 +[smhasher3]: https://gitlab.com/fwojcik/smhasher3 ## When NOT to use @@ -47,3 +49,16 @@ Here are some reasons to avoid using this: ## Unofficial ports A list of unofficial ports to other languages is [maintained here](https://github.com/N-R-K/ChibiHash/issues/4). + +## Changelog + +### v2 + +- Faster performance on short string (42 cycles/hash vs 34 cycles/hash). + The tail end handling has been reworked entirely with some inspiration from + wyhash's short input reading. +- Better seeding. v1 seed only affected 64 bits of the initial state. + v2 seed affects the full 256 bits. This allows it to pass smhasher3's + SeedBlockLen and SeedBlockOffset tests. +- Slightly better mixing in bulk handling. +- Passes all 252 tests in smhasher3 (commit 34093a3), v1 failed 3. diff --git a/chibihash64.h b/chibihash64.h index c061c42..45a3181 100644 --- a/chibihash64.h +++ b/chibihash64.h @@ -1,75 +1,82 @@ -// small, fast 64 bit hash function. +#ifndef CHIBIHASH64__HGUARD +#define CHIBIHASH64__HGUARD +// small, fast 64 bit hash function (version 2). // // https://github.com/N-R-K/ChibiHash -// https://nrk.neocities.org/articles/chibihash // // This is free and unencumbered software released into the public domain. 
// For more information, please refer to <https://unlicense.org/> -#pragma once #include <stdint.h> #include <stddef.h> -static inline uint64_t -chibihash64__load64le(const uint8_t *p) +static inline uint64_t chibihash64__load32le(const uint8_t *p) { return (uint64_t)p[0] << 0 | (uint64_t)p[1] << 8 | - (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24 | - (uint64_t)p[4] << 32 | (uint64_t)p[5] << 40 | - (uint64_t)p[6] << 48 | (uint64_t)p[7] << 56; + (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24; +} +static inline uint64_t chibihash64__load64le(const uint8_t *p) +{ + return chibihash64__load32le(p) | (chibihash64__load32le(p+4) << 32); +} +static inline uint64_t chibihash64__rotl(uint64_t x, int n) +{ + return (x << n) | (x >> (-n & 63)); } static inline uint64_t chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) { - const uint8_t *k = (const uint8_t *)keyIn; + const uint8_t *p = (const uint8_t *)keyIn; ptrdiff_t l = len; - const uint64_t P1 = UINT64_C(0x2B7E151628AED2A5); - const uint64_t P2 = UINT64_C(0x9E3793492EEDC3F7); - const uint64_t P3 = UINT64_C(0x3243F6A8885A308D); - - uint64_t h[4] = { P1, P2, P3, seed }; + const uint64_t K = UINT64_C(0x2B7E151628AED2A7); // digits of e + uint64_t seed2 = chibihash64__rotl(seed-K, 15) + chibihash64__rotl(seed-K, 47); + uint64_t h[4] = { seed, seed+K, seed2, seed2+(K*K^K) }; - // unrolling gives very slight speed boost on large inputs at the cost - // of larger code size. typically not worth the trade off as larger - // code-size hinders inlinability as well - // #pragma GCC unroll 2 + // depending on your system unrolling might (or might not) make things + // a tad bit faster on large strings. on my system, it actually makes + // things slower. + // generally speaking, the cost of bigger code size is usually not + // worth the trade-off since larger code-size will hinder inlinability + // but depending on your needs, you may want to uncomment the pragma + // below to unroll the loop. 
+ //#pragma GCC unroll 2 for (; l >= 32; l -= 32) { - for (int i = 0; i < 4; ++i, k += 8) { - uint64_t lane = chibihash64__load64le(k); - h[i] ^= lane; - h[i] *= P1; - h[(i+1)&3] ^= ((lane << 40) | (lane >> 24)); + for (int i = 0; i < 4; ++i, p += 8) { + uint64_t stripe = chibihash64__load64le(p); + h[i] = (stripe + h[i]) * K; + h[(i+1)&3] += chibihash64__rotl(stripe, 27); } } - h[0] += ((uint64_t)len << 32) | ((uint64_t)len >> 32); - if (l & 1) { - h[0] ^= k[0]; - --l, ++k; + for (; l >= 8; l -= 8, p += 8) { + h[0] ^= chibihash64__load32le(p+0); h[0] *= K; + h[1] ^= chibihash64__load32le(p+4); h[1] *= K; } - h[0] *= P2; h[0] ^= h[0] >> 31; - for (int i = 1; l >= 8; l -= 8, k += 8, ++i) { - h[i] ^= chibihash64__load64le(k); - h[i] *= P2; h[i] ^= h[i] >> 31; + if (l >= 4) { + h[2] ^= chibihash64__load32le(p); + h[3] ^= chibihash64__load32le(p + l - 4); + } else if (l > 0) { + h[2] ^= p[0]; + h[3] ^= p[l/2] | ((uint64_t)p[l-1] << 8); } - for (int i = 0; l > 0; l -= 2, k += 2, ++i) { - h[i] ^= (k[0] | ((uint64_t)k[1] << 8)); - h[i] *= P3; h[i] ^= h[i] >> 31; - } + h[0] += chibihash64__rotl(h[2] * K, 31) ^ (h[2] >> 31); + h[1] += chibihash64__rotl(h[3] * K, 31) ^ (h[3] >> 31); + h[0] *= K; h[0] ^= h[0] >> 31; + h[1] += h[0]; - uint64_t x = seed; - x ^= h[0] * ((h[2] >> 32)|1); - x ^= h[1] * ((h[3] >> 32)|1); - x ^= h[2] * ((h[0] >> 32)|1); - x ^= h[3] * ((h[1] >> 32)|1); + uint64_t x = (uint64_t)len * K; + x ^= chibihash64__rotl(x, 29); + x += seed; + x ^= h[1]; - // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html - x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653); - x ^= x >> 33; x *= UINT64_C(0x1C69B3F74AC4AE35); - x ^= x >> 27; + x ^= chibihash64__rotl(x, 15) ^ chibihash64__rotl(x, 42); + x *= K; + x ^= chibihash64__rotl(x, 13) ^ chibihash64__rotl(x, 31); return x; } + +#endif // CHIBIHASH64__HGUARD diff --git a/old_versions/chibihash64-v1.h b/old_versions/chibihash64-v1.h new file mode 100644 index 0000000..23c5a60 --- 
/dev/null +++ b/old_versions/chibihash64-v1.h @@ -0,0 +1,75 @@ +// small, fast 64 bit hash function (version 1). +// +// https://github.com/N-R-K/ChibiHash +// https://nrk.neocities.org/articles/chibihash +// +// This is free and unencumbered software released into the public domain. +// For more information, please refer to <https://unlicense.org/> +#pragma once +#include <stdint.h> +#include <stddef.h> + +static inline uint64_t +chibihash64__load64le(const uint8_t *p) +{ + return (uint64_t)p[0] << 0 | (uint64_t)p[1] << 8 | + (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24 | + (uint64_t)p[4] << 32 | (uint64_t)p[5] << 40 | + (uint64_t)p[6] << 48 | (uint64_t)p[7] << 56; +} + +static inline uint64_t +chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) +{ + const uint8_t *k = (const uint8_t *)keyIn; + ptrdiff_t l = len; + + const uint64_t P1 = UINT64_C(0x2B7E151628AED2A5); + const uint64_t P2 = UINT64_C(0x9E3793492EEDC3F7); + const uint64_t P3 = UINT64_C(0x3243F6A8885A308D); + + uint64_t h[4] = { P1, P2, P3, seed }; + + // unrolling gives very slight speed boost on large inputs at the cost + // of larger code size. 
typically not worth the trade off as larger + // code-size hinders inlinability as well + // #pragma GCC unroll 2 + for (; l >= 32; l -= 32) { + for (int i = 0; i < 4; ++i, k += 8) { + uint64_t lane = chibihash64__load64le(k); + h[i] ^= lane; + h[i] *= P1; + h[(i+1)&3] ^= ((lane << 40) | (lane >> 24)); + } + } + + h[0] += ((uint64_t)len << 32) | ((uint64_t)len >> 32); + if (l & 1) { + h[0] ^= k[0]; + --l, ++k; + } + h[0] *= P2; h[0] ^= h[0] >> 31; + + for (int i = 1; l >= 8; l -= 8, k += 8, ++i) { + h[i] ^= chibihash64__load64le(k); + h[i] *= P2; h[i] ^= h[i] >> 31; + } + + for (int i = 0; l > 0; l -= 2, k += 2, ++i) { + h[i] ^= (k[0] | ((uint64_t)k[1] << 8)); + h[i] *= P3; h[i] ^= h[i] >> 31; + } + + uint64_t x = seed; + x ^= h[0] * ((h[2] >> 32)|1); + x ^= h[1] * ((h[3] >> 32)|1); + x ^= h[2] * ((h[0] >> 32)|1); + x ^= h[3] * ((h[1] >> 32)|1); + + // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html + x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653); + x ^= x >> 33; x *= UINT64_C(0x1C69B3F74AC4AE35); + x ^= x >> 27; + + return x; +}