diff --git a/README.md b/README.md
index 30adaa8..d342dc5 100644
--- a/README.md
+++ b/README.md
@@ -9,29 +9,31 @@ Being small and portable, the goal is to be able to use ChibiHash as a good
Some key features:
-* Small: ~60 loc in C
+* Small: ~65 loc in C
* Fast: See benchmark table below
* Portable: Doesn't use hardware specific instructions (e.g SSE)
-* Good Quality: Passes [smhasher][], so should be good quality (I think)
+* Good Quality: Passes [smhasher][] and [smhasher3][], so should be good quality (I think)
* Unencumbered: Released into the public domain
* Free of undefined behavior and gives same result regardless of host system's endianness.
* Non-cryptographic
-Here's some benchmark against other similar hash functions:
+Here are some benchmarks (made via smhasher3) against other similarly themed hash functions:
-| Name | Large input (GiB/sec) | Small input (Cycles/Hash) |
-| :--- | :-------------------------: | :------------------------: |
-| chibihash64 | **18.08** | 49 |
-| xxhash64 | 12.59 | 50 |
-| city64 | 14.95 | **35** |
-| spooky64 | 13.83 | 59 |
+| Name | Large input (GiB/sec) | Small input (Cycles/Hash) |
+| :--- | :-------------------------: | :------------------------: |
+| chibihash64 | **24.20** | 34 |
+| xxhash64 | 15.10 | 50 |
+| city64 | 18.30 | 47 |
+| spooky64 | 16.68 | 70 |
+| rapidhash.protected 1 | 21.50 | **32** |
+| polymur-hash 1, 2 | 13.82 | 43 |
-It's the fastest of the bunch for large string throughput.
-For small string (< 32 bytes), cityhash beats it - worth noting that cityhash
-has [hardcoded special cases][city-small] for input below or equal 32 bytes.
+1. Requires compiler/cpu support for retrieving the full 128 bit result of a
+ 64x64 bit multiply.
+2. Universal, but has a complicated seeding step.
[smhasher]: https://github.com/aappleby/smhasher
-[city-small]: https://github.com/google/cityhash/blob/f5dc54147fcce12cefd16548c8e760d68ac04226/src/city.cc#L367-L375
+[smhasher3]: https://gitlab.com/fwojcik/smhasher3
## When NOT to use
@@ -47,3 +49,16 @@ Here are some reasons to avoid using this:
## Unofficial ports
A list of unofficial ports to other languages is [maintained here](https://github.com/N-R-K/ChibiHash/issues/4).
+
+## Changelog
+
+### v2
+
+- Faster performance on short strings (v1: 42 cycles/hash, v2: 34 cycles/hash).
+ The tail end handling has been reworked entirely with some inspiration from
+ wyhash's short input reading.
+- Better seeding. v1 seed only affected 64 bits of the initial state.
+ v2 seed affects the full 256 bits. This allows it to pass smhasher3's
+ SeedBlockLen and SeedBlockOffset tests.
+- Slightly better mixing in bulk handling.
+- Passes all 252 tests in smhasher3 (commit 34093a3), v1 failed 3.
diff --git a/chibihash64.h b/chibihash64.h
index c061c42..45a3181 100644
--- a/chibihash64.h
+++ b/chibihash64.h
@@ -1,75 +1,82 @@
-// small, fast 64 bit hash function.
+#ifndef CHIBIHASH64__HGUARD
+#define CHIBIHASH64__HGUARD
+// small, fast 64 bit hash function (version 2).
//
// https://github.com/N-R-K/ChibiHash
-// https://nrk.neocities.org/articles/chibihash
//
// This is free and unencumbered software released into the public domain.
 // For more information, please refer to <https://unlicense.org/>
-#pragma once
 #include <stdint.h>
 #include <stddef.h>
-static inline uint64_t
-chibihash64__load64le(const uint8_t *p)
+static inline uint64_t chibihash64__load32le(const uint8_t *p)
{
return (uint64_t)p[0] << 0 | (uint64_t)p[1] << 8 |
- (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24 |
- (uint64_t)p[4] << 32 | (uint64_t)p[5] << 40 |
- (uint64_t)p[6] << 48 | (uint64_t)p[7] << 56;
+ (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24;
+}
+static inline uint64_t chibihash64__load64le(const uint8_t *p)
+{
+ return chibihash64__load32le(p) | (chibihash64__load32le(p+4) << 32);
+}
+static inline uint64_t chibihash64__rotl(uint64_t x, int n)
+{
+ return (x << n) | (x >> (-n & 63));
}
static inline uint64_t
chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed)
{
- const uint8_t *k = (const uint8_t *)keyIn;
+ const uint8_t *p = (const uint8_t *)keyIn;
ptrdiff_t l = len;
- const uint64_t P1 = UINT64_C(0x2B7E151628AED2A5);
- const uint64_t P2 = UINT64_C(0x9E3793492EEDC3F7);
- const uint64_t P3 = UINT64_C(0x3243F6A8885A308D);
-
- uint64_t h[4] = { P1, P2, P3, seed };
+ const uint64_t K = UINT64_C(0x2B7E151628AED2A7); // digits of e
+ uint64_t seed2 = chibihash64__rotl(seed-K, 15) + chibihash64__rotl(seed-K, 47);
+ uint64_t h[4] = { seed, seed+K, seed2, seed2+(K*K^K) };
- // unrolling gives very slight speed boost on large inputs at the cost
- // of larger code size. typically not worth the trade off as larger
- // code-size hinders inlinability as well
- // #pragma GCC unroll 2
+ // depending on your system unrolling might (or might not) make things
+ // a tad bit faster on large strings. on my system, it actually makes
+ // things slower.
+ // generally speaking, the cost of bigger code size is usually not
+ // worth the trade-off since larger code-size will hinder inlinability
+ // but depending on your needs, you may want to uncomment the pragma
+ // below to unroll the loop.
+ //#pragma GCC unroll 2
for (; l >= 32; l -= 32) {
- for (int i = 0; i < 4; ++i, k += 8) {
- uint64_t lane = chibihash64__load64le(k);
- h[i] ^= lane;
- h[i] *= P1;
- h[(i+1)&3] ^= ((lane << 40) | (lane >> 24));
+ for (int i = 0; i < 4; ++i, p += 8) {
+ uint64_t stripe = chibihash64__load64le(p);
+ h[i] = (stripe + h[i]) * K;
+ h[(i+1)&3] += chibihash64__rotl(stripe, 27);
}
}
- h[0] += ((uint64_t)len << 32) | ((uint64_t)len >> 32);
- if (l & 1) {
- h[0] ^= k[0];
- --l, ++k;
+ for (; l >= 8; l -= 8, p += 8) {
+ h[0] ^= chibihash64__load32le(p+0); h[0] *= K;
+ h[1] ^= chibihash64__load32le(p+4); h[1] *= K;
}
- h[0] *= P2; h[0] ^= h[0] >> 31;
- for (int i = 1; l >= 8; l -= 8, k += 8, ++i) {
- h[i] ^= chibihash64__load64le(k);
- h[i] *= P2; h[i] ^= h[i] >> 31;
+ if (l >= 4) {
+ h[2] ^= chibihash64__load32le(p);
+ h[3] ^= chibihash64__load32le(p + l - 4);
+ } else if (l > 0) {
+ h[2] ^= p[0];
+ h[3] ^= p[l/2] | ((uint64_t)p[l-1] << 8);
}
- for (int i = 0; l > 0; l -= 2, k += 2, ++i) {
- h[i] ^= (k[0] | ((uint64_t)k[1] << 8));
- h[i] *= P3; h[i] ^= h[i] >> 31;
- }
+ h[0] += chibihash64__rotl(h[2] * K, 31) ^ (h[2] >> 31);
+ h[1] += chibihash64__rotl(h[3] * K, 31) ^ (h[3] >> 31);
+ h[0] *= K; h[0] ^= h[0] >> 31;
+ h[1] += h[0];
- uint64_t x = seed;
- x ^= h[0] * ((h[2] >> 32)|1);
- x ^= h[1] * ((h[3] >> 32)|1);
- x ^= h[2] * ((h[0] >> 32)|1);
- x ^= h[3] * ((h[1] >> 32)|1);
+ uint64_t x = (uint64_t)len * K;
+ x ^= chibihash64__rotl(x, 29);
+ x += seed;
+ x ^= h[1];
- // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html
- x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653);
- x ^= x >> 33; x *= UINT64_C(0x1C69B3F74AC4AE35);
- x ^= x >> 27;
+ x ^= chibihash64__rotl(x, 15) ^ chibihash64__rotl(x, 42);
+ x *= K;
+ x ^= chibihash64__rotl(x, 13) ^ chibihash64__rotl(x, 31);
return x;
}
+
+#endif // CHIBIHASH64__HGUARD
diff --git a/old_versions/chibihash64-v1.h b/old_versions/chibihash64-v1.h
new file mode 100644
index 0000000..23c5a60
--- /dev/null
+++ b/old_versions/chibihash64-v1.h
@@ -0,0 +1,75 @@
+// small, fast 64 bit hash function (version 1).
+//
+// https://github.com/N-R-K/ChibiHash
+// https://nrk.neocities.org/articles/chibihash
+//
+// This is free and unencumbered software released into the public domain.
+// For more information, please refer to <https://unlicense.org/>
+#pragma once
+#include <stdint.h>
+#include <stddef.h>
+
+static inline uint64_t
+chibihash64__load64le(const uint8_t *p)
+{
+ return (uint64_t)p[0] << 0 | (uint64_t)p[1] << 8 |
+ (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24 |
+ (uint64_t)p[4] << 32 | (uint64_t)p[5] << 40 |
+ (uint64_t)p[6] << 48 | (uint64_t)p[7] << 56;
+}
+
+static inline uint64_t
+chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed)
+{
+ const uint8_t *k = (const uint8_t *)keyIn;
+ ptrdiff_t l = len;
+
+ const uint64_t P1 = UINT64_C(0x2B7E151628AED2A5);
+ const uint64_t P2 = UINT64_C(0x9E3793492EEDC3F7);
+ const uint64_t P3 = UINT64_C(0x3243F6A8885A308D);
+
+ uint64_t h[4] = { P1, P2, P3, seed };
+
+ // unrolling gives very slight speed boost on large inputs at the cost
+ // of larger code size. typically not worth the trade off as larger
+ // code-size hinders inlinability as well
+ // #pragma GCC unroll 2
+ for (; l >= 32; l -= 32) {
+ for (int i = 0; i < 4; ++i, k += 8) {
+ uint64_t lane = chibihash64__load64le(k);
+ h[i] ^= lane;
+ h[i] *= P1;
+ h[(i+1)&3] ^= ((lane << 40) | (lane >> 24));
+ }
+ }
+
+ h[0] += ((uint64_t)len << 32) | ((uint64_t)len >> 32);
+ if (l & 1) {
+ h[0] ^= k[0];
+ --l, ++k;
+ }
+ h[0] *= P2; h[0] ^= h[0] >> 31;
+
+ for (int i = 1; l >= 8; l -= 8, k += 8, ++i) {
+ h[i] ^= chibihash64__load64le(k);
+ h[i] *= P2; h[i] ^= h[i] >> 31;
+ }
+
+ for (int i = 0; l > 0; l -= 2, k += 2, ++i) {
+ h[i] ^= (k[0] | ((uint64_t)k[1] << 8));
+ h[i] *= P3; h[i] ^= h[i] >> 31;
+ }
+
+ uint64_t x = seed;
+ x ^= h[0] * ((h[2] >> 32)|1);
+ x ^= h[1] * ((h[3] >> 32)|1);
+ x ^= h[2] * ((h[0] >> 32)|1);
+ x ^= h[3] * ((h[1] >> 32)|1);
+
+ // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html
+ x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653);
+ x ^= x >> 33; x *= UINT64_C(0x1C69B3F74AC4AE35);
+ x ^= x >> 27;
+
+ return x;
+}