From d417f3cabbf0568c2497fa787a65253fd516911f Mon Sep 17 00:00:00 2001 From: NRK Date: Thu, 21 Nov 2024 21:13:28 +0000 Subject: [PATCH 1/7] v2 (wip) --- README.md | 26 ++++++++++++++++++ chibihash64.h | 73 +++++++++++++++++++++++++-------------------------- 2 files changed, 62 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 30adaa8..b9fdb16 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,29 @@ +# Work in progress v2 branch + +This is a WIP branch for version 2 of ChibiHash. +Things may change at any point without notice. + +Some major improvement compared to v1: + +- Faster performance on short string (49 cycles/hash vs 36 cycles/hash). + The tail end handling has been reworked entirely with some inspiration from + wyhash's short input reading. +- Better seeding. v1 seed only affected 64 bits of the initial state. + v2 seed affects 128 bits. This allows it to pass smhasher3's SeedBlockLen and + SeedBlockOffset tests. +- Slightly better mixing in bulk handling. +- Passes all 252 tests in smhasher3 (commit 34093a3), v1 failed 3. + +Avenue for improvement: + +- Faster bulk handling without using 128 bit multiplication. +- Investigate better/faster compressor for the 256 bit -> 64 bit reduction. +- Drop moremur in favor of using existing multiplier (digits of pi) for the finisher ?? + +--- + +Below is the original v1 readme, unaltered. + # ChibiHash: Small, Fast 64 bit hash function I started writing this because all the 64 bit hash functions I came across were diff --git a/chibihash64.h b/chibihash64.h index c061c42..5fdb2f4 100644 --- a/chibihash64.h +++ b/chibihash64.h @@ -9,62 +9,61 @@ #include #include -static inline uint64_t -chibihash64__load64le(const uint8_t *p) +static inline uint64_t chibihash64__load32le(const uint8_t *p) { return (uint64_t)p[0] << 0 | (uint64_t)p[1] << 8 | - (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24 | - (uint64_t)p[4] << 32 | (uint64_t)p[5] << 40 | - (uint64_t)p[6] << 48 | (uint64_t)p[7] << 56; + (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24; +} +static inline uint64_t chibihash64__load64le(const uint8_t *p) +{ + return chibihash64__load32le(p) | (chibihash64__load32le(p+4) << 32); +} +static inline uint64_t chibihash64__rotl(uint64_t x, int n) +{ + return (x << n) | (x >> (-n & 63)); } static inline uint64_t chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) { - const uint8_t *k = (const uint8_t *)keyIn; + const uint64_t K = UINT64_C(0x3243F6A8885A308D); // digits of pi + const uint8_t *p = (const uint8_t *)keyIn; ptrdiff_t l = len; - const uint64_t P1 = UINT64_C(0x2B7E151628AED2A5); - const uint64_t P2 = UINT64_C(0x9E3793492EEDC3F7); - const uint64_t P3 = UINT64_C(0x3243F6A8885A308D); - - uint64_t h[4] = { P1, P2, P3, seed }; + uint64_t h[4] = { seed, K*K^K, K, chibihash64__rotl(seed-K, 27) }; - // unrolling gives very slight speed boost on large inputs at the cost - // of larger code size. typically not worth the trade off as larger - // code-size hinders inlinability as well + // uncomment for a small speed boost (~3%) on large strings, + // at the cost of bigger code size. 
usually not worth the trade-off + // since larger code-size will hinder inlinability // #pragma GCC unroll 2 for (; l >= 32; l -= 32) { - for (int i = 0; i < 4; ++i, k += 8) { - uint64_t lane = chibihash64__load64le(k); - h[i] ^= lane; - h[i] *= P1; - h[(i+1)&3] ^= ((lane << 40) | (lane >> 24)); + for (int i = 0; i < 4; ++i, p += 8) { + uint64_t stripe = chibihash64__load64le(p); + h[i] = (stripe ^ h[i]) * K; + h[(i+1)&3] ^= chibihash64__rotl(stripe, 39); } + h[1] += h[0]; } - h[0] += ((uint64_t)len << 32) | ((uint64_t)len >> 32); - if (l & 1) { - h[0] ^= k[0]; - --l, ++k; - } - h[0] *= P2; h[0] ^= h[0] >> 31; - - for (int i = 1; l >= 8; l -= 8, k += 8, ++i) { - h[i] ^= chibihash64__load64le(k); - h[i] *= P2; h[i] ^= h[i] >> 31; + for (; l >= 8; l -= 8, p += 8) { + h[0] ^= chibihash64__load32le(p+0); h[0] *= K; + h[1] ^= chibihash64__load32le(p+4); h[1] *= K; } - for (int i = 0; l > 0; l -= 2, k += 2, ++i) { - h[i] ^= (k[0] | ((uint64_t)k[1] << 8)); - h[i] *= P3; h[i] ^= h[i] >> 31; + if (l >= 4) { + h[2] ^= chibihash64__load32le(p); + h[3] ^= chibihash64__load32le(p + l - 4); + } else if (l > 0) { + h[2] ^= p[0]; + h[3] ^= p[l/2]; + h[0] ^= p[l-1]; } - uint64_t x = seed; - x ^= h[0] * ((h[2] >> 32)|1); - x ^= h[1] * ((h[3] >> 32)|1); - x ^= h[2] * ((h[0] >> 32)|1); - x ^= h[3] * ((h[1] >> 32)|1); + uint64_t x = len * K; x ^= chibihash64__rotl(x, 29); + x ^= h[0] * (chibihash64__rotl(h[1], 29) | 1); + x ^= h[1] * (chibihash64__rotl(h[2], 29) | 1); + x ^= h[2] * (chibihash64__rotl(h[3], 29) | 1); + x ^= h[3] * (chibihash64__rotl(h[0], 29) | 1); // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653); From 31e8dffab8a80b75d75a62289b158fbd2981fc7b Mon Sep 17 00:00:00 2001 From: NRK Date: Mon, 25 Nov 2024 02:53:59 +0000 Subject: [PATCH 2/7] update --- README.md | 8 ++++---- chibihash64.h | 25 ++++++++++++++----------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index b9fdb16..a67f90f 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,12 @@ Things may change at any point without notice. Some major improvement compared to v1: -- Faster performance on short string (49 cycles/hash vs 36 cycles/hash). +- Faster performance on short string (49 cycles/hash vs 35 cycles/hash). The tail end handling has been reworked entirely with some inspiration from wyhash's short input reading. - Better seeding. v1 seed only affected 64 bits of the initial state. - v2 seed affects 128 bits. This allows it to pass smhasher3's SeedBlockLen and - SeedBlockOffset tests. + v2 seed affects the full 256 bits. This allows it to pass smhasher3's + SeedBlockLen and SeedBlockOffset tests. - Slightly better mixing in bulk handling. - Passes all 252 tests in smhasher3 (commit 34093a3), v1 failed 3. @@ -18,7 +18,7 @@ Avenue for improvement: - Faster bulk handling without using 128 bit multiplication. - Investigate better/faster compressor for the 256 bit -> 64 bit reduction. -- Drop moremur in favor of using existing multiplier (digits of pi) for the finisher ?? +- Drop moremur in favor of using existing multiplier (digits of e) for the finisher ?? --- diff --git a/chibihash64.h b/chibihash64.h index 5fdb2f4..5400efe 100644 --- a/chibihash64.h +++ b/chibihash64.h @@ -1,4 +1,4 @@ -// small, fast 64 bit hash function. +// small, fast 64 bit hash function (version 2). 
// // https://github.com/N-R-K/ChibiHash // https://nrk.neocities.org/articles/chibihash @@ -22,15 +22,22 @@ static inline uint64_t chibihash64__rotl(uint64_t x, int n) { return (x << n) | (x >> (-n & 63)); } +static inline uint64_t chibihash64__reduce(uint64_t a, uint64_t b) +{ + uint64_t x = a * (chibihash64__rotl(b, 9) | 1); + x ^= chibihash64__rotl(b, 41) + chibihash64__rotl(a, 31); + return x; +} static inline uint64_t chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) { - const uint64_t K = UINT64_C(0x3243F6A8885A308D); // digits of pi const uint8_t *p = (const uint8_t *)keyIn; ptrdiff_t l = len; - uint64_t h[4] = { seed, K*K^K, K, chibihash64__rotl(seed-K, 27) }; + const uint64_t K = UINT64_C(0x2B7E151628AED2A7); // digits of e + uint64_t seed2 = chibihash64__rotl(seed-K, 15) + chibihash64__rotl(seed-K, 47); + uint64_t h[4] = { seed, seed+K, seed2, seed2+(K*K^K) }; // uncomment for a small speed boost (~3%) on large strings, // at the cost of bigger code size. usually not worth the trade-off @@ -40,9 +47,8 @@ chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) for (int i = 0; i < 4; ++i, p += 8) { uint64_t stripe = chibihash64__load64le(p); h[i] = (stripe ^ h[i]) * K; - h[(i+1)&3] ^= chibihash64__rotl(stripe, 39); + h[(i+1)&3] ^= chibihash64__rotl(stripe, 27); } - h[1] += h[0]; } for (; l >= 8; l -= 8, p += 8) { @@ -55,15 +61,12 @@ chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) h[3] ^= chibihash64__load32le(p + l - 4); } else if (l > 0) { h[2] ^= p[0]; - h[3] ^= p[l/2]; - h[0] ^= p[l-1]; + h[3] ^= p[l/2] | ((uint64_t)p[l-1] << 8); } uint64_t x = len * K; x ^= chibihash64__rotl(x, 29); - x ^= h[0] * (chibihash64__rotl(h[1], 29) | 1); - x ^= h[1] * (chibihash64__rotl(h[2], 29) | 1); - x ^= h[2] * (chibihash64__rotl(h[3], 29) | 1); - x ^= h[3] * (chibihash64__rotl(h[0], 29) | 1); + x ^= chibihash64__reduce(h[0], h[1]); + x ^= chibihash64__reduce(h[2], h[3]) + seed; // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653); From 33540aef331124ecf3f53de189e5d9de1b6fd97f Mon Sep 17 00:00:00 2001 From: NRK Date: Mon, 25 Nov 2024 17:39:41 +0000 Subject: [PATCH 3/7] plus, instead of xor slightly better mixing. this also makes unrolling slower on my system. --- chibihash64.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/chibihash64.h b/chibihash64.h index 5400efe..985e59a 100644 --- a/chibihash64.h +++ b/chibihash64.h @@ -39,15 +39,19 @@ chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) uint64_t seed2 = chibihash64__rotl(seed-K, 15) + chibihash64__rotl(seed-K, 47); uint64_t h[4] = { seed, seed+K, seed2, seed2+(K*K^K) }; - // uncomment for a small speed boost (~3%) on large strings, - // at the cost of bigger code size. usually not worth the trade-off - // since larger code-size will hinder inlinability - // #pragma GCC unroll 2 + // depending on your system unrolling might (or might not) make things + // a tad bit faster on large strings. on my system, it actually makes + // things slower. + // generally speaking, the cost of bigger code size is usually not + // worth the trade-off since larger code-size will hinder inlinability + // but depending on your needs, you may want to uncomment the pragma + // below to unroll the loop. 
+ //#pragma GCC unroll 2 for (; l >= 32; l -= 32) { for (int i = 0; i < 4; ++i, p += 8) { uint64_t stripe = chibihash64__load64le(p); - h[i] = (stripe ^ h[i]) * K; - h[(i+1)&3] ^= chibihash64__rotl(stripe, 27); + h[i] = (stripe + h[i]) * K; + h[(i+1)&3] += chibihash64__rotl(stripe, 27); } } @@ -64,7 +68,8 @@ chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) h[3] ^= p[l/2] | ((uint64_t)p[l-1] << 8); } - uint64_t x = len * K; x ^= chibihash64__rotl(x, 29); + uint64_t x = (uint64_t)len * K; + x ^= chibihash64__rotl(x, 29); x ^= chibihash64__reduce(h[0], h[1]); x ^= chibihash64__reduce(h[2], h[3]) + seed; From 55ecfcff372e98e5576a1c4ff7e91a0676e7b6aa Mon Sep 17 00:00:00 2001 From: NRK Date: Fri, 29 Nov 2024 02:22:59 +0000 Subject: [PATCH 4/7] new compressor & finisher --- README.md | 4 +--- chibihash64.h | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index a67f90f..34b7487 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Things may change at any point without notice. Some major improvement compared to v1: -- Faster performance on short string (49 cycles/hash vs 35 cycles/hash). +- Faster performance on short string (49 cycles/hash vs 34 cycles/hash). The tail end handling has been reworked entirely with some inspiration from wyhash's short input reading. - Better seeding. v1 seed only affected 64 bits of the initial state. @@ -17,8 +17,6 @@ Some major improvement compared to v1: Avenue for improvement: - Faster bulk handling without using 128 bit multiplication. -- Investigate better/faster compressor for the 256 bit -> 64 bit reduction. -- Drop moremur in favor of using existing multiplier (digits of e) for the finisher ?? --- diff --git a/chibihash64.h b/chibihash64.h index 985e59a..51f810c 100644 --- a/chibihash64.h +++ b/chibihash64.h @@ -22,12 +22,6 @@ static inline uint64_t chibihash64__rotl(uint64_t x, int n) { return (x << n) | (x >> (-n & 63)); } -static inline uint64_t chibihash64__reduce(uint64_t a, uint64_t b) -{ - uint64_t x = a * (chibihash64__rotl(b, 9) | 1); - x ^= chibihash64__rotl(b, 41) + chibihash64__rotl(a, 31); - return x; -} static inline uint64_t chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) @@ -68,15 +62,19 @@ chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) h[3] ^= p[l/2] | ((uint64_t)p[l-1] << 8); } + h[0] += chibihash64__rotl(h[2] * K, 31) ^ (h[2] >> 31); + h[1] += chibihash64__rotl(h[3] * K, 31) ^ (h[3] >> 31); + h[0] *= K; h[0] ^= h[0] >> 31; + h[1] += h[0]; + uint64_t x = (uint64_t)len * K; x ^= chibihash64__rotl(x, 29); - x ^= chibihash64__reduce(h[0], h[1]); - x ^= chibihash64__reduce(h[2], h[3]) + seed; + x += seed; + x ^= h[1]; - // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html - x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653); - x ^= x >> 33; x *= UINT64_C(0x1C69B3F74AC4AE35); - x ^= x >> 27; + x ^= chibihash64__rotl(x, 15) ^ chibihash64__rotl(x, 42); + x *= K; + x ^= chibihash64__rotl(x, 13) ^ chibihash64__rotl(x, 31); return x; } From e511772d94bc113f8b1ae1a84fc2e0d85f4d29e3 Mon Sep 17 00:00:00 2001 From: NRK Date: Fri, 29 Nov 2024 15:48:10 +0000 Subject: [PATCH 5/7] this and that --- README.md | 58 ++++++++++----------------- chibihash64.h | 1 - old_versions/chibihash64-v1.h | 75 +++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 37 deletions(-) create mode 100644 old_versions/chibihash64-v1.h diff --git a/README.md b/README.md index 34b7487..3a094b0 100644 --- a/README.md +++ 
b/README.md @@ -1,27 +1,3 @@ -# Work in progress v2 branch - -This is a WIP branch for version 2 of ChibiHash. -Things may change at any point without notice. - -Some major improvement compared to v1: - -- Faster performance on short string (49 cycles/hash vs 34 cycles/hash). - The tail end handling has been reworked entirely with some inspiration from - wyhash's short input reading. -- Better seeding. v1 seed only affected 64 bits of the initial state. - v2 seed affects the full 256 bits. This allows it to pass smhasher3's - SeedBlockLen and SeedBlockOffset tests. -- Slightly better mixing in bulk handling. -- Passes all 252 tests in smhasher3 (commit 34093a3), v1 failed 3. - -Avenue for improvement: - -- Faster bulk handling without using 128 bit multiplication. - ---- - -Below is the original v1 readme, unaltered. - # ChibiHash: Small, Fast 64 bit hash function I started writing this because all the 64 bit hash functions I came across were @@ -33,29 +9,26 @@ Being small and portable, the goal is to be able to use ChibiHash as a good Some key features: -* Small: ~60 loc in C +* Small: ~65 loc in C * Fast: See benchmark table below * Portable: Doesn't use hardware specific instructions (e.g SSE) -* Good Quality: Passes [smhasher][], so should be good quality (I think) +* Good Quality: Passes [smhasher][] and [smhasher3][], so should be good quality (I think) * Unencumbered: Released into the public domain * Free of undefined behavior and gives same result regardless of host system's endianness. * Non-cryptographic -Here's some benchmark against other similar hash functions: +Here's some benchmark (made via smhasher3) against other similar hash functions: | Name | Large input (GiB/sec) | Small input (Cycles/Hash) | | :--- | :-------------------------: | :------------------------: | -| chibihash64 | **18.08** | 49 | -| xxhash64 | 12.59 | 50 | -| city64 | 14.95 | **35** | -| spooky64 | 13.83 | 59 | - -It's the fastest of the bunch for large string throughput. -For small string (< 32 bytes), cityhash beats it - worth noting that cityhash -has [hardcoded special cases][city-small] for input below or equal 32 bytes. +| chibihash64 | **24.20** | 34 | +| xxhash64 | 15.10 | 50 | +| city64 | 18.30 | 47 | +| spooky64 | 16.68 | 70 | +| rapidhash.protected | 21.50 | **32** | [smhasher]: https://github.com/aappleby/smhasher -[city-small]: https://github.com/google/cityhash/blob/f5dc54147fcce12cefd16548c8e760d68ac04226/src/city.cc#L367-L375 +[smhasher3]: https://gitlab.com/fwojcik/smhasher3 ## When NOT to use @@ -71,3 +44,16 @@ Here are some reasons to avoid using this: ## Unofficial ports A list of unofficial ports to other languages is [maintained here](https://github.com/N-R-K/ChibiHash/issues/4). + +## Changelog + +### v2 + +- Faster performance on short string (42 cycles/hash vs 34 cycles/hash). + The tail end handling has been reworked entirely with some inspiration from + wyhash's short input reading. +- Better seeding. v1 seed only affected 64 bits of the initial state. + v2 seed affects the full 256 bits. This allows it to pass smhasher3's + SeedBlockLen and SeedBlockOffset tests. +- Slightly better mixing in bulk handling. +- Passes all 252 tests in smhasher3 (commit 34093a3), v1 failed 3. diff --git a/chibihash64.h b/chibihash64.h index 51f810c..a0cf96b 100644 --- a/chibihash64.h +++ b/chibihash64.h @@ -1,7 +1,6 @@ // small, fast 64 bit hash function (version 2). 
// // https://github.com/N-R-K/ChibiHash -// https://nrk.neocities.org/articles/chibihash // // This is free and unencumbered software released into the public domain. // For more information, please refer to diff --git a/old_versions/chibihash64-v1.h b/old_versions/chibihash64-v1.h new file mode 100644 index 0000000..23c5a60 --- /dev/null +++ b/old_versions/chibihash64-v1.h @@ -0,0 +1,75 @@ +// small, fast 64 bit hash function (version 1). +// +// https://github.com/N-R-K/ChibiHash +// https://nrk.neocities.org/articles/chibihash +// +// This is free and unencumbered software released into the public domain. +// For more information, please refer to +#pragma once +#include +#include + +static inline uint64_t +chibihash64__load64le(const uint8_t *p) +{ + return (uint64_t)p[0] << 0 | (uint64_t)p[1] << 8 | + (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24 | + (uint64_t)p[4] << 32 | (uint64_t)p[5] << 40 | + (uint64_t)p[6] << 48 | (uint64_t)p[7] << 56; +} + +static inline uint64_t +chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) +{ + const uint8_t *k = (const uint8_t *)keyIn; + ptrdiff_t l = len; + + const uint64_t P1 = UINT64_C(0x2B7E151628AED2A5); + const uint64_t P2 = UINT64_C(0x9E3793492EEDC3F7); + const uint64_t P3 = UINT64_C(0x3243F6A8885A308D); + + uint64_t h[4] = { P1, P2, P3, seed }; + + // unrolling gives very slight speed boost on large inputs at the cost + // of larger code size. typically not worth the trade off as larger + // code-size hinders inlinability as well + // #pragma GCC unroll 2 + for (; l >= 32; l -= 32) { + for (int i = 0; i < 4; ++i, k += 8) { + uint64_t lane = chibihash64__load64le(k); + h[i] ^= lane; + h[i] *= P1; + h[(i+1)&3] ^= ((lane << 40) | (lane >> 24)); + } + } + + h[0] += ((uint64_t)len << 32) | ((uint64_t)len >> 32); + if (l & 1) { + h[0] ^= k[0]; + --l, ++k; + } + h[0] *= P2; h[0] ^= h[0] >> 31; + + for (int i = 1; l >= 8; l -= 8, k += 8, ++i) { + h[i] ^= chibihash64__load64le(k); + h[i] *= P2; h[i] ^= h[i] >> 31; + } + + for (int i = 0; l > 0; l -= 2, k += 2, ++i) { + h[i] ^= (k[0] | ((uint64_t)k[1] << 8)); + h[i] *= P3; h[i] ^= h[i] >> 31; + } + + uint64_t x = seed; + x ^= h[0] * ((h[2] >> 32)|1); + x ^= h[1] * ((h[3] >> 32)|1); + x ^= h[2] * ((h[0] >> 32)|1); + x ^= h[3] * ((h[1] >> 32)|1); + + // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html + x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653); + x ^= x >> 33; x *= UINT64_C(0x1C69B3F74AC4AE35); + x ^= x >> 27; + + return x; +} From 080f8b33ce70d1872e64b4ca313222c63fa587f5 Mon Sep 17 00:00:00 2001 From: NRK Date: Fri, 29 Nov 2024 15:53:51 +0000 Subject: [PATCH 6/7] portable header guard --- chibihash64.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/chibihash64.h b/chibihash64.h index a0cf96b..45a3181 100644 --- a/chibihash64.h +++ b/chibihash64.h @@ -1,10 +1,11 @@ +#ifndef CHIBIHASH64__HGUARD +#define CHIBIHASH64__HGUARD // small, fast 64 bit hash function (version 2). // // https://github.com/N-R-K/ChibiHash // // This is free and unencumbered software released into the public domain. 
// For more information, please refer to <http://unlicense.org/>
-#pragma once
#include <stdint.h>
#include <stddef.h>

@@ -77,3 +78,5 @@ chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed)

	return x;
}
+
+#endif // CHIBIHASH64__HGUARD

From 0ae6a3cc36f40ba1073d29c91496caa6510dc2ef Mon Sep 17 00:00:00 2001
From: NRK
Date: Fri, 29 Nov 2024 16:11:43 +0000
Subject: [PATCH 7/7] add polymur to benchmarks, add some footnotes

---
 README.md | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 3a094b0..d342dc5 100644
--- a/README.md
+++ b/README.md
@@ -17,15 +17,20 @@ Some key features:
* Free of undefined behavior and gives same result regardless of host system's endianness.
* Non-cryptographic

-Here's some benchmark (made via smhasher3) against other similar hash functions:
-
-| Name | Large input (GiB/sec) | Small input (Cycles/Hash) |
-| :--- | :-------------------------: | :------------------------: |
-| chibihash64 | **24.20** | 34 |
-| xxhash64 | 15.10 | 50 |
-| city64 | 18.30 | 47 |
-| spooky64 | 16.68 | 70 |
-| rapidhash.protected | 21.50 | **32** |
+Here's some benchmark (made via smhasher3) against other similar themed hash functions:
+
+| Name | Large input (GiB/sec) | Small input (Cycles/Hash) |
+| :--- | :-------------------------: | :------------------------: |
+| chibihash64 | **24.20** | 34 |
+| xxhash64 | 15.10 | 50 |
+| city64 | 18.30 | 47 |
+| spooky64 | 16.68 | 70 |
+| rapidhash.protected [1] | 21.50 | **32** |
+| polymur-hash [1][2] | 13.82 | 43 |
+
+1. Requires compiler/cpu support for retrieving the full 128 bit result of a
+   64x64 bit multiply.
+2. Universal, but has a complicated seeding step.

[smhasher]: https://github.com/aappleby/smhasher
[smhasher3]: https://gitlab.com/fwojcik/smhasher3
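
---

For quick reference, below is a minimal usage sketch of the v2 API as it stands at the end of this series. Only the `chibihash64(ptr, len, seed)` signature and the `chibihash64.h` header come from the patches above; the message, the seed value, and the output formatting are arbitrary illustrative choices.

```c
// Hypothetical example: hash a string with ChibiHash64 v2 and print the result.
// Assumes chibihash64.h from this patch series is on the include path.
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h>
#include "chibihash64.h"

int main(void)
{
	const char *msg = "hello world";
	uint64_t seed = UINT64_C(0xC0FFEE);  // arbitrary example seed
	uint64_t h = chibihash64(msg, (ptrdiff_t)strlen(msg), seed);
	printf("chibihash64 = %016llx\n", (unsigned long long)h);
	return 0;
}
```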