diff --git a/README.md b/README.md index 30adaa8..d342dc5 100644 --- a/README.md +++ b/README.md @@ -9,29 +9,31 @@ Being small and portable, the goal is to be able to use ChibiHash as a good Some key features: -* Small: ~60 loc in C +* Small: ~65 loc in C * Fast: See benchmark table below * Portable: Doesn't use hardware specific instructions (e.g SSE) -* Good Quality: Passes [smhasher][], so should be good quality (I think) +* Good Quality: Passes [smhasher][] and [smhasher3][], so should be good quality (I think) * Unencumbered: Released into the public domain * Free of undefined behavior and gives same result regardless of host system's endianness. * Non-cryptographic -Here's some benchmark against other similar hash functions: +Here's some benchmark (made via smhasher3) against other similar themed hash functions: -| Name | Large input (GiB/sec) | Small input (Cycles/Hash) | -| :--- | :-------------------------: | :------------------------: | -| chibihash64 | **18.08** | 49 | -| xxhash64 | 12.59 | 50 | -| city64 | 14.95 | **35** | -| spooky64 | 13.83 | 59 | +| Name | Large input (GiB/sec) | Small input (Cycles/Hash) | +| :--- | :-------------------------: | :------------------------: | +| chibihash64 | **24.20** | 34 | +| xxhash64 | 15.10 | 50 | +| city64 | 18.30 | 47 | +| spooky64 | 16.68 | 70 | +| rapidhash.protected 1 | 21.50 | **32** | +| polymur-hash 1, 2 | 13.82 | 43 | -It's the fastest of the bunch for large string throughput. -For small string (< 32 bytes), cityhash beats it - worth noting that cityhash -has [hardcoded special cases][city-small] for input below or equal 32 bytes. +1. Requires compiler/cpu support for retrieving the full 128 bit result of a + 64x64 bit multiply. +2. Universal, but has a complicated seeding step. 
[smhasher]: https://github.com/aappleby/smhasher -[city-small]: https://github.com/google/cityhash/blob/f5dc54147fcce12cefd16548c8e760d68ac04226/src/city.cc#L367-L375 +[smhasher3]: https://gitlab.com/fwojcik/smhasher3 ## When NOT to use @@ -47,3 +49,16 @@ Here are some reasons to avoid using this: ## Unofficial ports A list of unofficial ports to other languages is [maintained here](https://github.com/N-R-K/ChibiHash/issues/4). + +## Changelog + +### v2 + +- Faster performance on short string (42 cycles/hash vs 34 cycles/hash). + The tail end handling has been reworked entirely with some inspiration from + wyhash's short input reading. +- Better seeding. v1 seed only affected 64 bits of the initial state. + v2 seed affects the full 256 bits. This allows it to pass smhasher3's + SeedBlockLen and SeedBlockOffset tests. +- Slightly better mixing in bulk handling. +- Passes all 252 tests in smhasher3 (commit 34093a3), v1 failed 3. diff --git a/chibihash64.h b/chibihash64.h index c061c42..45a3181 100644 --- a/chibihash64.h +++ b/chibihash64.h @@ -1,75 +1,82 @@ -// small, fast 64 bit hash function. +#ifndef CHIBIHASH64__HGUARD +#define CHIBIHASH64__HGUARD +// small, fast 64 bit hash function (version 2). // // https://github.com/N-R-K/ChibiHash -// https://nrk.neocities.org/articles/chibihash // // This is free and unencumbered software released into the public domain. 
// For more information, please refer to <https://unlicense.org/> -#pragma once #include <stdint.h> #include <stddef.h> -static inline uint64_t -chibihash64__load64le(const uint8_t *p) +static inline uint64_t chibihash64__load32le(const uint8_t *p) { return (uint64_t)p[0] << 0 | (uint64_t)p[1] << 8 | - (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24 | - (uint64_t)p[4] << 32 | (uint64_t)p[5] << 40 | - (uint64_t)p[6] << 48 | (uint64_t)p[7] << 56; + (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24; +} +static inline uint64_t chibihash64__load64le(const uint8_t *p) +{ + return chibihash64__load32le(p) | (chibihash64__load32le(p+4) << 32); +} +static inline uint64_t chibihash64__rotl(uint64_t x, int n) +{ + return (x << n) | (x >> (-n & 63)); } static inline uint64_t chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) { - const uint8_t *k = (const uint8_t *)keyIn; + const uint8_t *p = (const uint8_t *)keyIn; ptrdiff_t l = len; - const uint64_t P1 = UINT64_C(0x2B7E151628AED2A5); - const uint64_t P2 = UINT64_C(0x9E3793492EEDC3F7); - const uint64_t P3 = UINT64_C(0x3243F6A8885A308D); - - uint64_t h[4] = { P1, P2, P3, seed }; + const uint64_t K = UINT64_C(0x2B7E151628AED2A7); // digits of e + uint64_t seed2 = chibihash64__rotl(seed-K, 15) + chibihash64__rotl(seed-K, 47); + uint64_t h[4] = { seed, seed+K, seed2, seed2+(K*K^K) }; - // unrolling gives very slight speed boost on large inputs at the cost - // of larger code size. typically not worth the trade off as larger - // code-size hinders inlinability as well - // #pragma GCC unroll 2 + // depending on your system unrolling might (or might not) make things + // a tad bit faster on large strings. on my system, it actually makes + // things slower. + // generally speaking, the cost of bigger code size is usually not + // worth the trade-off since larger code-size will hinder inlinability + // but depending on your needs, you may want to uncomment the pragma + // below to unroll the loop. 
+ //#pragma GCC unroll 2 for (; l >= 32; l -= 32) { - for (int i = 0; i < 4; ++i, k += 8) { - uint64_t lane = chibihash64__load64le(k); - h[i] ^= lane; - h[i] *= P1; - h[(i+1)&3] ^= ((lane << 40) | (lane >> 24)); + for (int i = 0; i < 4; ++i, p += 8) { + uint64_t stripe = chibihash64__load64le(p); + h[i] = (stripe + h[i]) * K; + h[(i+1)&3] += chibihash64__rotl(stripe, 27); } } - h[0] += ((uint64_t)len << 32) | ((uint64_t)len >> 32); - if (l & 1) { - h[0] ^= k[0]; - --l, ++k; + for (; l >= 8; l -= 8, p += 8) { + h[0] ^= chibihash64__load32le(p+0); h[0] *= K; + h[1] ^= chibihash64__load32le(p+4); h[1] *= K; } - h[0] *= P2; h[0] ^= h[0] >> 31; - for (int i = 1; l >= 8; l -= 8, k += 8, ++i) { - h[i] ^= chibihash64__load64le(k); - h[i] *= P2; h[i] ^= h[i] >> 31; + if (l >= 4) { + h[2] ^= chibihash64__load32le(p); + h[3] ^= chibihash64__load32le(p + l - 4); + } else if (l > 0) { + h[2] ^= p[0]; + h[3] ^= p[l/2] | ((uint64_t)p[l-1] << 8); } - for (int i = 0; l > 0; l -= 2, k += 2, ++i) { - h[i] ^= (k[0] | ((uint64_t)k[1] << 8)); - h[i] *= P3; h[i] ^= h[i] >> 31; - } + h[0] += chibihash64__rotl(h[2] * K, 31) ^ (h[2] >> 31); + h[1] += chibihash64__rotl(h[3] * K, 31) ^ (h[3] >> 31); + h[0] *= K; h[0] ^= h[0] >> 31; + h[1] += h[0]; - uint64_t x = seed; - x ^= h[0] * ((h[2] >> 32)|1); - x ^= h[1] * ((h[3] >> 32)|1); - x ^= h[2] * ((h[0] >> 32)|1); - x ^= h[3] * ((h[1] >> 32)|1); + uint64_t x = (uint64_t)len * K; + x ^= chibihash64__rotl(x, 29); + x += seed; + x ^= h[1]; - // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html - x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653); - x ^= x >> 33; x *= UINT64_C(0x1C69B3F74AC4AE35); - x ^= x >> 27; + x ^= chibihash64__rotl(x, 15) ^ chibihash64__rotl(x, 42); + x *= K; + x ^= chibihash64__rotl(x, 13) ^ chibihash64__rotl(x, 31); return x; } + +#endif // CHIBIHASH64__HGUARD diff --git a/old_versions/chibihash64-v1.h b/old_versions/chibihash64-v1.h new file mode 100644 index 0000000..23c5a60 --- 
/dev/null +++ b/old_versions/chibihash64-v1.h @@ -0,0 +1,75 @@ +// small, fast 64 bit hash function (version 1). +// +// https://github.com/N-R-K/ChibiHash +// https://nrk.neocities.org/articles/chibihash +// +// This is free and unencumbered software released into the public domain. +// For more information, please refer to <https://unlicense.org/> +#pragma once +#include <stdint.h> +#include <stddef.h> + +static inline uint64_t +chibihash64__load64le(const uint8_t *p) +{ + return (uint64_t)p[0] << 0 | (uint64_t)p[1] << 8 | + (uint64_t)p[2] << 16 | (uint64_t)p[3] << 24 | + (uint64_t)p[4] << 32 | (uint64_t)p[5] << 40 | + (uint64_t)p[6] << 48 | (uint64_t)p[7] << 56; +} + +static inline uint64_t +chibihash64(const void *keyIn, ptrdiff_t len, uint64_t seed) +{ + const uint8_t *k = (const uint8_t *)keyIn; + ptrdiff_t l = len; + + const uint64_t P1 = UINT64_C(0x2B7E151628AED2A5); + const uint64_t P2 = UINT64_C(0x9E3793492EEDC3F7); + const uint64_t P3 = UINT64_C(0x3243F6A8885A308D); + + uint64_t h[4] = { P1, P2, P3, seed }; + + // unrolling gives very slight speed boost on large inputs at the cost + // of larger code size. 
typically not worth the trade off as larger + // code-size hinders inlinability as well + // #pragma GCC unroll 2 + for (; l >= 32; l -= 32) { + for (int i = 0; i < 4; ++i, k += 8) { + uint64_t lane = chibihash64__load64le(k); + h[i] ^= lane; + h[i] *= P1; + h[(i+1)&3] ^= ((lane << 40) | (lane >> 24)); + } + } + + h[0] += ((uint64_t)len << 32) | ((uint64_t)len >> 32); + if (l & 1) { + h[0] ^= k[0]; + --l, ++k; + } + h[0] *= P2; h[0] ^= h[0] >> 31; + + for (int i = 1; l >= 8; l -= 8, k += 8, ++i) { + h[i] ^= chibihash64__load64le(k); + h[i] *= P2; h[i] ^= h[i] >> 31; + } + + for (int i = 0; l > 0; l -= 2, k += 2, ++i) { + h[i] ^= (k[0] | ((uint64_t)k[1] << 8)); + h[i] *= P3; h[i] ^= h[i] >> 31; + } + + uint64_t x = seed; + x ^= h[0] * ((h[2] >> 32)|1); + x ^= h[1] * ((h[3] >> 32)|1); + x ^= h[2] * ((h[0] >> 32)|1); + x ^= h[3] * ((h[1] >> 32)|1); + + // moremur: https://mostlymangling.blogspot.com/2019/12/stronger-better-morer-moremur-better.html + x ^= x >> 27; x *= UINT64_C(0x3C79AC492BA7B653); + x ^= x >> 33; x *= UINT64_C(0x1C69B3F74AC4AE35); + x ^= x >> 27; + + return x; +}