From 3a64443f3923a91840d72500e7b74437e283eba7 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Tue, 19 Mar 2024 20:06:22 -0400
Subject: [PATCH 01/49] trimming some unnecessary code

---
 src/fallback/implementation.cpp | 13 +++++++++++--
 src/scalar/base64.h             | 16 ----------------
 src/tables/base64_tables.h      | 16 ----------------
 3 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/src/fallback/implementation.cpp b/src/fallback/implementation.cpp
index 21d846103..8bf24a1fc 100644
--- a/src/fallback/implementation.cpp
+++ b/src/fallback/implementation.cpp
@@ -350,7 +350,16 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::base64_to_binary(input, length, output);
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
@@ -358,7 +367,7 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
 }
 
 size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::binary_to_base64(input, length, output);
+  return scalar::base64::tail_encode_base64(output, input, length);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index 821f1ea2f..ff7368314 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -147,26 +147,10 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
   return  actual_length / 4 * 3 + (actual_length %4)  - 1;
 }
 
-simdutf_warn_unused simdutf_really_inline result base64_to_binary(const char * input, size_t length, char* output) noexcept {
-  if(length > 0 && input[length - 1] == '=') {
-    length -= 1;
-    if(length > 0 && input[length - 1] == '=') {
-      length -= 1;
-    }
-  }
-  if(length == 0) {
-    return {SUCCESS, 0};
-  }
-  return base64_tail_decode(output, input, length);
-}
-
 simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
   return (length + 2)/3 * 4; // We use padding to make the length a multiple of 4.
 }
 
-simdutf_really_inline size_t binary_to_base64(const char * input, size_t length, char* output) noexcept {
-  return tail_encode_base64(output, input, length);
-}
 } // namespace base64
 } // unnamed namespace
 } // namespace scalar
diff --git a/src/tables/base64_tables.h b/src/tables/base64_tables.h
index 8c4ed3066..a0f997733 100644
--- a/src/tables/base64_tables.h
+++ b/src/tables/base64_tables.h
@@ -68,22 +68,6 @@ const char e2[256] = {
     'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
     '/'};
 
-const int8_t decoding_table[256] = {
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, -2, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, 62, -1, 62, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60,
-    61, -1, -1, -1, -1, -1, -1, -1, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10,
-    11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1,
-    63, -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
-    43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1};
-
 /* SPECIAL DECODE TABLES FOR LITTLE ENDIAN CPUS */
 
 const uint32_t d0[256] = {

From 3f9cb0f8ffb457cc65e88340915e5ece1ec0b451 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 20 Mar 2024 16:29:32 -0400
Subject: [PATCH 02/49] fixing missing rvv implementation

---
 src/rvv/implementation.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/rvv/implementation.cpp b/src/rvv/implementation.cpp
index 815f28c81..63f1283c1 100644
--- a/src/rvv/implementation.cpp
+++ b/src/rvv/implementation.cpp
@@ -83,7 +83,16 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::base64_to_binary(input, length, output);
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
@@ -91,7 +100,7 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
 }
 
 size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::binary_to_base64(input, length, output);
+  return scalar::base64::tail_encode_base64(output, input, length);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf

From a9ea1c6feb0a8c1807e21dea959184f8c4796590 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 21 Mar 2024 20:16:11 -0400
Subject: [PATCH 03/49] completing the base64 implementation.

---
 README.md                             |  51 ++++++++++-
 include/simdutf/error.h               |   1 +
 include/simdutf/implementation.h      | 120 ++++++++++++++++++++++++--
 src/arm64/arm_base64.cpp              |  16 +++-
 src/arm64/implementation.cpp          |   8 ++
 src/fallback/implementation.cpp       |  18 ++++
 src/haswell/implementation.cpp        |   8 ++
 src/icelake/implementation.cpp        |   9 ++
 src/implementation.cpp                |  62 +++++++++++++
 src/ppc64/implementation.cpp          |  17 ++++
 src/rvv/implementation.cpp            |  18 ++++
 src/scalar/base64.h                   | 119 +++++++++++++++++++++++--
 src/simdutf.cpp                       |   3 +-
 src/simdutf/arm64/implementation.h    |   2 +
 src/simdutf/fallback/implementation.h |   2 +
 src/simdutf/haswell/implementation.h  |   2 +
 src/simdutf/icelake/implementation.h  |   2 +
 src/simdutf/ppc64/implementation.h    |   2 +
 src/simdutf/rvv/implementation.h      |   2 +
 src/simdutf/westmere/implementation.h |   2 +
 src/westmere/implementation.cpp       |   8 ++
 21 files changed, 451 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index d18baab52..147842460 100644
--- a/README.md
+++ b/README.md
@@ -1583,7 +1583,9 @@ we prune spaces, we may need to adjust the result size afterword.
 std::vector<char> buffer(simdutf::maximal_binary_length_from_base64(base64.data(), base64.size()));
 simdutf::result r = simdutf::base64_to_binary(base64.data(), base64.size(), buffer.data());
 if(r.error) {
-  // We have some error, r.count tells you where the error was encountered in the input
+  // We have some error, r.count tells you where the error was encountered in the input if
+  // the error is INVALID_BASE64_CHARACTER. If the error is BASE64_INPUT_REMAINDER, then
+  // a single valid base64 remained, and r.count contains the number of bytes decoded.
 } else {
   buffer.resize(r.count); // resize the buffer according to actual number of bytes
 }
@@ -1604,10 +1606,21 @@ The specification of our base64 functions is as follows:
  *
  * @param input         the base64 input to process
  * @param length        the length of the base64 input in bytes
- * @return number of base64 bytes
+ * @return maximal number of binary bytes
  */
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
 
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input         the base64 input to process in UTF-16 (native endianess)
+ * @param length        the length of the base64 input in 16-bit units
+ * @return maximal number of binary bytes
+ */
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept;
+
 /**
  * Convert a base64 input to a binary ouput.
  *
@@ -1618,10 +1631,14 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * See https://infra.spec.whatwg.org/#forgiving-base64-decode
  *
  * This function will fail in case of invalid input. There are two possible reasons for
- * failure: the input is contains a number of base64 characters that when divided by 4, leaves
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
  * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
  * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
  * If you fail to provide that much space, the function may cause a buffer overflow.
  *
@@ -1653,6 +1670,34 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  */
 size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
 
+/**
+ * Convert a base64 input to a binary ouput.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are two possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
+ * If you fail to provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input         the base64 string to process in UTF-16 (native endianess)
+ * @param length        the length of the string in 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result utf16_base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+
 ```
 
 
diff --git a/include/simdutf/error.h b/include/simdutf/error.h
index 0090ff1d6..a65303ce3 100644
--- a/include/simdutf/error.h
+++ b/include/simdutf/error.h
@@ -16,6 +16,7 @@ enum error_code {
                 // there must be no surrogate at all (Latin1)
   INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid base64 string.
   BASE64_INPUT_REMAINDER, // The base64 input terminates with a single character, excluding padding (=).
+  OUTPUT_BUFFER_TOO_SMALL, // The provided buffer is too small.
   OTHER         // Not related to validation/transcoding.
 };
 
diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index 27cb6027b..aafa9ff97 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1380,7 +1380,6 @@ simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t le
  */
 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length);
 
-
 /**
  * Provide the maximal binary length in bytes given the base64 input.
  * In general, if the input contains ASCII spaces, the result will be less than
@@ -1388,10 +1387,21 @@ simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t leng
  *
  * @param input         the base64 input to process
  * @param length        the length of the base64 input in bytes
- * @return number of base64 bytes
+ * @return maximal number of binary bytes
  */
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
 
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input         the base64 input to process, in ASCII stored as 16-bit units
+ * @param length        the length of the base64 input in 16-bit units
+ * @return maximal number of binary bytes
+ */
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept;
+
 /**
  * Convert a base64 input to a binary ouput.
  *
@@ -1402,10 +1412,14 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * See https://infra.spec.whatwg.org/#forgiving-base64-decode
  *
  * This function will fail in case of invalid input. There are two possible reasons for
- * failure: the input is contains a number of base64 characters that when divided by 4, leaves
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
  * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
  * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
  * If you fail to provide that much space, the function may cause a buffer overflow.
  *
@@ -1437,6 +1451,67 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  */
 size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
 
+/**
+ * Convert a base64 input to a binary ouput.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are two possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
+ * If you fail to provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 16-bit units
+ * @param length        the length of the string in 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+
+/**
+ * Convert a base64 input to a binary ouput.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are three possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a singler remainder character (BASE64_INPUT_REMAINDER), the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * When the error is OUTPUT_BUFFER_TOO_SMALL, then r.count contains the location in the input
+ * where we stopped decoding.
+ *
+ * In all case, the outlen parameter is modified to contain the number of bytes
+ * that have been written/decoded.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how mnay bytes were written.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
+
 /**
  * An implementation of simdutf for a particular CPU architecture.
  *
@@ -2504,10 +2579,21 @@ class implementation {
    *
    * @param input         the base64 input to process
    * @param length        the length of the base64 input in bytes
-   * @return number of base64 bytes
+   * @return maximal number of binary bytes
    */
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept = 0;
 
+  /**
+   * Provide the maximal binary length in bytes given the base64 input.
+   * In general, if the input contains ASCII spaces, the result will be less than
+   * the maximum length.
+   *
+   * @param input         the base64 input to process, in ASCII stored as 16-bit units
+   * @param length        the length of the base64 input in 16-bit units
+   * @return maximal number of binary bytes
+   */
+  simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept = 0;
+
   /**
    * Convert a base64 input to a binary ouput.
    *
@@ -2518,7 +2604,7 @@ class implementation {
    * See https://infra.spec.whatwg.org/#forgiving-base64-decode
    *
    * This function will fail in case of invalid input. There are two possible reasons for
-   * failure: the input is contains a number of base64 characters that when divided by 4, leaves
+   * failure: the input contains a number of base64 characters that when divided by 4, leaves
    * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
    * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
    *
@@ -2532,6 +2618,30 @@ class implementation {
    */
   simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept = 0;
 
+  /**
+   * Convert a base64 input to a binary ouput.
+   *
+   * This function follows the WHATWG forgiving-base64 format, which means that it will
+   * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+   * equal signs at the end) or an unpadded input (without any equal signs at the end).
+   *
+   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+   *
+   * This function will fail in case of invalid input. There are two possible reasons for
+   * failure: the input contains a number of base64 characters that when divided by 4, leaves
+   * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+   * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+   *
+   * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
+   * If you fail to provide that much space, the function may cause a buffer overflow.
+   *
+   * @param input         the base64 string to process, in ASCII stored as 16-bit units
+   * @param length        the length of the string in 16-bit units
+   * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+   */
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept = 0;
+
   /**
    * Provide the base64 length in bytes given the length of a binary input.
    *
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 2113a2cec..565b83746 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -210,6 +210,13 @@ void load_block(block64 *b, const char *src) {
   b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
 }
 
+void load_block(block64 *b, const char16_t *src) {
+  b->chunks[0] = vld2q_u8(reinterpret_cast<const uint8_t *>(src)).val[0];
+  b->chunks[1] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 16).val[0];
+  b->chunks[2] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 32).val[0];
+  b->chunks[3] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 48).val[0];
+}
+
 // decode 64 bytes and output 48 bytes
 void base64_decode_block(char *out, const char *src) {
   uint8x16x4_t str = vld4q_u8((uint8_t *)src);
@@ -222,7 +229,8 @@ void base64_decode_block(char *out, const char *src) {
   vst3q_u8((uint8_t *)out, outvec);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <typename char_type>
+result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -232,15 +240,15 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
       equalsigns = 2;
     }
   }
-  const char *const srcinit = src;
+  const char_type *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const char_type *const srcend = src + srclen;
 
   constexpr size_t block_size = 10;
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const char_type *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
diff --git a/src/arm64/implementation.cpp b/src/arm64/implementation.cpp
index ff02797f8..f8d6a566a 100644
--- a/src/arm64/implementation.cpp
+++ b/src/arm64/implementation.cpp
@@ -843,6 +843,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return compress_decode_base64(output, input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/fallback/implementation.cpp b/src/fallback/implementation.cpp
index 8bf24a1fc..c469dbbef 100644
--- a/src/fallback/implementation.cpp
+++ b/src/fallback/implementation.cpp
@@ -362,6 +362,24 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return scalar::base64::base64_tail_decode(output, input, length);
 }
 
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 78d00a6ab..733f83b62 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -786,6 +786,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return compress_decode_base64(output, input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index 035c77a50..dae4f0dfd 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1372,6 +1372,15 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return compress_decode_base64(output, input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/implementation.cpp b/src/implementation.cpp
index bd76c4075..253cf52d9 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -31,6 +31,8 @@ std::string toBinaryString(T b) {
 
 #include "scalar/utf8.h"
 #include "scalar/utf16.h"
+#include "scalar/utf32.h"
+#include "scalar/base64.h"
 
 namespace simdutf {
 bool implementation::supported_by_runtime_system() const {
@@ -460,6 +462,14 @@ class detect_best_supported_implementation_on_first_use final : public implement
     return set_best()->base64_to_binary(input, length, output);
   }
 
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept override {
+    return set_best()->maximal_binary_length_from_base64(input, length);
+  }
+
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output);
+  }
+
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept override {
     return set_best()->base64_length_from_binary(length);
   }
@@ -816,6 +826,15 @@ class unsupported_implementation final : public implementation {
     return result(error_code::OTHER, 0);
   }
 
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused result base64_to_binary(const char16_t *, size_t, char*) const noexcept override {
+    return result(error_code::OTHER, 0);
+  }
+
+
   simdutf_warn_unused size_t base64_length_from_binary(size_t) const noexcept override {
     return 0;
   }
@@ -1274,6 +1293,49 @@ simdutf_warn_unused result base64_to_binary(const char * input, size_t length, c
   return get_default_implementation()->base64_to_binary(input, length, output);
 }
 
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept {
+  return get_default_implementation()->maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output);
+}
+
+template <typename chartype>
+simdutf_warn_unused result base64_to_binary_safe(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
+  static_assert(std::is_same_v<chartype, char> || std::is_same_v<chartype, char16_t>, "Only char and char16_t are supported.");
+  // The implementation could be nicer, but we expect that most times, the user
+  // will provide us with a buffer that is large enough.
+  size_t max_length = maximal_binary_length_from_base64(input, length);
+  if(outlen >= max_length) {
+    return base64_to_binary(input, length, output);
+  }
+  // The output buffer is maybe too small. We will decode a truncated version of the input.
+  size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
+  size_t safe_input = base64_length_from_binary(outlen3);
+  result r = base64_to_binary(input, safe_input, output);
+  if(r.error == error_code::INVALID_BASE64_CHARACTER) { return r; }
+  size_t offset = (r.error == error_code::BASE64_INPUT_REMAINDER) ? 1 :
+    ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1);
+  size_t output_index = r.count - (r.count % 3);
+  size_t input_index = safe_input;
+  while(offset > 0) {
+    char c = input[--input_index];
+    if(c == '=' || c == '\n' || c == '\r' || c == '\t' || c == ' ') {
+      offset--;
+    }
+  }
+  size_t remaining_out = outlen - output_index;
+  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, input + input_index, length - input_index);
+  outlen = output_index + remaining_out;
+  if(r.error == error_code::INVALID_BASE64_CHARACTER) {
+    r.count += input_index;
+  } else {
+    r.count = output_index;
+  }
+  return r;
+}
+
 simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
   return get_default_implementation()->base64_length_from_binary(length);
 }
diff --git a/src/ppc64/implementation.cpp b/src/ppc64/implementation.cpp
index 8390e01a3..161ae19d9 100644
--- a/src/ppc64/implementation.cpp
+++ b/src/ppc64/implementation.cpp
@@ -299,6 +299,23 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
   return scalar::base64::base64_to_binary(input, length, output);
 }
 
diff --git a/src/rvv/implementation.cpp b/src/rvv/implementation.cpp
index 63f1283c1..7dda20c8a 100644
--- a/src/rvv/implementation.cpp
+++ b/src/rvv/implementation.cpp
@@ -95,6 +95,24 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return scalar::base64::base64_tail_decode(output, input, length);
 }
 
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index ff7368314..ec2002618 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -9,12 +9,12 @@ namespace scalar {
 namespace {
 namespace base64 {
 
-// Returns true upon success. The destination buffer must be large enough and is
-// incremented by the number of bytes written and src is incremented by the number of bytes read.
+// Returns true upon success. The destination buffer must be large enough.
 // This functions assumes that the padding (=) has been removed.
-result base64_tail_decode(char *dst, const char *src, size_t length) {
-  const char *srcend = src + length;
-  const char *srcinit = src;
+template <class char_type>
+result base64_tail_decode(char *dst, const char_type *src, size_t length) {
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
   const char *dstinit = dst;
 
   uint32_t x;
@@ -34,7 +34,7 @@ result base64_tail_decode(char *dst, const char *src, size_t length) {
     idx = 0;
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
-      char c = *src;
+      char_type c = *src;
       uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
@@ -92,6 +92,108 @@ result base64_tail_decode(char *dst, const char *src, size_t length) {
   }
 }
 
+// like base64_tail_decode, but it will not write past the end of the ouput buffer.
+// outlen is modified to reflect the number of bytes written.
+template <class char_type>
+result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length) {
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
+  const char *dstinit = dst;
+  const char *dstend = dst + outlen;
+
+  uint32_t x;
+  size_t idx;
+  uint8_t buffer[4];
+  while (true) {
+    while (src + 4 <= srcend &&
+           (x = tables::base64::d0[uint8_t(src[0])] | tables::base64::d1[uint8_t(src[1])] |
+                tables::base64::d2[uint8_t(src[2])] | tables::base64::d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+      if(match_system(endianness::BIG)) {
+        x = scalar::utf32::swap_bytes(x);
+      }
+      if(dst + 3 > dstend) {
+        outlen = size_t(dst - dstinit);
+        return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+      }
+      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
+      dst += 3;
+      src += 4;
+    }
+    idx = 0;
+    // we need at least four characters.
+    while (idx < 4 && src < srcend) {
+      char c = *src;
+      uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
+      buffer[idx] = uint8_t(code);
+      if (code <= 63) {
+        idx++;
+      } else if (code > 64) {
+        outlen = size_t(dst - dstinit);
+        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      }
+      src++;
+    }
+    if (idx != 4) {
+      if (idx == 2) {
+        if(dst == dstend) {
+          outlen = size_t(dst - dstinit);
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+        }
+        uint32_t triple =
+            (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
+        if(match_system(endianness::BIG)) {
+          triple <<= 8;
+          std::memcpy(dst, &triple, 1);
+        } else {
+          triple = scalar::utf32::swap_bytes(triple);
+          triple >>= 8;
+          std::memcpy(dst, &triple, 1);
+        }
+        dst += 1;
+
+      } else if (idx == 3) {
+        if(dst + 2 >= dstend) {
+          outlen = size_t(dst - dstinit);
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+        }
+        uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
+                          (uint32_t(buffer[1]) << 2 * 6) +
+                          (uint32_t(buffer[2]) << 1 * 6);
+        if(match_system(endianness::BIG)) {
+          triple <<= 8;
+          std::memcpy(dst, &triple, 2);
+        } else {
+          triple = scalar::utf32::swap_bytes(triple);
+          triple >>= 8;
+          std::memcpy(dst, &triple, 2);
+        }
+        dst += 2;
+      } else if (idx == 1) {
+        outlen = size_t(dst - dstinit);
+        return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)};
+      }
+      outlen = size_t(dst - dstinit);
+      return {SUCCESS, size_t(dst - dstinit)};
+    }
+    if(dst + 3 >= dstend) {
+      outlen = size_t(dst - dstinit);
+      return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+    }
+    uint32_t triple =
+        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
+        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
+    if(match_system(endianness::BIG)) {
+      triple <<= 8;
+      std::memcpy(dst, &triple, 3);
+    } else {
+      triple = scalar::utf32::swap_bytes(triple);
+      triple >>= 8;
+      std::memcpy(dst, &triple, 3);
+    }
+    dst += 3;
+  }
+}
+
 // Returns the number of bytes written. The destination buffer must be large
 // enough. It will add padding (=) if needed.
 size_t tail_encode_base64(char *dst, const char *src, size_t srclen) {
@@ -128,7 +230,8 @@ size_t tail_encode_base64(char *dst, const char *src, size_t srclen) {
   return (size_t)(out - dst);
 }
 
-simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept {
+template <class char_type>
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char_type * input, size_t length) noexcept {
   // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode
   size_t padding = 0;
   if(length > 0) {
@@ -140,7 +243,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
     }
   }
   size_t actual_length = length - padding;
-  if(actual_length % 4 == 0) {
+  if(actual_length % 4 <= 1) {
     return actual_length / 4 * 3;
   }
   // if we have a valid input, then the remainder must be 2 or 3 adding one or two extra bytes.
diff --git a/src/simdutf.cpp b/src/simdutf.cpp
index 26ca712dd..fa889290c 100644
--- a/src/simdutf.cpp
+++ b/src/simdutf.cpp
@@ -1,10 +1,11 @@
 #include "simdutf.h"
+// We include base64_tables once.
+#include "tables/base64_tables.h"
 #include "implementation.cpp"
 #include "encoding_types.cpp"
 #include "error.cpp"
 // The large tables should be included once and they
 // should not depend on a kernel.
-#include "tables/base64_tables.h"
 #include "tables/utf8_to_utf16_tables.h"
 #include "tables/utf16_to_utf8_tables.h"
 // End of tables.
diff --git a/src/simdutf/arm64/implementation.h b/src/simdutf/arm64/implementation.h
index b686be9fe..5e0d89ace 100644
--- a/src/simdutf/arm64/implementation.h
+++ b/src/simdutf/arm64/implementation.h
@@ -91,6 +91,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/fallback/implementation.h b/src/simdutf/fallback/implementation.h
index 14d14cb42..c8dfc2037 100644
--- a/src/simdutf/fallback/implementation.h
+++ b/src/simdutf/fallback/implementation.h
@@ -94,6 +94,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/haswell/implementation.h b/src/simdutf/haswell/implementation.h
index c75e4a5e7..79969941b 100644
--- a/src/simdutf/haswell/implementation.h
+++ b/src/simdutf/haswell/implementation.h
@@ -93,6 +93,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/icelake/implementation.h b/src/simdutf/icelake/implementation.h
index 175b34040..4638bf9b9 100644
--- a/src/simdutf/icelake/implementation.h
+++ b/src/simdutf/icelake/implementation.h
@@ -93,6 +93,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/ppc64/implementation.h b/src/simdutf/ppc64/implementation.h
index f1df43a4c..7fd324493 100644
--- a/src/simdutf/ppc64/implementation.h
+++ b/src/simdutf/ppc64/implementation.h
@@ -71,6 +71,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/rvv/implementation.h b/src/simdutf/rvv/implementation.h
index f95dcf2ab..56f02362d 100644
--- a/src/simdutf/rvv/implementation.h
+++ b/src/simdutf/rvv/implementation.h
@@ -95,6 +95,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, size_t len) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 private:
diff --git a/src/simdutf/westmere/implementation.h b/src/simdutf/westmere/implementation.h
index 4d992a49b..190693783 100644
--- a/src/simdutf/westmere/implementation.h
+++ b/src/simdutf/westmere/implementation.h
@@ -91,6 +91,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index d428e0084..a491818c1 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -787,6 +787,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return compress_decode_base64(output, input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }

From 0f49240f9d3a2395c9d90529a94a49894db3b2e7 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 21 Mar 2024 22:26:13 -0400
Subject: [PATCH 04/49] adding ppc64

---
 .github/workflows/aarch64.yml |  2 +-
 .github/workflows/ppc64le.yml | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ppc64le.yml

diff --git a/.github/workflows/aarch64.yml b/.github/workflows/aarch64.yml
index a94eb8eed..b54e2afa0 100644
--- a/.github/workflows/aarch64.yml
+++ b/.github/workflows/aarch64.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: uraimo/run-on-arch-action@v2
+      - uses: uraimo/run-on-arch-action@v4
         name: Test
         id: runcmd
         with:
diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml
new file mode 100644
index 000000000..c0c773928
--- /dev/null
+++ b/.github/workflows/ppc64le.yml
@@ -0,0 +1,28 @@
+name: Ubuntu aarch64 (GCC 11)
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: uraimo/run-on-arch-action@v4
+        name: Test
+        id: runcmd
+        with:
+          arch: ppc64le
+          githubToken: ${{ github.token }}
+          distro: ubuntu_latest
+          install: |
+            apt-get update -q -y
+            apt-get install -y cmake make g++
+          run: |
+            cmake -DCMAKE_BUILD_TYPE=Release -B build
+            cmake --build build -j=2

From 5daa520cb5537dcd57bd4b46f01dcb54dbaacebc Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Thu, 21 Mar 2024 23:03:50 -0400
Subject: [PATCH 05/49] saving

---
 src/haswell/avx2_base64.cpp        |  3 ++-
 src/icelake/icelake_base64.inl.cpp | 13 +++++++++----
 src/implementation.cpp             |  3 ++-
 src/westmere/sse_base64.cpp        |  3 ++-
 4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 870d36f6f..c1151d174 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -315,7 +315,8 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 24, buffer, 24);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 74ea110a4..e09b117fd 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -106,6 +106,10 @@ static inline void load_block(block64 *b, const char *src) {
   b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
 }
 
+static inline void load_block(block64 *b, const char16_t *src) {
+  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+}shit
+
 static inline void base64_decode(char *out, __m512i str) {
   const __m512i merge_ab_and_bc =
       _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
@@ -130,7 +134,8 @@ static inline void base64_decode_block(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -140,16 +145,16 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
       equalsigns = 2;
     }
   }
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   // figure out why block_size == 2 is sometimes best???
   constexpr size_t block_size = 6;
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
diff --git a/src/implementation.cpp b/src/implementation.cpp
index 253cf52d9..b266e1d26 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -1,6 +1,7 @@
 #include "simdutf.h"
 #include <initializer_list>
 #include <climits>
+#include <type_traits>
 
 // Useful for debugging purposes
 namespace simdutf {
@@ -1303,7 +1304,7 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
 
 template <typename chartype>
 simdutf_warn_unused result base64_to_binary_safe(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
-  static_assert(std::is_same_v<chartype, char> || std::is_same_v<chartype, char16_t>, "Only char and char16_t are supported.");
+  static_assert(std::is_same<chartype, char>::value || std::is_same<chartype, char16_t>::value, "Only char and char16_t are supported.");
   // The implementation could be nicer, but we expect that most times, the user
   // will provide us with a buffer that is large enough.
   size_t max_length = maximal_binary_length_from_base64(input, length);
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index f2f4d7211..7ef6fd4b5 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -323,7 +323,8 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 36, buffer, 12);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;

From 151aa0920add2b3b24d4f7f267bbcf794012b7d8 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 21 Mar 2024 23:07:14 -0400
Subject: [PATCH 06/49] saturated.

---
 src/arm64/arm_base64.cpp          | 14 ++++++++++----
 src/simdutf/arm64/simd16-inl.h    |  2 +-
 src/simdutf/haswell/simd16-inl.h  |  2 +-
 src/simdutf/westmere/simd16-inl.h |  2 +-
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 565b83746..239176d97 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -210,11 +210,17 @@ void load_block(block64 *b, const char *src) {
   b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
 }
 
+inline uint8x16_t load_satured(const uint16_t * data) {
+    uint16x8_t in1 = vld1q_u16(data);
+    uint16x8_t in2 = vld1q_u16(data+8);
+    return vqmovn_high_u16(vqmovn_u16(in1), in2);
+}
+
 void load_block(block64 *b, const char16_t *src) {
-  b->chunks[0] = vld2q_u8(reinterpret_cast<const uint8_t *>(src)).val[0];
-  b->chunks[1] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 16).val[0];
-  b->chunks[2] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 32).val[0];
-  b->chunks[3] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 48).val[0];
+  b->chunks[0] = load_satured(reinterpret_cast<const uint16_t *>(src));
+  b->chunks[1] = load_satured(reinterpret_cast<const uint16_t *>(src) + 16);
+  b->chunks[2] = load_satured(reinterpret_cast<const uint16_t *>(src) + 32);
+  b->chunks[3] = load_satured(reinterpret_cast<const uint16_t *>(src) + 48);
 }
 
 // decode 64 bytes and output 48 bytes
diff --git a/src/simdutf/arm64/simd16-inl.h b/src/simdutf/arm64/simd16-inl.h
index 66d1168b7..32734c0ab 100644
--- a/src/simdutf/arm64/simd16-inl.h
+++ b/src/simdutf/arm64/simd16-inl.h
@@ -156,7 +156,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
   simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
   simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
 
-  // Pack with the unsigned saturation  two uint16_t code units into single uint8_t vector
+  // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     return vqmovn_high_u16(vqmovn_u16(v0), v1);
   }
diff --git a/src/simdutf/haswell/simd16-inl.h b/src/simdutf/haswell/simd16-inl.h
index 04c1b7fe0..964ff4ebd 100644
--- a/src/simdutf/haswell/simd16-inl.h
+++ b/src/simdutf/haswell/simd16-inl.h
@@ -140,7 +140,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
     return _mm256_shuffle_epi8(*this, swap);
   }
 
-  // Pack with the unsigned saturation two uint16_t code units into single uint8_t vector
+  // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
     //       we have to shuffle lanes in order to produce bytes in the
diff --git a/src/simdutf/westmere/simd16-inl.h b/src/simdutf/westmere/simd16-inl.h
index bbcca0776..694d93d22 100644
--- a/src/simdutf/westmere/simd16-inl.h
+++ b/src/simdutf/westmere/simd16-inl.h
@@ -146,7 +146,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
     return _mm_shuffle_epi8(*this, swap);
   }
 
-  // Pack with the unsigned saturation  two uint16_t code units into single uint8_t vector
+  // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     return _mm_packus_epi16(v0, v1);
   }

From b917aa8f46718635fe72cfea4f17ce6b69c46742 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 22 Mar 2024 00:26:52 -0400
Subject: [PATCH 07/49] finishing...

---
 README.md                          | 46 ++++++++++++++++++++++++++++--
 src/haswell/avx2_base64.cpp        | 19 ++++++++++--
 src/icelake/icelake_base64.inl.cpp |  7 +++--
 src/westmere/sse_base64.cpp        | 21 ++++++++++++--
 4 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 147842460..244aaf0dc 100644
--- a/README.md
+++ b/README.md
@@ -1591,11 +1591,19 @@ if(r.error) {
 }
 ```
 
+In some instances, you may want to limit the size of the output further when decoding base64.
+For this purpose, you may use the `base64_to_binary_safe` functions.
+
+In other instances, you may receive your base64 inputs in 16-bit units (e.g., from UTF-16 strings):
+we have function overloads for these cases as well.
+
 Some users may want to decode the base64 inputs in chunks, especially when doing
 file or networking programming. These users should see `tools/fastbase64.cpp`, a command-line
 utility designed for as an example. It reads and writes base64 files using chunks of at most
 a few tens of kilobytes.
 
+
+
 The specification of our base64 functions is as follows:
 
 ```C++
@@ -1615,7 +1623,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * In general, if the input contains ASCII spaces, the result will be less than
  * the maximum length.
  *
- * @param input         the base64 input to process in UTF-16 (native endianess)
+ * @param input         the base64 input to process in 16-bit units
  * @param length        the length of the base64 input in 16-bit units
  * @return maximal number of binary bytes
  */
@@ -1696,7 +1704,41 @@ size_t binary_to_base64(const char * input, size_t length, char* output) noexcep
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result utf16_base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+
+
+/**
+ * Convert a base64 input to a binary ouput.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are three possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a singler remainder character (BASE64_INPUT_REMAINDER), the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * When the error is OUTPUT_BUFFER_TOO_SMALL, then r.count contains the location in the input
+ * where we stopped decoding.
+ *
+ * In all case, the outlen parameter is modified to contain the number of bytes
+ * that have been written/decoded.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how mnay bytes were written.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
 
 ```
 
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index c1151d174..6eed08481 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -276,6 +276,19 @@ static inline void load_block(block64 *b, const char *src) {
       _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
 }
 
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+  __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
+  __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+  __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
+  __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
+  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x13);
+  __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
+  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x13);
+  b->chunks[0] = _mm256_packus_epi16(m1p, m2p);
+  b->chunks[1] = _mm256_packus_epi16(m3p, m4p);
+}
+
 static inline void base64_decode(char *out, __m256i str) {
   // credit: aqrit
   const __m256i pack_shuffle =
@@ -329,16 +342,16 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   char *end_of_safe_64byte_zone =
       (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
 
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   constexpr size_t block_size = 6;
   static_assert(block_size >= 2, "block_size must be at least two");
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index e09b117fd..20399ef9b 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -107,8 +107,11 @@ static inline void load_block(block64 *b, const char *src) {
 }
 
 static inline void load_block(block64 *b, const char16_t *src) {
-  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
-}shit
+  __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 64));
+  __m512i p = _mm512_packus_epi16(m1, m2);
+  b->chunks[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
+}
 
 static inline void base64_decode(char *out, __m512i str) {
   const __m512i merge_ab_and_bc =
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index 7ef6fd4b5..ef57f5184 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -274,6 +274,21 @@ static inline void load_block(block64 *b, const char *src) {
   b->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
 }
 
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m128i m1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+  __m128i m2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 8));
+  __m128i m3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
+  __m128i m4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 24));
+  __m128i m5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
+  __m128i m6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 40));
+  __m128i m7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
+  __m128i m8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 56));
+  b->chunks[0] = _mm_packus_epi16(m1, m2);
+  b->chunks[1] = _mm_packus_epi16(m3, m4);
+  b->chunks[2] = _mm_packus_epi16(m5, m6);
+  b->chunks[3] = _mm_packus_epi16(m7, m8);
+}
+
 static inline void base64_decode(char *out, __m128i str) {
   // credit: aqrit
 
@@ -337,16 +352,16 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   char *end_of_safe_64byte_zone =
       (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
 
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   constexpr size_t block_size = 6;
   static_assert(block_size >= 2, "block should of size 2 or more");
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);

From ca17560fe21967261ed111e447d8e41706a61250 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Wed, 27 Mar 2024 00:42:47 -0400
Subject: [PATCH 08/49] various fixes

---
 README.md                          |  12 +-
 include/simdutf/implementation.h   |  11 +-
 src/haswell/avx2_base64.cpp        |   4 +-
 src/icelake/icelake_base64.inl.cpp |   2 +-
 src/implementation.cpp             |  37 +-
 src/scalar/base64.h                |  15 +-
 tests/base64_tests.cpp             | 622 ++++++++++++++++++++++++-----
 tests/helpers/test.h               |  13 +-
 8 files changed, 581 insertions(+), 135 deletions(-)

diff --git a/README.md b/README.md
index 244aaf0dc..1216404a6 100644
--- a/README.md
+++ b/README.md
@@ -1592,7 +1592,9 @@ if(r.error) {
 ```
 
 In some instances, you may want to limit the size of the output further when decoding base64.
-For this purpose, you may use the `base64_to_binary_safe` functions.
+For this purpose, you may use the `base64_to_binary_safe` functions. The functions may also
+be useful if you seek to decode the input into segments having a maximal capacity.
+See our function specifications for more details.
 
 In other instances, you may receive your base64 inputs in 16-bit units (e.g., from UTF-16 strings):
 we have function overloads for these cases as well.
@@ -1602,8 +1604,6 @@ file or networking programming. These users should see `tools/fastbase64.cpp`, a
 utility designed for as an example. It reads and writes base64 files using chunks of at most
 a few tens of kilobytes.
 
-
-
 The specification of our base64 functions is as follows:
 
 ```C++
@@ -1732,10 +1732,10 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  * that have been written/decoded.
  *
  * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
- * @param length        the length of the string in 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
- * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how mnay bytes were written.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful. Note that the return convention of base64_to_binary_safe differs from base64_to_binary.
  */
 simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
 simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index aafa9ff97..fbf20f8fc 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1497,17 +1497,12 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
  * r.count contains the number of bytes decoded.
  *
- * When the error is OUTPUT_BUFFER_TOO_SMALL, then r.count contains the location in the input
- * where we stopped decoding.
- *
- * In all case, the outlen parameter is modified to contain the number of bytes
- * that have been written/decoded.
  *
  * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
- * @param length        the length of the string in 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
- * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how mnay bytes were written.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful.
  */
 simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
 simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 6eed08481..1f222b3b8 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -282,9 +282,9 @@ static inline void load_block(block64 *b, const char16_t *src) {
   __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
   __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
   __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
-  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x13);
+  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
   __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
-  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x13);
+  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
   b->chunks[0] = _mm256_packus_epi16(m1p, m2p);
   b->chunks[1] = _mm256_packus_epi16(m3p, m4p);
 }
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 20399ef9b..a7ff0c091 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -108,7 +108,7 @@ static inline void load_block(block64 *b, const char *src) {
 
 static inline void load_block(block64 *b, const char16_t *src) {
   __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
-  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 64));
+  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
   __m512i p = _mm512_packus_epi16(m1, m2);
   b->chunks[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
 }
diff --git a/src/implementation.cpp b/src/implementation.cpp
index b266e1d26..e964ce565 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -1303,13 +1303,16 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
 }
 
 template <typename chartype>
-simdutf_warn_unused result base64_to_binary_safe(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
+simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
   static_assert(std::is_same<chartype, char>::value || std::is_same<chartype, char16_t>::value, "Only char and char16_t are supported.");
   // The implementation could be nicer, but we expect that most times, the user
   // will provide us with a buffer that is large enough.
   size_t max_length = maximal_binary_length_from_base64(input, length);
   if(outlen >= max_length) {
-    return base64_to_binary(input, length, output);
+    // fast path
+    result r = base64_to_binary(input, length, output);
+    if(r.error != error_code::INVALID_BASE64_CHARACTER) { outlen = r.count; r.count = length; }
+    return r;
   }
   // The output buffer is maybe too small. We will decode a truncated version of the input.
   size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
@@ -1320,23 +1323,37 @@ simdutf_warn_unused result base64_to_binary_safe(const chartype * input, size_t
     ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1);
   size_t output_index = r.count - (r.count % 3);
   size_t input_index = safe_input;
-  while(offset > 0) {
-    char c = input[--input_index];
+  while(offset > 0 && input_index > 0) {
+    chartype c = input[--input_index];
     if(c == '=' || c == '\n' || c == '\r' || c == '\t' || c == ' ') {
+      // skipping
+    } else {
       offset--;
     }
   }
   size_t remaining_out = outlen - output_index;
-  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, input + input_index, length - input_index);
-  outlen = output_index + remaining_out;
-  if(r.error == error_code::INVALID_BASE64_CHARACTER) {
-    r.count += input_index;
-  } else {
-    r.count = output_index;
+  const chartype * tail_input = input + input_index;
+  size_t tail_length = length - input_index;
+  if(tail_length > 0 && tail_input[tail_length - 1] == '=') {
+    tail_length--;
+    if(tail_length > 0 && tail_input[tail_length - 1] == '=') {
+      tail_length--;
+    }
   }
+  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, tail_input, tail_length);
+  outlen = output_index + remaining_out;
+  r.count += input_index;
   return r;
 }
 
+
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept {
+  return base64_to_binary_safe_impl<char>(input, length, output, outlen);
+}
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept {
+  return base64_to_binary_safe_impl<char16_t>(input, length, output, outlen);
+}
+
 simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
   return get_default_implementation()->base64_length_from_binary(length);
 }
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index ec2002618..7a19087fe 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -41,6 +41,8 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
         idx++;
       } else if (code > 64) {
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
       }
       src++;
     }
@@ -94,6 +96,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
 
 // like base64_tail_decode, but it will not write past the end of the ouput buffer.
 // outlen is modified to reflect the number of bytes written.
+// This functions assumes that the padding (=) has been removed.
 template <class char_type>
 result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length) {
   const char_type *srcend = src + length;
@@ -120,9 +123,11 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       src += 4;
     }
     idx = 0;
+    const char_type *srccur = src;
+
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
-      char c = *src;
+      char_type c = *src;
       uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
@@ -130,6 +135,8 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       } else if (code > 64) {
         outlen = size_t(dst - dstinit);
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
       }
       src++;
     }
@@ -137,7 +144,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       if (idx == 2) {
         if(dst == dstend) {
           outlen = size_t(dst - dstinit);
-          return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
         }
         uint32_t triple =
             (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
@@ -154,7 +161,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       } else if (idx == 3) {
         if(dst + 2 >= dstend) {
           outlen = size_t(dst - dstinit);
-          return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
         }
         uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
                           (uint32_t(buffer[1]) << 2 * 6) +
@@ -177,7 +184,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
     }
     if(dst + 3 >= dstend) {
       outlen = size_t(dst - dstinit);
-      return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
     }
     uint32_t triple =
         (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index 6263a4c8a..48b7b9e6c 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -11,35 +11,164 @@
 using random_generator = std::mt19937;
 static random_generator::result_type seed = 42;
 
+const uint8_t to_base64_value[] = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62,  255,
+    255, 255, 63,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+
+template <typename char_type>
+size_t add_space(std::vector<char_type> &v, std::mt19937 &gen) {
+  const static std::array<char_type, 4> space = {' ', '\t', '\n', '\r'};
+  int padding = 0;
+  if (v.size() > 0 && v[v.size() - 1] == '=') {
+    padding++;
+    if (v.size() > 0 && v[v.size() - 1] == '=') {
+      padding++;
+    }
+  }
+  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
+  size_t i = index_dist(gen);
+  std::uniform_int_distribution<int> char_dist(0, 3);
+  v.insert(v.begin() + i, space[char_dist(gen)]);
+  return i;
+}
+
+template <typename char_type>
+size_t add_garbage(std::vector<char_type> &v, std::mt19937 &gen) {
+  int padding = 0;
+  if (v.size() > 0 && v[v.size() - 1] == '=') {
+    padding++;
+    if (v.size() > 0 && v[v.size() - 1] == '=') {
+      padding++;
+    }
+  }
+  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
+  size_t i = index_dist(gen);
+  std::uniform_int_distribution<int> char_dist(
+      0, (1 << (sizeof(char_type) * 8)) - 1);
+  uint8_t c = char_dist(gen);
+  while (uint8_t(c) == c && to_base64_value[uint8_t(c)] != 255) {
+    c = char_dist(gen);
+  }
+  v.insert(v.begin() + i, c);
+  return i;
+}
+
 TEST(decode_base64_cases) {
   std::vector<std::vector<char>> cases = {{0x53, 0x53}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
 
-  for(size_t i = 0; i < cases.size(); i++) {
-    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(cases[i].data(), cases[i].size()));
-    simdutf::result r = implementation.base64_to_binary(cases[i].data(), cases[i].size(), buffer.data());
-    ASSERT_EQUAL(r.error,codes[i]);
+  for (size_t i = 0; i < cases.size(); i++) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        cases[i].data(), cases[i].size()));
+    simdutf::result r = implementation.base64_to_binary(
+        cases[i].data(), cases[i].size(), buffer.data());
+    ASSERT_EQUAL(r.error, codes[i]);
     ASSERT_EQUAL(r.count, counts[i]);
   }
 }
 
 TEST(encode_base64_cases) {
-  std::vector<std::pair<std::string,std::string>> cases = {
-    {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
-    {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
-    {"123456", "MTIzNDU2"},
-    {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"}};
+  std::vector<std::pair<std::string, std::string>> cases = {
+      {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", "MTIzNDU2"},
+      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
-
-  for(std::pair<std::string,std::string> p : cases) {
-    std::vector<char> buffer(implementation.base64_length_from_binary(p.first.size()));
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(
+        implementation.base64_length_from_binary(p.first.size()));
     ASSERT_EQUAL(buffer.size(), p.second.size());
-    size_t s = implementation.binary_to_base64(p.first.data(),p.first.size(), buffer.data());
+    size_t s = implementation.binary_to_base64(p.first.data(), p.first.size(),
+                                               buffer.data());
     ASSERT_EQUAL(s, p.second.size());
     ASSERT_TRUE(std::string(buffer.data(), buffer.size()) == p.second);
   }
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data());
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" --  ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
+TEST(encode_base64_cases_16) {
+  std::vector<std::pair<std::string, std::u16string>> cases = {
+      {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", u"R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", u"MTIzNDU2"},
+      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"}};
+  std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
+  std::vector<size_t> counts = {1};
+  printf(" -- ");
+
+  for (std::pair<std::string, std::u16string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data());
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" -- ");
+  for (std::pair<std::string, std::u16string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
 }
 
 TEST(roundtrip_base64) {
@@ -61,16 +190,17 @@ TEST(roundtrip_base64) {
           implementation.base64_to_binary(buffer.data(), size, back.data());
       ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
       ASSERT_EQUAL(r.count, len);
-      if(back != source) {
+      if (back != source) {
         printf("=====input size %zu\n", len);
-        for(size_t i = 0; i < len; i++) {
-          if(back[i] != source[i]) {
-            std::cerr << "Mismatch at position " << i << " trial " << trial << std::endl;
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
           }
           printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
         }
         printf("=====base64 size %zu\n", size);
-        for(size_t i = 0; i < size; i++) {
+        for (size_t i = 0; i < size; i++) {
           printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
         }
       }
@@ -79,59 +209,49 @@ TEST(roundtrip_base64) {
   }
 }
 
-const uint8_t to_base64_value[] = {
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62,  255,
-    255, 255, 63,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
-    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
-    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
-    25,  255, 255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  32,  33,
-    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
-    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255};
-
-size_t add_space(std::vector<char> &v, std::mt19937 &gen) {
-  const static std::array<char, 4> space = {' ', '\t', '\n', '\r'};
-  int padding = 0;
-  if (v.size() > 0 && v[v.size() - 1] == '=') {
-    padding++;
-    if (v.size() > 0 && v[v.size() - 1] == '=') {
-      padding++;
-    }
-  }
-  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
-  size_t i = index_dist(gen);
-  std::uniform_int_distribution<int> char_dist(0, 3);
-  v.insert(v.begin() + i, space[char_dist(gen)]);
-  return i;
-}
+TEST(roundtrip_base64_16) {
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
 
-size_t add_garbage(std::vector<char> &v, std::mt19937 &gen) {
-  int padding = 0;
-  if (v.size() > 0 && v[v.size() - 1] == '=') {
-    padding++;
-    if (v.size() > 0 && v[v.size() - 1] == '=') {
-      padding++;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      buffer.resize(size);
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer16.data(), size, back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) {
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
     }
   }
-  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
-  size_t i = index_dist(gen);
-  std::uniform_int_distribution<int> char_dist(0, 255);
-  uint8_t c = char_dist(gen);
-  while(to_base64_value[uint8_t(c)] != 255) {
-    c = char_dist(gen);
-  }
-  v.insert(v.begin() + i, c);
-  return i;
 }
 
 TEST(doomed_base64_roundtrip) {
@@ -151,8 +271,13 @@ TEST(doomed_base64_roundtrip) {
       size_t location = add_garbage(buffer, gen);
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::INVALID_BASE64_CHARACTER);
+      ASSERT_EQUAL(r.count, location);
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
       ASSERT_EQUAL(r.error, simdutf::error_code::INVALID_BASE64_CHARACTER);
       ASSERT_EQUAL(r.count, location);
     }
@@ -175,10 +300,49 @@ TEST(doomed_truncated_base64_roundtrip) {
       buffer.resize(size - 3);
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, (size - 4) / 4 * 3);
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, buffer.size());
+    }
+  }
+}
+
+TEST(doomed_truncated_base64_roundtrip_16) {
+  for (size_t len = 1; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      buffer.resize(size - 3);
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+          buffer16.data(), buffer16.size()));
       simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+          buffer16.data(), buffer16.size(), back.data());
       ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
-      ASSERT_EQUAL(r.count, (size-4)/4*3);
+      ASSERT_EQUAL(r.count, (size - 4) / 4 * 3);
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer16.data(), buffer16.size(),
+                                         back.data(), back_length);
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, buffer16.size());
     }
   }
 }
@@ -200,21 +364,278 @@ TEST(roundtrip_base64_with_spaces) {
       for (size_t i = 0; i < 5; i++) {
         add_space(buffer, gen);
       }
-
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
       ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
 
       back.resize(
           r.count); // resize the buffer according to actual number of bytes
       ASSERT_EQUAL(r.count, len);
       ASSERT_TRUE(back == source);
+      back.resize(back.capacity());
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
+
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+
+      back.resize(
+          back_length); // resize the buffer according to actual number of bytes
+      ASSERT_EQUAL(r.count, buffer.size());
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
+TEST(roundtrip_base64_16_with_spaces) {
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      buffer.resize(size);
+      for (size_t i = 0; i < 5; i++) {
+        add_space(buffer, gen);
+      }
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r = implementation.base64_to_binary(
+          buffer16.data(), buffer16.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) {
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64) {
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+            buffer.data(), buffer.size()));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit();
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Now let us decode the rest !!!
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(r.count);
+        ASSERT_EQUAL(second_length + limited_length, len);
+
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
     }
   }
 }
 
+TEST(aborted_safe_roundtrip_base64_16) {
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      std::vector<char16_t> buffer16;
+
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::vector<char> back(len);
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        buffer16.resize(buffer.size());
+        for (size_t i = 0; i < buffer.size(); i++) {
+          buffer16[i] = buffer[i];
+        }
+        ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit();
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Now let us decode the rest !!!
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(r.count);
+        ASSERT_EQUAL(second_length + limited_length, len);
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64_with_spaces) {
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        for (size_t i = 0; i < 5; i++) {
+          add_space(buffer, gen);
+        }
+        std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+            buffer.data(), buffer.size()));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit();
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Now let us decode the rest !!!
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(r.count);
+        ASSERT_EQUAL(second_length + limited_length, len);
+
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64_16_with_spaces) {
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      std::vector<char16_t> buffer16;
+
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::vector<char> back(len);
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        for (size_t i = 0; i < 5; i++) {
+          add_space(buffer, gen);
+        }
+        buffer16.resize(buffer.size());
+        for (size_t i = 0; i < buffer.size(); i++) {
+          buffer16[i] = buffer[i];
+        }
+        ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit();
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Now let us decode the rest !!!
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(r.count);
+        ASSERT_EQUAL(second_length + limited_length, len);
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
 
 TEST(streaming_base64_roundtrip) {
   size_t len = 2048;
@@ -226,25 +647,25 @@ TEST(streaming_base64_roundtrip) {
   for (size_t i = 0; i < len; i++) {
     source[i] = byte_generator(gen);
   }
-  size_t size = implementation.binary_to_base64(
-          source.data(), source.size(), buffer.data());
+  size_t size = implementation.binary_to_base64(source.data(), source.size(),
+                                                buffer.data());
   buffer.resize(size);
   for (size_t window = 16; window <= 2048; window += 7) {
     // build a buffer with enough space to receive the decoded base64
     std::vector<char> back(len);
     size_t outpos = 0;
-    for(size_t pos = 0; pos < buffer.size(); pos += window) {
+    for (size_t pos = 0; pos < buffer.size(); pos += window) {
       size_t count = std::min(window, buffer.size() - pos);
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data() + pos, count, back.data() + outpos);
+      simdutf::result r = simdutf::base64_to_binary(buffer.data() + pos, count,
+                                                    back.data() + outpos);
       ASSERT_TRUE(r.error != simdutf::error_code::INVALID_BASE64_CHARACTER);
-      if(count + pos == buffer.size()) {
+      if (count + pos == buffer.size()) {
         // We must check that the last call to base64_to_binary did not
         // end with an BASE64_INPUT_REMAINDER error.
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
       } else {
         size_t tail_bytes_to_reprocess = 0;
-        if(r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+        if (r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
           tail_bytes_to_reprocess = 1;
         } else {
           tail_bytes_to_reprocess = (r.count % 3) == 0 ? 0 : (r.count % 3) + 1;
@@ -259,31 +680,34 @@ TEST(streaming_base64_roundtrip) {
   }
 }
 
-
 TEST(readme_test) {
   size_t len = 2048;
   std::vector<char> base64(len, 'a');
-  std::vector<char> back((len+3)/4*3);
+  std::vector<char> back((len + 3) / 4 * 3);
   size_t outpos = 0;
   size_t window = 512;
-  for(size_t pos = 0; pos < base64.size(); pos += window) {
+  for (size_t pos = 0; pos < base64.size(); pos += window) {
     // how many base64 characters we can process in this iteration
     size_t count = std::min(window, base64.size() - pos);
-    simdutf::result r = simdutf::base64_to_binary(
-        base64.data() + pos, count, back.data() + outpos);
-    if(r.error == simdutf::error_code::INVALID_BASE64_CHARACTER) {
-      std::cerr << "Invalid base64 character at position " << pos + r.count << std::endl;
+    simdutf::result r = simdutf::base64_to_binary(base64.data() + pos, count,
+                                                  back.data() + outpos);
+    if (r.error == simdutf::error_code::INVALID_BASE64_CHARACTER) {
+      std::cerr << "Invalid base64 character at position " << pos + r.count
+                << std::endl;
       return;
     }
-    // If we arrived at the end of the base64 input, we must check that the number
-    // of characters processed is a multiple of 4, or that we have a remainder of 0, 2 or 3.
-    if(count + pos == base64.size() && r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
-      std::cerr << "The base64 input contained an invalid number of characters " << std::endl;
+    // If we arrived at the end of the base64 input, we must check that the
+    // number of characters processed is a multiple of 4, or that we have a
+    // remainder of 0, 2 or 3.
+    if (count + pos == base64.size() &&
+        r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+      std::cerr << "The base64 input contained an invalid number of characters "
+                << std::endl;
     }
-    // If we are not at then end, we may have to reprocess either 1, 2 or 3 bytes, and
-    // to drop the last 0, 2 or 3 bytes decoded.
+    // If we are not at then end, we may have to reprocess either 1, 2 or 3
+    // bytes, and to drop the last 0, 2 or 3 bytes decoded.
     size_t tail_bytes_to_reprocess = 0;
-    if(r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+    if (r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
       tail_bytes_to_reprocess = 1;
     } else {
       tail_bytes_to_reprocess = (r.count % 3) == 0 ? 0 : (r.count % 3) + 1;
@@ -300,9 +724,9 @@ int main(int argc, char *argv[]) {
   if (argc == 2) {
     try {
       seed = std::stoi(argv[1]);
-    } catch (const std::exception& e) {
-        printf("%s\n", e.what());
-        return EXIT_FAILURE;
+    } catch (const std::exception &e) {
+      printf("%s\n", e.what());
+      return EXIT_FAILURE;
     }
   }
   return simdutf::test::main(argc, argv);
diff --git a/tests/helpers/test.h b/tests/helpers/test.h
index 1d7d20238..5c28a299c 100644
--- a/tests/helpers/test.h
+++ b/tests/helpers/test.h
@@ -41,11 +41,12 @@ void name(const simdutf::implementation& impl) {            \
 static simdutf::test::register_test test_register_##name(#name, name); \
 void test_impl_##name(const simdutf::implementation& implementation)
 
-#define ASSERT_EQUAL(a, b) {                                      \
-  const auto expr = (a);                                          \
-  if (expr != b) {                                                \
-    std::cout << "\nExpected " << expr << " to be " << b << ".\n";\
-    printf("%s \n",#a);                                           \
+#define ASSERT_EQUAL(a, b) {                                                   \
+  const auto expr = (a);                                                       \
+  if (expr != b) {                                                             \
+    std::cout << "\nExpected " << expr << " to be " << b << ".\n";             \
+    printf("%s \n",#a);                                                        \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__); \
     exit(1);                                                      \
   }                                                               \
 }
@@ -54,6 +55,7 @@ void test_impl_##name(const simdutf::implementation& implementation)
   const bool expr = (cond);                                 \
   if (!expr) {                                              \
     printf("expected %s to be true, it's false\n", #cond);  \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__); \
     exit(1);                                                \
   }                                                         \
 }
@@ -62,6 +64,7 @@ void test_impl_##name(const simdutf::implementation& implementation)
   const bool expr = !(cond);                                \
   if (!expr) {                                              \
     printf("expected %s to be false, it's true\n", #cond);  \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__); \
     exit(1);                                                \
   }                                                         \
 }

From 94b7dac8daa58b87ac9262b01ef4769970f20f32 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 12:12:12 -0400
Subject: [PATCH 09/49] Implemented bun benchmark

---
 benchmarks/base64/CMakeLists.txt       |  4 +-
 benchmarks/base64/benchmark_base64.cpp | 87 ++++++++++++++++++++++----
 2 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/benchmarks/base64/CMakeLists.txt b/benchmarks/base64/CMakeLists.txt
index a866b9609..d00fc855e 100644
--- a/benchmarks/base64/CMakeLists.txt
+++ b/benchmarks/base64/CMakeLists.txt
@@ -9,6 +9,8 @@ CPMAddPackage(
 
 add_executable(benchmark_base64 benchmark_base64.cpp)
 
-target_link_libraries(benchmark_base64 PUBLIC simdutf)
+set_property(TARGET benchmark_base64 PROPERTY CXX_STANDARD 17)
+set_property(TARGET benchmark_base64 PROPERTY CXX_STANDARD_REQUIRED ON)
+
 target_link_libraries(benchmark_base64 PUBLIC base64)
 target_link_libraries(benchmark_base64 PUBLIC simdutf::benchmarks::benchmark)
diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index 579d1e198..ed41a9406 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -10,8 +10,8 @@
 #include <vector>
 
 #include "libbase64.h"
-#include "simdutf.h"
 #include "node_base64.h"
+#include "simdutf.h"
 
 #include "event_counter.h"
 #include <atomic>
@@ -34,7 +34,7 @@ bool is_space(char c) {
 // This is for reference only, do not use this function in production
 // system.
 int base64_decode_skip_spaces(const char *src, size_t srclen, char *out,
-                                     size_t *outlen) {
+                              size_t *outlen) {
   struct base64_state state;
   base64_stream_decode_init(&state, 0);
   const char *srcend = src + srclen;
@@ -65,7 +65,7 @@ int base64_decode_skip_spaces(const char *src, size_t srclen, char *out,
   return !state.bytes;
 }
 
-enum : uint8_t { roundtrip = 0, decode = 1, encode = 2 };
+enum : uint8_t { roundtrip = 0, decode = 1, encode = 2, bun = 3 };
 
 event_collector collector;
 
@@ -277,14 +277,14 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
                      }
                    }));
     }
-    pretty_print(data.size(), volume, "node",
-                 bench([&data, &buffer1, &buffer2]() {
-                   for (const std::vector<char> &source : data) {
-                     int result = node::base64_decode(buffer1.data(), buffer1.size(),
-                                    source.data(), source.size());
-                     (void) result;
-                   }
-                 }));
+    pretty_print(
+        data.size(), volume, "node", bench([&data, &buffer1, &buffer2]() {
+          for (const std::vector<char> &source : data) {
+            int result = node::base64_decode(buffer1.data(), buffer1.size(),
+                                             source.data(), source.size());
+            (void)result;
+          }
+        }));
     for (auto &e : simdutf::get_available_implementations()) {
       if (!e->supported_by_runtime_system()) {
         continue;
@@ -316,7 +316,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
     printf("# encode\n");
     volatile size_t base64_size;
     pretty_print(data.size(), volume, "libbase64",
-                 bench([&data, &buffer1, &buffer2, &base64_size]() {
+                 bench([&data, &buffer1, &base64_size]() {
                    for (const std::vector<char> &source : data) {
                      size_t outlen;
                      base64_encode(source.data(), source.size(), buffer1.data(),
@@ -329,7 +329,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
         continue;
       }
       pretty_print(data.size(), volume, "simdutf::" + e->name(),
-                   bench([&data, &buffer1, &buffer2, &e, &base64_size]() {
+                   bench([&data, &buffer1, &e, &base64_size]() {
                      for (const std::vector<char> &source : data) {
                        base64_size = e->binary_to_base64(
                            source.data(), source.size(), buffer1.data());
@@ -341,6 +341,62 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
   }
 }
 
+int bench_bun() {
+  /**
+   * See
+   * https://github.com/oven-sh/bun/blob/main/bench/snippets/buffer-to-string.mjs
+   *
+   * const bigBuffer = Buffer.from("hello world".repeat(10000));
+   * const converted = bigBuffer.toString("base64");
+   * const uuid = crypto.randomBytes(16);
+   *
+   * bench(`Buffer(${bigBuffer.byteLength}).toString('base64')`, () => {
+   * return bigBuffer.toString("base64");
+   * });
+   *
+   * bench(`Buffer(${uuid.byteLength}).toString('base64')`, () => {
+   *  return uuid.toString("base64");
+   * });
+   */
+  printf("# benching bun (essentially an encoding bench)\n");
+  std::string bigBuffer = "hello world";
+  bigBuffer.reserve(10000 * bigBuffer.size());
+  for (size_t i = 1; i < 10000; i++) {
+    bigBuffer += "hello world";
+  }
+  std::string crypto;
+  for (size_t i = 0; i < 16; i++) {
+    crypto += rand();
+  }
+  std::vector<std::pair<std::string, std::string>> tests = {
+      {"big hello world", bigBuffer}, {"random 16 bytes", crypto}};
+  // Could be nicer with C++20
+  for (auto & i : tests) {
+    printf("# %s\n", i.first.c_str());
+    std::string source = i.second;
+    volatile size_t base64_size;
+    std::vector<char> buffer1(simdutf::base64_length_from_binary(source.size()));
+    pretty_print(1, source.size(), "libbase64",
+                 bench([&source, &buffer1, &base64_size]() {
+                   size_t outlen;
+                   base64_encode(source.data(), source.size(), buffer1.data(),
+                                 &outlen, 0);
+                   base64_size = outlen;
+                 }));
+    for (auto &e : simdutf::get_available_implementations()) {
+      if (!e->supported_by_runtime_system()) {
+        continue;
+      }
+      pretty_print(1, source.size(), "simdutf::" + e->name(),
+                   bench([&source, &buffer1, &e, &base64_size]() {
+                     base64_size = e->binary_to_base64(
+                         source.data(), source.size(), buffer1.data());
+                   }));
+    }
+  }
+  return EXIT_SUCCESS;
+}
+
 int main(int argc, char **argv) {
   printf("# current system detected as %s.\n",
          simdutf::get_active_implementation()->name().c_str());
@@ -363,10 +419,15 @@ int main(int argc, char **argv) {
       mode = encode;
     } else if ((arg == "-r") || (arg == "--roundtrip")) {
       mode = roundtrip;
+    } else if ((arg == "-b") || (arg == "--bun")) {
+      mode = bun;
     } else {
       arguments.push_back(std::move(arg));
     }
   }
+  if (bun) {
+    return bench_bun();
+  }
   auto return_value = EXIT_SUCCESS;
   std::vector<std::vector<char>> input;
   printf("# loading files: ");

From c35d8df803f804d871d7d74e0a79f3dabb2d19e5 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 12:13:35 -0400
Subject: [PATCH 10/49] Obvious fix.

---
 benchmarks/base64/benchmark_base64.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index ed41a9406..d1078cf07 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -425,7 +425,7 @@ int main(int argc, char **argv) {
       arguments.push_back(std::move(arg));
     }
   }
-  if (bun) {
+  if (mode == bun) {
     return bench_bun();
   }
   auto return_value = EXIT_SUCCESS;

From 1a90f2a8cc83066f59b7b38166e0921772de99d3 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 12:14:44 -0400
Subject: [PATCH 11/49] documentation

---
 benchmarks/base64/benchmark_base64.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index d1078cf07..b280e42cc 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -110,10 +110,12 @@ std::vector<char> read_file(const char *filename,
 void show_help() {
   printf("Usage: benchmark_base64 [options] file1 [file2 ...]\n");
   printf("Options:\n");
-  printf("  -h, --help     Show this help message and exit\n");
-  printf("  -d, --decode   Decode the input file\n");
-  printf("  -e, --encode   Encode the input file\n");
+  printf("  -h, --help        Show this help message and exit\n");
+  printf("  -d, --decode      Decode the input file\n");
+  printf("  -e, --encode      Encode the input file\n");
   printf("  -r, --roundtrip   Roundtrip the input file\n");
+  printf("  -b, --bun         Bun benchmark\n");
+
   printf(" See https://github.com/lemire/base64data for test data.\n");
 }
 void pretty_print(size_t, size_t bytes, std::string name, event_aggregate agg) {

From bd454eafd8467fc4d51b5e863b66be786e25e038 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 14:30:47 -0400
Subject: [PATCH 12/49] adding libbase64 competitor

---
 benchmarks/base64/benchmark_base64.cpp | 24 +++++++++++--
 benchmarks/base64/libbase64_spaces.h   | 50 ++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 3 deletions(-)
 create mode 100644 benchmarks/base64/libbase64_spaces.h

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index b280e42cc..22eaf8dc5 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -10,7 +10,9 @@
 #include <vector>
 
 #include "libbase64.h"
+#include "libbase64_spaces.h"
 #include "node_base64.h"
+
 #include "simdutf.h"
 
 #include "event_counter.h"
@@ -261,7 +263,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
     bool spaces = contains_spaces(data);
     if (spaces) {
       printf("# the base64 data contains spaces, so we cannot use straigth "
-             "libbase64::base64_decode\n");
+             "libbase64::base64_decode directly\n");
     } else {
       pretty_print(data.size(), volume, "libbase64",
                    bench([&data, &buffer1, &buffer2]() {
@@ -279,6 +281,21 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
                      }
                    }));
     }
+    pretty_print(
+        data.size(), volume, "libbase64_space_decode",
+        bench([&data, &buffer1, &buffer2]() {
+          for (const std::vector<char> &source : data) {
+
+            size_t outlen;
+            bool ok = libbase64_space_decode(source.data(), source.size(),
+                                             buffer1.data(), &outlen);
+            if (!ok) {
+              std::cerr << "Error: "
+                        << " failed to decode base64 " << std::endl;
+              throw std::runtime_error("Error: failed to decode base64 ");
+            }
+          }
+        }));
     pretty_print(
         data.size(), volume, "node", bench([&data, &buffer1, &buffer2]() {
           for (const std::vector<char> &source : data) {
@@ -373,11 +390,12 @@ int bench_bun() {
   std::vector<std::pair<std::string, std::string>> tests = {
       {"big hello world", bigBuffer}, {"random 16 bytes", crypto}};
   // Could be nicer with C++20
-  for (auto & i : tests) {
+  for (auto &i : tests) {
     printf("# %s\n", i.first.c_str());
     std::string source = i.second;
     volatile size_t base64_size;
-    std::vector<char> buffer1(simdutf::base64_length_from_binary(source.size()));
+    std::vector<char> buffer1(
+        simdutf::base64_length_from_binary(source.size()));
     pretty_print(1, source.size(), "libbase64",
                  bench([&source, &buffer1, &base64_size]() {
                    size_t outlen;
diff --git a/benchmarks/base64/libbase64_spaces.h b/benchmarks/base64/libbase64_spaces.h
new file mode 100644
index 000000000..6e68c2caf
--- /dev/null
+++ b/benchmarks/base64/libbase64_spaces.h
@@ -0,0 +1,50 @@
+
+// https://github.com/aklomp/base64/blob/b20a31a997e0b48274fa09e58b65ee9202531e4f/bin/base64.c#L392
+static inline size_t libbase64_find_space(const char *p, const size_t avail) {
+  for (size_t len = 0; len < avail; len++) {
+    if (p[len] == '\n' || p[len] == '\r' || p[len] == ' ' || p[len] == '\t') {
+      return len;
+    }
+  }
+
+  return avail;
+}
+
+// Inspired by
+// https://github.com/aklomp/base64/blob/b20a31a997e0b48274fa09e58b65ee9202531e4f/bin/base64.c#L405
+
+static bool libbase64_space_decode(const char *start, size_t avail, char *outbuf,
+                   size_t *outlen) {
+  struct base64_state state;
+  *outlen = 0;
+
+  // Initialize the decoder's state structure.
+  base64_stream_decode_init(&state, 0);
+
+  while (avail > 0) {
+    size_t len = libbase64_find_space(start, avail);
+    if (len == 0) {
+      start++;
+      avail--;
+      continue;
+    }
+
+    // Decode the chunk into the raw buffer.
+    size_t outlen = 0;
+    if (base64_stream_decode(&state, start, len, outbuf, &outlen) == 0) {
+      // decoding error
+      return false;
+    }
+
+    // Update the output buffer pointer and total size.
+    outbuf += outlen;
+    outlen += outlen;
+    if(avail == len) {
+      break;
+    }
+
+    start += len + 1;
+    avail -= len + 1;
+  }
+  return true;
+}
\ No newline at end of file

From bdab72fcd5d1722f97b0eab8cd2c2cba28c4ff2e Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 15:06:29 -0400
Subject: [PATCH 13/49] more documentation.

---
 README.md              | 24 ++++++++++++++++++++++++
 tests/base64_tests.cpp | 30 +++++++++++++++++++++++++-----
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 1216404a6..e78038ab0 100644
--- a/README.md
+++ b/README.md
@@ -1594,6 +1594,30 @@ if(r.error) {
 In some instances, you may want to limit the size of the output further when decoding base64.
 For this purpose, you may use the `base64_to_binary_safe` functions. The functions may also
 be useful if you seek to decode the input into segments having a maximal capacity.
+
+
+```C++
+  size_t len = 72; // for simplicity we chose len divisible by 3
+  std::vector<char> base64(len, 'a'); // we want to decode 'aaaaa....'
+  std::vector<char> back((len + 3) / 4 * 3);
+  size_t limited_length = back.size() / 2; // Intentionally too small
+  // We proceed to decode half:
+  simdutf::result r = simdutf::base64_to_binary_safe(
+            base64.data(), base64.size(), back.data(), limited_length);
+  assert(r.error == simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+  // We decoded r.count base64 bytes to limited_length bytes
+  // Now let us decode the rest !!!
+  size_t input_index = r.count;
+  size_t limited_length2 = back.size();
+  r = simdutf::base64_to_binary_safe(base64.data() + input_index,
+                                           base64.size() - input_index,
+                                           back.data(), limited_length2);
+  assert(r.error == simdutf::error_code::SUCCESS);
+  // We decoded r.count base64 bytes to limited_length2 bytes
+  // We are done
+  assert(limited_length2 + limited_length == (len + 3) / 4 * 3);
+```
+
 See our function specifications for more details.
 
 In other instances, you may receive your base64 inputs in 16-bit units (e.g., from UTF-16 strings):
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index 48b7b9e6c..e04dee0a1 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -472,7 +472,7 @@ TEST(aborted_safe_roundtrip_base64) {
                                            buffer.size() - input_index,
                                            back.data(), second_length);
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
-        back.resize(r.count);
+        back.resize(second_length);
         ASSERT_EQUAL(second_length + limited_length, len);
 
         for (size_t i = 0; i < second_length; i++) {
@@ -524,7 +524,7 @@ TEST(aborted_safe_roundtrip_base64_16) {
                                            buffer.size() - input_index,
                                            back.data(), second_length);
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
-        back.resize(r.count);
+        back.resize(second_length);
         ASSERT_EQUAL(second_length + limited_length, len);
         for (size_t i = 0; i < second_length; i++) {
           ASSERT_EQUAL(source[i + limited_length], back[i]);
@@ -572,9 +572,8 @@ TEST(aborted_safe_roundtrip_base64_with_spaces) {
                                            buffer.size() - input_index,
                                            back.data(), second_length);
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
-        back.resize(r.count);
+        back.resize(second_length);
         ASSERT_EQUAL(second_length + limited_length, len);
-
         for (size_t i = 0; i < second_length; i++) {
           ASSERT_EQUAL(source[i + limited_length], back[i]);
         }
@@ -627,7 +626,7 @@ TEST(aborted_safe_roundtrip_base64_16_with_spaces) {
                                            buffer.size() - input_index,
                                            back.data(), second_length);
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
-        back.resize(r.count);
+        back.resize(second_length);
         ASSERT_EQUAL(second_length + limited_length, len);
         for (size_t i = 0; i < second_length; i++) {
           ASSERT_EQUAL(source[i + limited_length], back[i]);
@@ -720,6 +719,27 @@ TEST(readme_test) {
   back.resize(outpos);
 }
 
+TEST(readme_safe) {
+  size_t len = 72;
+  std::vector<char> base64(len, 'a');
+  std::vector<char> back((len + 3) / 4 * 3);
+  size_t limited_length = back.size() / 2; // Intentionally too small
+  simdutf::result r = simdutf::base64_to_binary_safe(
+            base64.data(), base64.size(), back.data(), limited_length);
+  ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+
+  // We decoded 'limited_length' bytes to back.
+  // Now let us decode the rest !!!
+  size_t input_index = r.count;
+  size_t limited_length2 = back.size();
+  r = simdutf::base64_to_binary_safe(base64.data() + input_index,
+                                           base64.size() - input_index,
+                                           back.data(), limited_length2);
+  ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+  back.resize(limited_length2);
+  ASSERT_EQUAL(limited_length2 + limited_length, (len + 3) / 4 * 3);
+}
+
 int main(int argc, char *argv[]) {
   if (argc == 2) {
     try {

From 65f933b4d84fde4a1c0737592c40f8c5254f9e4a Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 22:45:18 -0400
Subject: [PATCH 14/49] base64url (first steps)

---
 README.md                             |  34 ++--
 include/simdutf/implementation.h      |  30 ++-
 scripts/base64/table.py               |  42 ++++
 src/arm64/arm_base64.cpp              |  15 +-
 src/arm64/implementation.cpp          |  12 +-
 src/fallback/implementation.cpp       |  12 +-
 src/haswell/avx2_base64.cpp           |  13 +-
 src/haswell/implementation.cpp        |  12 +-
 src/icelake/icelake_base64.inl.cpp    |  13 +-
 src/icelake/implementation.cpp        |  12 +-
 src/implementation.cpp                |  42 ++--
 src/ppc64/implementation.cpp          |  12 +-
 src/rvv/implementation.cpp            |  12 +-
 src/scalar/base64.h                   |  63 +++---
 src/simdutf/arm64/implementation.h    |   6 +-
 src/simdutf/fallback/implementation.h |   6 +-
 src/simdutf/haswell/implementation.h  |   6 +-
 src/simdutf/icelake/implementation.h  |   6 +-
 src/simdutf/ppc64/implementation.h    |   6 +-
 src/simdutf/rvv/implementation.h      |   6 +-
 src/simdutf/westmere/implementation.h |   6 +-
 src/tables/base64_tables.h            | 265 +++++++++++++++++++++++++-
 src/westmere/implementation.cpp       |  12 +-
 src/westmere/sse_base64.cpp           |  13 +-
 tests/base64_tests.cpp                |  20 ++
 25 files changed, 519 insertions(+), 157 deletions(-)
 create mode 100644 scripts/base64/table.py

diff --git a/README.md b/README.md
index e78038ab0..be0cff736 100644
--- a/README.md
+++ b/README.md
@@ -1631,6 +1631,14 @@ a few tens of kilobytes.
 The specification of our base64 functions is as follows:
 
 ```C++
+
+// base64_options are used to specify the base64 encoding options.
+using base64_options = uint64_t;
+enum : base64_options {
+  base64_default = 0, /* standard base64 format */
+  base64_url = 1 /* base64url format*/
+};
+
 /**
  * Provide the maximal binary length in bytes given the base64 input.
  * In general, if the input contains ASCII spaces, the result will be less than
@@ -1647,7 +1655,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * In general, if the input contains ASCII spaces, the result will be less than
  * the maximum length.
  *
- * @param input         the base64 input to process in 16-bit units
+ * @param input         the base64 input to process, in ASCII stored as 16-bit units
  * @param length        the length of the base64 input in 16-bit units
  * @return maximal number of binary bytes
  */
@@ -1677,9 +1685,10 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * in
  * @param input         the base64 string to process
  * @param length        the length of the string in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Provide the base64 length in bytes given the length of a binary input.
@@ -1698,9 +1707,10 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  * @param input         the binary to process
  * @param length        the length of the input in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return number of written bytes, will be equal to base64_length_from_binary(length)
  */
-size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
+size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Convert a base64 input to a binary ouput.
@@ -1723,13 +1733,13 @@ size_t binary_to_base64(const char * input, size_t length, char* output) noexcep
  * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
  * If you fail to provide that much space, the function may cause a buffer overflow.
  *
- * @param input         the base64 string to process in UTF-16 (native endianess)
+ * @param input         the base64 string to process, in ASCII stored as 16-bit units
  * @param length        the length of the string in 16-bit units
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
-
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
 
 /**
  * Convert a base64 input to a binary ouput.
@@ -1749,20 +1759,16 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
  * r.count contains the number of bytes decoded.
  *
- * When the error is OUTPUT_BUFFER_TOO_SMALL, then r.count contains the location in the input
- * where we stopped decoding.
- *
- * In all case, the outlen parameter is modified to contain the number of bytes
- * that have been written/decoded.
  *
  * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
  * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
  * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful. Note that the return convention of base64_to_binary_safe differs from base64_to_binary.
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful.
  */
-simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
-simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
 
 ```
 
diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index fbf20f8fc..a6784ba3a 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1380,6 +1380,13 @@ simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t le
  */
 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length);
 
+// base64_options are used to specify the base64 encoding options.
+using base64_options = uint64_t;
+enum : base64_options {
+  base64_default = 0, /* standard base64 format */
+  base64_url = 1 /* base64url format*/
+};
+
 /**
  * Provide the maximal binary length in bytes given the base64 input.
  * In general, if the input contains ASCII spaces, the result will be less than
@@ -1426,9 +1433,10 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * in
  * @param input         the base64 string to process
  * @param length        the length of the string in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Provide the base64 length in bytes given the length of a binary input.
@@ -1447,9 +1455,10 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  * @param input         the binary to process
  * @param length        the length of the input in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return number of written bytes, will be equal to base64_length_from_binary(length)
  */
-size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
+size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Convert a base64 input to a binary ouput.
@@ -1475,9 +1484,10 @@ size_t binary_to_base64(const char * input, size_t length, char* output) noexcep
  * @param input         the base64 string to process, in ASCII stored as 16-bit units
  * @param length        the length of the string in 16-bit units
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
 
 /**
  * Convert a base64 input to a binary ouput.
@@ -1502,10 +1512,11 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
  * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful.
  */
-simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
-simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
 
 /**
  * An implementation of simdutf for a particular CPU architecture.
@@ -2609,9 +2620,10 @@ class implementation {
    * @param input         the base64 string to process
    * @param length        the length of the string in bytes
    * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
    * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
    */
-  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept = 0;
+  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
   /**
    * Convert a base64 input to a binary ouput.
@@ -2633,9 +2645,10 @@ class implementation {
    * @param input         the base64 string to process, in ASCII stored as 16-bit units
    * @param length        the length of the string in 16-bit units
    * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
    * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
    */
-  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept = 0;
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
   /**
    * Provide the base64 length in bytes given the length of a binary input.
@@ -2654,9 +2667,10 @@ class implementation {
    * @param input         the binary to process
    * @param length        the length of the input in bytes
    * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
    * @return number of written bytes, will be equal to base64_length_from_binary(length)
    */
-  virtual size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept = 0;
+  virtual size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
 
 protected:
diff --git a/scripts/base64/table.py b/scripts/base64/table.py
new file mode 100644
index 000000000..ff83aa316
--- /dev/null
+++ b/scripts/base64/table.py
@@ -0,0 +1,42 @@
+import base64
+#default:
+#t=[255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]
+#baseur::
+t=[255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]
+def formula(a, b, c, d):
+    if(a >= 64 or b >= 64 or c >= 64 or d >= 64):
+        return 0x01ffffff
+    z =[ ((a * 4) + (b // 16))% 256, ((b * 16) % 256 + (c // 4))% 256 , ((c * 64) % 256 + d)% 256 ]
+    return z[0] + (z[1] << 8) + (z[2] << 16)
+
+acc = "const uint32_t d0[256] = {"
+for i in range(256):
+    a = formula(t[i], 0, 0, 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d1[256] = {"
+for i in range(256):
+    a = formula(0, t[i], 0, 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d2[256] = {"
+for i in range(256):
+    a = formula(0, 0, t[i], 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d3[256] = {"
+for i in range(256):
+    a = formula(0, 0, 0, t[i])
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
\ No newline at end of file
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 239176d97..58d885e17 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -26,7 +26,7 @@
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
 
-size_t encode_base64(char *dst, const char *src, size_t srclen) {
+size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options options) {
   // credit: Wojciech Muła
   uint8_t *out = (uint8_t *)dst;
   constexpr static uint8_t source_table[64] = {
@@ -55,7 +55,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     vst4q_u8(out, result);
     out += 64;
   }
-  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
 
   return size_t((char *)out - dst);
 }
@@ -236,7 +236,8 @@ void base64_decode_block(char *out, const char *src) {
 }
 
 template <typename char_type>
-result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
+result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -265,7 +266,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
         src -= 64;
 
         while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -302,7 +303,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
   int last_block = (int)((bufferptr - buffer_start) % 64);
   if (last_block != 0 && srcend - src + last_block >= 64) {
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -346,7 +347,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -387,7 +388,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/arm64/implementation.cpp b/src/arm64/implementation.cpp
index f8d6a566a..75fc037b5 100644
--- a/src/arm64/implementation.cpp
+++ b/src/arm64/implementation.cpp
@@ -839,24 +839,24 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return encode_base64(output, input, length, options);
 }
 
 
diff --git a/src/fallback/implementation.cpp b/src/fallback/implementation.cpp
index c469dbbef..f7c7d9321 100644
--- a/src/fallback/implementation.cpp
+++ b/src/fallback/implementation.cpp
@@ -349,7 +349,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -359,7 +359,7 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 
@@ -367,7 +367,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -377,15 +377,15 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::tail_encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::tail_encode_base64(output, input, length, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 1f222b3b8..12954b60c 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -149,7 +149,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     out += 32;
   }
   return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
 }
 
 static inline void compress(__m128i data, uint16_t mask, char *output) {
@@ -329,7 +329,8 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
 }
 
 template <typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -361,7 +362,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
       if (error) {
         src -= 64;
         while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -407,7 +408,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   if (last_block != 0 && srcend - src + last_block >= 64) {
 
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -455,7 +456,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -495,7 +496,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 733f83b62..8f24b7e2a 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -782,24 +782,24 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return encode_base64(output, input, length, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index a7ff0c091..94eb4feb8 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -58,7 +58,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     out += 64;
   }
   return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
 }
 
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
@@ -138,7 +138,8 @@ static inline void base64_decode_block(char *out, block64 *b) {
 }
 
 template <typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -167,7 +168,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
       if (error) {
         src -= 64;
         while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -203,7 +204,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   if (last_block != 0 && srcend - src + last_block >= 64) {
 
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -245,7 +246,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -286,7 +287,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index dae4f0dfd..183de14bc 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1368,16 +1368,16 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 
@@ -1385,8 +1385,8 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return encode_base64(output, input, length, options);
 }
 
 } // namespace SIMDUTF_IMPLEMENTATION
diff --git a/src/implementation.cpp b/src/implementation.cpp
index e964ce565..48a14e386 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -459,24 +459,24 @@ class detect_best_supported_implementation_on_first_use final : public implement
     return set_best()->maximal_binary_length_from_base64(input, length);
   }
 
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept override {
-    return set_best()->base64_to_binary(input, length, output);
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output, options);
   }
 
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept override {
     return set_best()->maximal_binary_length_from_base64(input, length);
   }
 
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept override {
-    return set_best()->base64_to_binary(input, length, output);
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output, options);
   }
 
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept override {
     return set_best()->base64_length_from_binary(length);
   }
 
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept override {
-    return set_best()->binary_to_base64(input, length, output);
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->binary_to_base64(input, length, output, options);
   }
 
   simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
@@ -823,7 +823,7 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
-  simdutf_warn_unused result base64_to_binary(const char *, size_t, char*) const noexcept override {
+  simdutf_warn_unused result base64_to_binary(const char *, size_t, char*, base64_options) const noexcept override {
     return result(error_code::OTHER, 0);
   }
 
@@ -831,7 +831,7 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
-  simdutf_warn_unused result base64_to_binary(const char16_t *, size_t, char*) const noexcept override {
+  simdutf_warn_unused result base64_to_binary(const char16_t *, size_t, char*, base64_options) const noexcept override {
     return result(error_code::OTHER, 0);
   }
 
@@ -840,7 +840,7 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
-  size_t binary_to_base64(const char *, size_t, char*) const noexcept override {
+  size_t binary_to_base64(const char *, size_t, char*, base64_options) const noexcept override {
     return 0;
   }
 
@@ -1290,34 +1290,34 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
   return get_default_implementation()->maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept {
-  return get_default_implementation()->base64_to_binary(input, length, output);
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output, options);
 }
 
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept {
   return get_default_implementation()->maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) noexcept {
-  return get_default_implementation()->base64_to_binary(input, length, output);
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output, options);
 }
 
 template <typename chartype>
-simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
+simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
   static_assert(std::is_same<chartype, char>::value || std::is_same<chartype, char16_t>::value, "Only char and char16_t are supported.");
   // The implementation could be nicer, but we expect that most times, the user
   // will provide us with a buffer that is large enough.
   size_t max_length = maximal_binary_length_from_base64(input, length);
   if(outlen >= max_length) {
     // fast path
-    result r = base64_to_binary(input, length, output);
+    result r = base64_to_binary(input, length, output, options);
     if(r.error != error_code::INVALID_BASE64_CHARACTER) { outlen = r.count; r.count = length; }
     return r;
   }
   // The output buffer is maybe too small. We will decode a truncated version of the input.
   size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
   size_t safe_input = base64_length_from_binary(outlen3);
-  result r = base64_to_binary(input, safe_input, output);
+  result r = base64_to_binary(input, safe_input, output, options);
   if(r.error == error_code::INVALID_BASE64_CHARACTER) { return r; }
   size_t offset = (r.error == error_code::BASE64_INPUT_REMAINDER) ? 1 :
     ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1);
@@ -1340,18 +1340,18 @@ simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, si
       tail_length--;
     }
   }
-  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, tail_input, tail_length);
+  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, tail_input, tail_length, options);
   outlen = output_index + remaining_out;
   r.count += input_index;
   return r;
 }
 
 
-simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept {
-  return base64_to_binary_safe_impl<char>(input, length, output, outlen);
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
+  return base64_to_binary_safe_impl<char>(input, length, output, outlen, options);
 }
-simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept {
-  return base64_to_binary_safe_impl<char16_t>(input, length, output, outlen);
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
+  return base64_to_binary_safe_impl<char16_t>(input, length, output, outlen, options);
 }
 
 simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
diff --git a/src/ppc64/implementation.cpp b/src/ppc64/implementation.cpp
index 161ae19d9..f33444d41 100644
--- a/src/ppc64/implementation.cpp
+++ b/src/ppc64/implementation.cpp
@@ -298,7 +298,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -308,23 +308,23 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return scalar::base64::base64_to_binary(input, length, output);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::base64_to_binary(input, length, output, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::binary_to_base64(input, length, output);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::binary_to_base64(input, length, output, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/rvv/implementation.cpp b/src/rvv/implementation.cpp
index 7dda20c8a..7b4ecf96b 100644
--- a/src/rvv/implementation.cpp
+++ b/src/rvv/implementation.cpp
@@ -82,7 +82,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -92,7 +92,7 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 
@@ -100,7 +100,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -110,15 +110,15 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::tail_encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::tail_encode_base64(output, input, length, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index 7a19087fe..3e3b617bb 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -12,7 +12,13 @@ namespace base64 {
 // Returns true upon success. The destination buffer must be large enough.
 // This functions assumes that the padding (=) has been removed.
 template <class char_type>
-result base64_tail_decode(char *dst, const char_type *src, size_t length) {
+result base64_tail_decode(char *dst, const char_type *src, size_t length, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url) ? tables::base64::base64_url::d2 : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3;
+
   const char_type *srcend = src + length;
   const char_type *srcinit = src;
   const char *dstinit = dst;
@@ -22,8 +28,8 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
   uint8_t buffer[4];
   while (true) {
     while (src + 4 <= srcend &&
-           (x = tables::base64::d0[uint8_t(src[0])] | tables::base64::d1[uint8_t(src[1])] |
-                tables::base64::d2[uint8_t(src[2])] | tables::base64::d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
       if(match_system(endianness::BIG)) {
         x = scalar::utf32::swap_bytes(x);
       }
@@ -35,7 +41,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
       char_type c = *src;
-      uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
+      uint8_t code = to_base64[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
         idx++;
@@ -98,7 +104,13 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
 // outlen is modified to reflect the number of bytes written.
 // This functions assumes that the padding (=) has been removed.
 template <class char_type>
-result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length) {
+result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url) ? tables::base64::base64_url::d2 : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3;
+
   const char_type *srcend = src + length;
   const char_type *srcinit = src;
   const char *dstinit = dst;
@@ -109,8 +121,8 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
   uint8_t buffer[4];
   while (true) {
     while (src + 4 <= srcend &&
-           (x = tables::base64::d0[uint8_t(src[0])] | tables::base64::d1[uint8_t(src[1])] |
-                tables::base64::d2[uint8_t(src[2])] | tables::base64::d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
       if(match_system(endianness::BIG)) {
         x = scalar::utf32::swap_bytes(x);
       }
@@ -128,7 +140,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
       char_type c = *src;
-      uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
+      uint8_t code = to_base64[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
         idx++;
@@ -203,35 +215,38 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
 
 // Returns the number of bytes written. The destination buffer must be large
 // enough. It will add padding (=) if needed.
-size_t tail_encode_base64(char *dst, const char *src, size_t srclen) {
+size_t tail_encode_base64(char *dst, const char *src, size_t srclen, base64_options options) {
+  const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0 : tables::base64::base64_default::e0;
+  const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1 : tables::base64::base64_default::e1;
+  const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2 : tables::base64::base64_default::e2;
   char *out = dst;
   size_t i = 0;
   uint8_t t1, t2, t3;
   for (; i + 2 < srclen; i += 3) {
-    t1 = (uint8_t)src[i];
-    t2 = (uint8_t)src[i + 1];
-    t3 = (uint8_t)src[i + 2];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
-    *out++ = tables::base64::e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
-    *out++ = tables::base64::e2[t3];
+    t1 = uint8_t(src[i]);
+    t2 = uint8_t(src[i + 1]);
+    t3 = uint8_t(src[i + 2]);
+    *out++ = e0[t1];
+    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
+    *out++ = e2[t3];
   }
   switch (srclen - i) {
   case 0:
     break;
   case 1:
-    t1 = (uint8_t)src[i];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[(t1 & 0x03) << 4];
+    t1 = uint8_t(src[i]);
+    *out++ = e0[t1];
+    *out++ = e1[(t1 & 0x03) << 4];
     *out++ = '=';
     *out++ = '=';
     break;
   default: /* case 2 */
-    t1 = (uint8_t)src[i];
-    t2 = (uint8_t)src[i + 1];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
-    *out++ = tables::base64::e2[(t2 & 0x0F) << 2];
+    t1 = uint8_t(src[i]);
+    t2 = uint8_t(src[i + 1]);
+    *out++ = e0[t1];
+    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = e2[(t2 & 0x0F) << 2];
     *out++ = '=';
   }
   return (size_t)(out - dst);
diff --git a/src/simdutf/arm64/implementation.h b/src/simdutf/arm64/implementation.h
index 5e0d89ace..1d6fbd423 100644
--- a/src/simdutf/arm64/implementation.h
+++ b/src/simdutf/arm64/implementation.h
@@ -90,11 +90,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace arm64
diff --git a/src/simdutf/fallback/implementation.h b/src/simdutf/fallback/implementation.h
index c8dfc2037..40fdcc246 100644
--- a/src/simdutf/fallback/implementation.h
+++ b/src/simdutf/fallback/implementation.h
@@ -93,11 +93,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 } // namespace fallback
 } // namespace simdutf
diff --git a/src/simdutf/haswell/implementation.h b/src/simdutf/haswell/implementation.h
index 79969941b..f3eb7e4db 100644
--- a/src/simdutf/haswell/implementation.h
+++ b/src/simdutf/haswell/implementation.h
@@ -92,11 +92,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace haswell
diff --git a/src/simdutf/icelake/implementation.h b/src/simdutf/icelake/implementation.h
index 4638bf9b9..495a05a59 100644
--- a/src/simdutf/icelake/implementation.h
+++ b/src/simdutf/icelake/implementation.h
@@ -92,11 +92,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace icelake
diff --git a/src/simdutf/ppc64/implementation.h b/src/simdutf/ppc64/implementation.h
index 7fd324493..ee0c7dcd4 100644
--- a/src/simdutf/ppc64/implementation.h
+++ b/src/simdutf/ppc64/implementation.h
@@ -70,11 +70,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace ppc64
diff --git a/src/simdutf/rvv/implementation.h b/src/simdutf/rvv/implementation.h
index 56f02362d..d4e668581 100644
--- a/src/simdutf/rvv/implementation.h
+++ b/src/simdutf/rvv/implementation.h
@@ -94,11 +94,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, size_t len) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 private:
   const bool _supports_zvbb;
 
diff --git a/src/simdutf/westmere/implementation.h b/src/simdutf/westmere/implementation.h
index 190693783..d10dfb433 100644
--- a/src/simdutf/westmere/implementation.h
+++ b/src/simdutf/westmere/implementation.h
@@ -90,11 +90,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace westmere
diff --git a/src/tables/base64_tables.h b/src/tables/base64_tables.h
index a0f997733..f835f141b 100644
--- a/src/tables/base64_tables.h
+++ b/src/tables/base64_tables.h
@@ -7,6 +7,7 @@ namespace simdutf {
 namespace {
 namespace tables {
 namespace base64 {
+namespace base64_default {
 
 const char e0[256] = {
     'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
@@ -68,8 +69,6 @@ const char e2[256] = {
     'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
     '/'};
 
-/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN CPUS */
-
 const uint32_t d0[256] = {
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
@@ -249,6 +248,247 @@ const uint32_t d3[256] = {
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+} // namespace base64_default
+
+namespace base64_url {
+
+const char e0[256] = {
+    'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
+    'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
+    'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
+    'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
+    'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
+    'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
+    'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
+    'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
+    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
+    'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
+    'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
+    'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
+    't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
+    'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
+    '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
+    '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
+    '8', '8', '8', '8', '9', '9', '9', '9', '-', '-', '-', '-', '_', '_', '_',
+    '_'};
+
+const char e1[256] = {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
+    '_'};
+
+const char e2[256] = {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
+    '_'};
+
+const uint32_t d0[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff,
+    0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
+    0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
+    0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
+    0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
+    0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
+    0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc,
+    0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
+    0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
+    0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
+    0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
+    0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d1[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff,
+    0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
+    0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
+    0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
+    0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
+    0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
+    0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003,
+    0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
+    0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
+    0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
+    0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
+    0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d2[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff,
+    0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
+    0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
+    0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
+    0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
+    0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
+    0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00,
+    0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
+    0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
+    0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
+    0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
+    0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d3[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff,
+    0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
+    0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
+    0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
+    0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
+    0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
+    0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000,
+    0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
+    0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
+    0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
+    0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
+    0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+} // namespace base64_url
 const uint64_t thintable_epi8[256] = {
     0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
     0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
@@ -388,6 +628,27 @@ const uint8_t to_base64_value[] = {
     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     255};
+
+const uint8_t to_base64_url_value[] = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    62,  255, 255, 52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 63,  255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+
 } // namespace base64
 } // namespace tables
 } // unnamed namespace
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index a491818c1..c995df881 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -783,16 +783,16 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length, base64_options options) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length, options);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index ef57f5184..c72118c7b 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -158,7 +158,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
   }
 
   return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
 }
 static inline void compress(__m128i data, uint16_t mask, char *output) {
   if (mask == 0) {
@@ -339,7 +339,8 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
 }
 
 template <typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -371,7 +372,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
       if (error) {
         src -= 64;
         while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -416,7 +417,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   int last_block = (int)((bufferptr - buffer_start) % 64);
   if (last_block != 0 && srcend - src + last_block >= 64) {
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -464,7 +465,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -506,7 +507,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index e04dee0a1..a22dd18d0 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -31,6 +31,26 @@ const uint8_t to_base64_value[] = {
     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     255};
 
+
+const uint8_t to_base64url_value[] = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  255,
+    62, 255, 255,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 63, 255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
 template <typename char_type>
 size_t add_space(std::vector<char_type> &v, std::mt19937 &gen) {
   const static std::array<char_type, 4> space = {' ', '\t', '\n', '\r'};

From 4aa837d44b1126a32badaa9fc7971afbe23c46a6 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 28 Mar 2024 11:56:01 -0400
Subject: [PATCH 15/49] working through

---
 scripts/base64/neon_decode.py      | 33 ++++++++++++++++++++++++++++++
 src/arm64/arm_base64.cpp           | 16 +++++++++++----
 src/arm64/implementation.cpp       |  4 ++--
 src/haswell/avx2_base64.cpp        |  4 ++--
 src/haswell/implementation.cpp     |  4 ++--
 src/icelake/icelake_base64.inl.cpp |  4 ++--
 src/icelake/implementation.cpp     |  4 ++--
 src/westmere/implementation.cpp    |  4 ++--
 src/westmere/sse_base64.cpp        |  4 ++--
 9 files changed, 59 insertions(+), 18 deletions(-)
 create mode 100644 scripts/base64/neon_decode.py

diff --git a/scripts/base64/neon_decode.py b/scripts/base64/neon_decode.py
new file mode 100644
index 000000000..6ce185cb5
--- /dev/null
+++ b/scripts/base64/neon_decode.py
@@ -0,0 +1,33 @@
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
+spaces=' \t\n\r'
+lut_lo = [0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4]
+lut_hi = [0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20]
+roll = [0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0]
+def decode(s):
+    low = s & 0xf
+    high = s >> 4
+    m = lut_lo[low] & lut_hi[high]
+    if(m > 0x3):
+        return (m, None)
+    if s == 0x2f:
+        off = roll[high - 1]
+    else:
+        off = roll[high]
+    return (m,(s + off)&0xff)
+
+for i in range(256):
+    m,d = decode(i)
+    if d is None:
+        assert t.find(chr(i)) == -1
+        assert spaces.find(chr(i)) == -1
+        continue
+    if m == 0:
+        assert d >= 0
+        # we must have a base64 element
+        v = t.find(chr(i))
+        #print(i, chr(i), v, d)
+        assert v == d
+    else:
+        # we must have a space
+        v = spaces.find(chr(i))
+        assert v >= 0
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 58d885e17..698a3e723 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -36,8 +36,15 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options o
       '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
       'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
   };
+  constexpr static uint8_t source_table_url[64] = {
+      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
+      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
+      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
+      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
+      'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
+  };
   const uint8x16_t v3f = vdupq_n_u8(0x3f);
-  const uint8x16x4_t table = vld4q_u8(source_table);
+  const uint8x16x4_t table = vld4q_u8((options&base64_url) ? source_table_url : source_table);
   size_t i = 0;
   for (; i + 16 * 3 <= srclen; i += 16 * 3) {
     const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
@@ -94,6 +101,7 @@ struct block64 {
   uint8x16_t chunks[4];
 };
 static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
+template <bool base64_url>
 uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t v0f = vdupq_n_u8(0xf);
 
@@ -235,9 +243,9 @@ void base64_decode_block(char *out, const char *src) {
   vst3q_u8((uint8_t *)out, outvec);
 }
 
-template <typename char_type>
+template <bool base64_url, typename char_type>
 result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -261,7 +269,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, ba
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
 
diff --git a/src/arm64/implementation.cpp b/src/arm64/implementation.cpp
index 75fc037b5..e0a35f071 100644
--- a/src/arm64/implementation.cpp
+++ b/src/arm64/implementation.cpp
@@ -840,7 +840,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
@@ -848,7 +848,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 12954b60c..d6886aa86 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -328,9 +328,9 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 24, buffer, 24);
 }
 
-template <typename chartype>
+template <bool base64_url, typename chartype>
 result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 8f24b7e2a..4d3f1951e 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -783,7 +783,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
@@ -791,7 +791,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 94eb4feb8..8b1882ca8 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -137,9 +137,9 @@ static inline void base64_decode_block(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
 }
 
-template <typename chartype>
+template <bool base64_url, typename chartype>
 result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index 183de14bc..8aa9cf886 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1369,7 +1369,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
@@ -1377,7 +1377,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index c995df881..14565397b 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -784,7 +784,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length, base64_options options) const noexcept {
@@ -792,7 +792,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index c72118c7b..2fc986acc 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -338,9 +338,9 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 36, buffer, 12);
 }
 
-template <typename chartype>
+template <bool base64_url, typename chartype>
 result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;

From 8dc79aa9352940ce24d08868f7021cae820bd7d4 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 28 Mar 2024 20:34:13 -0400
Subject: [PATCH 16/49] implemented base64url for ARM.

---
 benchmarks/base64/benchmark_base64.cpp |  32 ++++-
 scripts/base64/neon_decode.py          | 119 ++++++++++++++++
 src/arm64/arm_base64.cpp               | 143 +++++++++++++------
 tests/base64_tests.cpp                 | 182 ++++++++++++++++++++++++-
 4 files changed, 435 insertions(+), 41 deletions(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index 22eaf8dc5..3fb475d58 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -67,7 +67,7 @@ int base64_decode_skip_spaces(const char *src, size_t srclen, char *out,
   return !state.bytes;
 }
 
-enum : uint8_t { roundtrip = 0, decode = 1, encode = 2, bun = 3 };
+enum : uint8_t { roundtrip = 0, decode = 1, encode = 2, bun = 3, roundtripurl = 4 };
 
 event_collector collector;
 
@@ -116,6 +116,7 @@ void show_help() {
   printf("  -d, --decode      Decode the input file\n");
   printf("  -e, --encode      Encode the input file\n");
   printf("  -r, --roundtrip   Roundtrip the input file\n");
+  printf("  --roundtripurl    Roundtrip the input file (URL)\n");
   printf("  -b, --bun         Bun benchmark\n");
 
   printf(" See https://github.com/lemire/base64data for test data.\n");
@@ -212,6 +213,33 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
   printf("# number of inputs: %zu\n", data.size());
 
   switch (mode) {
+
+  case roundtripurl: {
+    printf("# roundtrip (url)\n");
+    for (auto &e : simdutf::get_available_implementations()) {
+      if (!e->supported_by_runtime_system()) {
+        continue;
+      }
+      pretty_print(data.size(), volume, "simdutf::" + e->name(),
+                   bench([&data, &buffer1, &buffer2, &e]() {
+                     for (const std::vector<char> &source : data) {
+                       size_t base64_size = e->binary_to_base64(
+                           source.data(), source.size(), buffer1.data(), simdutf::base64_url);
+                       auto err = e->base64_to_binary(
+                           buffer1.data(), base64_size, buffer2.data(), simdutf::base64_url);
+                       if (err.error) {
+                         std::cerr << "Error:  at position " << err.count
+                                   << std::endl;
+                       } else if (err.count != source.size()) {
+                         std::cerr << "Error: " << err.count
+                                   << " bytes decoded, expected "
+                                   << source.size() << std::endl;
+                       }
+                     }
+                   }));
+    }
+    break;
+  }
   case roundtrip: {
     printf("# roundtrip\n");
     pretty_print(
@@ -439,6 +467,8 @@ int main(int argc, char **argv) {
       mode = encode;
     } else if ((arg == "-r") || (arg == "--roundtrip")) {
       mode = roundtrip;
+    } else if (arg == "--roundtripurl") {
+      mode = roundtripurl;
     } else if ((arg == "-b") || (arg == "--bun")) {
       mode = bun;
     } else {
diff --git a/scripts/base64/neon_decode.py b/scripts/base64/neon_decode.py
index 6ce185cb5..88e5cab95 100644
--- a/scripts/base64/neon_decode.py
+++ b/scripts/base64/neon_decode.py
@@ -31,3 +31,122 @@ def decode(s):
         # we must have a space
         v = spaces.find(chr(i))
         assert v >= 0
+
+
+
+
+## 0x2d is '-' in base64
+## 0x5f is '_' in base64
+
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
+spaces=' \t\n\r'
+
+#3 numbers
+#4-6 letters
+#5-7 letters
+
+#0x2d
+#0x5f
+
+lut_lo = [0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb0]
+lut_hi = [0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20]
+roll = [0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0]
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
+spaces=' \t\n\r' ## ['0x20', '0x9', '0xa', '0xd']
+
+
+lut_lo = [0x0 for i in range(16)]
+lut_hi = [0x0 for i in range(16)]
+#roll = [0 for i in range(16)]
+
+#0x00 are forbidden except for \t \n \r which go to one
+lut_hi[0] = 0x11
+#for c in '\t\n\r':
+#    lut_lo[ord(c) & 0xf] = 0x1
+for z in range(16):
+    if '\t\n\r'.find(chr(z)) != -1:
+        lut_lo[z & 0xf] = 0x1 # allowed
+    else:
+        lut_lo[z] = 0x10 # forbidden
+#0x10 and 0x80 all forbidden
+lut_hi[0x1] = 0x20
+for z in range(0x8, 16):
+    lut_hi[z] = 0x20
+#lut_hi[0x8] = 0x20
+
+for z in range(16):
+    lut_lo[z] |= 0x20
+
+#0x20 selective
+lut_hi[0x2] = 0x42
+for z in range(16):
+    if z == 0:
+        lut_lo[z] |= 0x2
+    elif z != 0xd:
+        lut_lo[z] |= 0x40
+
+
+#0x30 numbers
+lut_hi[0x3] = 0x80
+for z in range(10,16):
+    lut_lo[z] |= 0x80
+
+#0x40, 0x60 letters
+lut_hi[0x4] = 0x8
+lut_hi[0x6] = 0x8
+lut_lo[0] |= 0x8
+
+#0x7 letters
+#0x5 letters
+lut_hi[0x5] |= 0x4
+lut_hi[0x7] |= 0x4
+for i in range(0xb,16):
+    lut_lo[i] |= 0x4
+
+
+
+
+
+def decode(s):
+    low = s & 0xf
+    high = s >> 4
+    m = lut_lo[low] & lut_hi[high]
+    is_underscore = s == 0x5f
+    if(is_underscore):
+        m = 0
+        high = 0
+    if(m > 0x3):
+        return (m, None)
+    if s == 0x2d:
+        off = roll[high - 1]
+    else:
+        off = roll[high]
+    return (m,(s + off)&0xff)
+print(",".join([hex(c) for c in lut_lo]))
+print(",".join([hex(c) for c in lut_hi]))
+print(",".join([hex(c) for c in roll]))
+
+#for c in spaces:
+#    print(hex(ord(c)),decode(ord(c)))
+
+#import sys
+#sys.exit(0)
+
+for i in range(256):
+    m,d = decode(i)
+    #print(hex(i), m, d, chr(i))
+    if d is None:
+        assert t.find(chr(i)) == -1
+        assert spaces.find(chr(i)) == -1
+        continue
+    if m == 0:
+        assert d >= 0
+        # we must have a base64 element
+        v = t.find(chr(i))
+        if(v != d): 
+            print(hex(i), chr(i), v, d)
+        #assert v == d
+    else:
+        # we must have a space
+        v = spaces.find(chr(i))
+        assert v >= 0
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 698a3e723..877d9bdb7 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -26,7 +26,8 @@
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
 
-size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options options) {
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
   // credit: Wojciech Muła
   uint8_t *out = (uint8_t *)dst;
   constexpr static uint8_t source_table[64] = {
@@ -44,7 +45,8 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options o
       'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
   };
   const uint8x16_t v3f = vdupq_n_u8(0x3f);
-  const uint8x16x4_t table = vld4q_u8((options&base64_url) ? source_table_url : source_table);
+  const uint8x16x4_t table =
+      vld4q_u8((options & base64_url) ? source_table_url : source_table);
   size_t i = 0;
   for (; i + 16 * 3 <= srclen; i += 16 * 3) {
     const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
@@ -62,7 +64,8 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options o
     vst4q_u8(out, result);
     out += 64;
   }
-  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
+  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i,
+                                            options);
 
   return size_t((char *)out - dst);
 }
@@ -101,10 +104,22 @@ struct block64 {
   uint8x16_t chunks[4];
 };
 static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
-template <bool base64_url>
-uint64_t to_base64_mask(block64 *b, bool *error) {
+template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t v0f = vdupq_n_u8(0xf);
 
+  uint8x16_t underscore0, underscore1, underscore2, underscore3;
+  if (base64_url) {
+    underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f));
+    underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f));
+    underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f));
+    underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f));
+  } else {
+    (void)underscore0;
+    (void)underscore1;
+    (void)underscore2;
+    (void)underscore3;
+  }
+
   uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
   uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
   uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
@@ -114,31 +129,62 @@ uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
   uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4);
   uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4);
+  uint8x16_t lut_lo;
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t lut_lo =
-      simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                              0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4);
+  if (base64_url) {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xf4, 0xf4, 0xa5, 0xf4, 0xf4);
+  } else {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4);
+  }
 #else
-  const uint8x16_t lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                             0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4};
+  if (base64_url) {
+    lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+              0x70, 0x61, 0xe1, 0xf4, 0xf4, 0xa5, 0xf4, 0xf4};
+  } else {
+    lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+              0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4};
+  }
 #endif
   uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
   uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
   uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
   uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
+  uint8x16_t lut_hi;
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t lut_hi =
-      simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, 0x20,
-                              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  if (base64_url) {
+    lut_hi =
+        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  } else {
+    lut_hi =
+        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  }
 #else
-  const uint8x16_t lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
-                             0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  if (base64_url) {
+    lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  } else {
+    lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  }
 #endif
   uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
   uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
   uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2);
   uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3);
 
+  if (base64_url) {
+    hi0 = vbicq_u8(hi0, underscore0);
+    hi1 = vbicq_u8(hi1, underscore1);
+    hi2 = vbicq_u8(hi2, underscore2);
+    hi3 = vbicq_u8(hi3, underscore3);
+  }
+
   uint8_t checks =
       vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)),
                          vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3))));
@@ -169,23 +215,41 @@ uint64_t to_base64_mask(block64 *b, bool *error) {
   }
   // This is the transformation step that can be done while we are waiting for
   // sum0
+  uint8x16_t roll_lut;
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t roll_lut =
-      simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0,
-                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  if (base64_url) {
+    roll_lut =
+        simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  } else {
+    roll_lut =
+        simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  }
 #else
-  const uint8x16_t roll_lut = {0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
-                               0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  if (base64_url) {
+    roll_lut = {0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                0x0,  0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  } else {
+    roll_lut = {0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  }
 #endif
-  uint8x16_t v2f = vdupq_n_u8(0x2f);
-  uint8x16_t roll0 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], v2f), hi_nibbles0));
-  uint8x16_t roll1 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], v2f), hi_nibbles1));
-  uint8x16_t roll2 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], v2f), hi_nibbles2));
-  uint8x16_t roll3 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], v2f), hi_nibbles3));
+  uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f);
+  if (base64_url) {
+    hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0);
+    hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1);
+    hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2);
+    hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3);
+  }
+  uint8x16_t roll0 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0));
+  uint8x16_t roll1 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1));
+  uint8x16_t roll2 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2));
+  uint8x16_t roll3 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3));
   b->chunks[0] = vaddq_u8(b->chunks[0], roll0);
   b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
   b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
@@ -218,10 +282,10 @@ void load_block(block64 *b, const char *src) {
   b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
 }
 
-inline uint8x16_t load_satured(const uint16_t * data) {
-    uint16x8_t in1 = vld1q_u16(data);
-    uint16x8_t in2 = vld1q_u16(data+8);
-    return vqmovn_high_u16(vqmovn_u16(in1), in2);
+inline uint8x16_t load_satured(const uint16_t *data) {
+  uint16x8_t in1 = vld1q_u16(data);
+  uint16x8_t in2 = vld1q_u16(data + 8);
+  return vqmovn_high_u16(vqmovn_u16(in1), in2);
 }
 
 void load_block(block64 *b, const char16_t *src) {
@@ -244,8 +308,10 @@ void base64_decode_block(char *out, const char *src) {
 }
 
 template <bool base64_url, typename char_type>
-result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+result compress_decode_base64(char *dst, const char_type *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -270,11 +336,11 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, ba
       src += 64;
       bool error = false;
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+      if(badcharmask)
       if (error) {
         src -= 64;
 
-        while (src < srcend &&
-               to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -396,7 +462,8 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, ba
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index a22dd18d0..6465811a6 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -109,7 +109,8 @@ TEST(encode_base64_cases) {
       {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
       {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
       {"123456", "MTIzNDU2"},
-      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"}};
+      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", "IVJ+SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c+fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
   printf(" -- ");
@@ -152,12 +153,63 @@ TEST(encode_base64_cases) {
   }
 }
 
+
+TEST(encode_base64url_cases) {
+  std::vector<std::pair<std::string, std::string>> cases = {
+      {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", "MTIzNDU2"},
+      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", "IVJ-SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c-fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
+  std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
+  std::vector<size_t> counts = {1};
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(
+        implementation.base64_length_from_binary(p.first.size()));
+    ASSERT_EQUAL(buffer.size(), p.second.size());
+    size_t s = implementation.binary_to_base64(p.first.data(), p.first.size(),
+                                               buffer.data(), simdutf::base64_url);
+    ASSERT_EQUAL(s, p.second.size());
+    ASSERT_TRUE(std::string(buffer.data(), buffer.size()) == p.second);
+  }
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data(), simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" --  ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length, simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
 TEST(encode_base64_cases_16) {
   std::vector<std::pair<std::string, std::u16string>> cases = {
       {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
       {"GeeksforGeeks", u"R2Vla3Nmb3JHZWVrcw=="},
       {"123456", u"MTIzNDU2"},
-      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"}};
+      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", u"IVJ+SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c+fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
   printf(" -- ");
@@ -191,6 +243,47 @@ TEST(encode_base64_cases_16) {
   }
 }
 
+
+TEST(encode_base64url_cases_16) {
+  std::vector<std::pair<std::string, std::u16string>> cases = {
+      {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", u"R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", u"MTIzNDU2"},
+      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", u"IVJ-SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c-fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
+  std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
+  std::vector<size_t> counts = {1};
+  printf(" -- ");
+
+  for (std::pair<std::string, std::u16string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data(), simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" -- ");
+  for (std::pair<std::string, std::u16string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length, simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
 TEST(roundtrip_base64) {
   for (size_t len = 0; len < 2048; len++) {
     std::vector<char> source(len, 0);
@@ -274,6 +367,91 @@ TEST(roundtrip_base64_16) {
   }
 }
 
+
+
+TEST(roundtrip_base64url) {
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data(), simdutf::base64_url);
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer.data(), size, back.data(), simdutf::base64_url);
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) {
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
+TEST(roundtrip_base64url_16) {
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data(), simdutf::base64_url);
+      buffer.resize(size);
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer16.data(), size, back.data(), simdutf::base64_url);
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) {
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
 TEST(doomed_base64_roundtrip) {
   for (size_t len = 0; len < 2048; len++) {
     std::vector<char> source(len, 0);

From fe1138fdf6502b9d1ae5b575bcfa44a2a9684df9 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 28 Mar 2024 20:39:11 -0400
Subject: [PATCH 17/49] documentation.

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index be0cff736..63cbc326b 100644
--- a/README.md
+++ b/README.md
@@ -56,8 +56,8 @@ This library provide fast Unicode functions such as
 - From an UTF-16LE/BE string, compute the size of the UTF-32 equivalent string (equivalent to UTF-16 character counting),
 - UTF-8 and UTF-16LE/BE character counting,
 - UTF-16 endianness change (UTF16-LE/BE to UTF-16-BE/LE),
-- [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary,
-- Binary to base64.
+- [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) (with or without URL encoding) to binary,
+- Binary to base64 (with or without URL encoding).
 
 The functions are accelerated using SIMD instructions (e.g., ARM NEON, SSE, AVX, AVX-512, RISC-V Vector Extension, etc.). When your strings contain hundreds of characters, we can often transcode them at speeds exceeding a billion characters per second. You should expect high speeds not only with English strings (ASCII) but also Chinese, Japanese, Arabic, and so forth. We handle the full character range (including, for example, emojis).
 
@@ -1568,7 +1568,7 @@ void change_endianness_utf16(const char16_t * input, size_t length, char16_t * o
 Base64
 -----
 
-We also support converting from [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary, and back. In particular, you can convert base64 inputs which contain ASCII spaces to binary.
+We also support converting from [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary, and back. In particular, you can convert base64 inputs which contain ASCII spaces to binary. We also support the base64 URL encoding alternative.
 
 Converting binary data to base64 always succeeds and is relatively simple:
 ```C++

From 5d1d0d59159adc0f2e6668fcac5b57bc6947356b Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 29 Mar 2024 22:58:55 -0400
Subject: [PATCH 18/49] prototype base64url

---
 scripts/base64/avx512.py           |  79 ++++++++++
 scripts/base64/sse.py              | 239 +++++++++++++++++++++++++++++
 src/haswell/avx2_base64.cpp        | 151 ++++++++++++------
 src/haswell/implementation.cpp     |   6 +-
 src/icelake/icelake_base64.inl.cpp |  73 ++++++---
 src/icelake/implementation.cpp     |   6 +-
 src/westmere/implementation.cpp    |  12 +-
 src/westmere/sse_base64.cpp        | 119 +++++++++-----
 tests/base64_tests.cpp             |  16 ++
 9 files changed, 588 insertions(+), 113 deletions(-)
 create mode 100644 scripts/base64/avx512.py
 create mode 100644 scripts/base64/sse.py

diff --git a/scripts/base64/avx512.py b/scripts/base64/avx512.py
new file mode 100644
index 000000000..b09265cc8
--- /dev/null
+++ b/scripts/base64/avx512.py
@@ -0,0 +1,79 @@
+lookup_0 = [0 for i in range(64)]
+lookup_1 = [0 for i in range(64)]
+for i in range(64):
+    lookup_0[i] = 0x80
+    lookup_1[i] = 0x80
+lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+for i in range(64):
+    val = ord(lookup[i])
+    bit6 = val & 0x40
+    bits05 = val & 0x3f
+    if bit6:
+        lookup_1[bits05] = i
+    else:
+        lookup_0[bits05] = i
+allowed = "\t\r\n "
+for z in allowed:
+    lookup_0[ord(z)] = 0xff
+def sign8(x):
+    if x >= 128:
+        return x - 256
+    return x
+lookup_0.reverse()
+lookup_1.reverse()
+print("lookup0:")
+print(", ".join([str(sign8(i)) for i in lookup_0]))
+print("lookup1:")
+print(", ".join([str(sign8(i)) for i in lookup_1]))
+lookup = [0 for i in range(64)]
+output = 0
+for ifrom in range(16):
+    lookup[ifrom*4 + 0] = output + 3
+    lookup[ifrom*4 + 1] = output + 2
+    lookup[ifrom*4 + 2] = output + 1
+    lookup[ifrom*4 + 3] = output + 0
+    output += 4
+lookup.reverse()
+print("reverse:")
+print(", ".join([str(i) for i in lookup]))
+
+print("====")
+
+lookup_0 = [0 for i in range(64)]
+lookup_1 = [0 for i in range(64)]
+for i in range(64):
+    lookup_0[i] = 0x80
+    lookup_1[i] = 0x80
+lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+for i in range(64):
+    val = ord(lookup[i])
+    bit6 = val & 0x40
+    bits05 = val & 0x3f
+    if bit6:
+        lookup_1[bits05] = i
+    else:
+        lookup_0[bits05] = i
+allowed = "\0\t\r\n "
+for z in allowed:
+    lookup_0[ord(z)] = 0xff
+def sign8(x):
+    if x >= 128:
+        return x - 256
+    return x
+lookup_0.reverse()
+lookup_1.reverse()
+print("lookup0:")
+print(", ".join([str(sign8(i)) for i in lookup_0]))
+print("lookup1:")
+print(", ".join([str(sign8(i)) for i in lookup_1]))
+lookup = [0 for i in range(64)]
+output = 0
+for ifrom in range(16):
+    lookup[ifrom*4 + 0] = output + 3
+    lookup[ifrom*4 + 1] = output + 2
+    lookup[ifrom*4 + 2] = output + 1
+    lookup[ifrom*4 + 3] = output + 0
+    output += 4
+lookup.reverse()
+print("reverse:")
+print(", ".join([str(i) for i in lookup]))
\ No newline at end of file
diff --git a/scripts/base64/sse.py b/scripts/base64/sse.py
new file mode 100644
index 000000000..d50cfd9f6
--- /dev/null
+++ b/scripts/base64/sse.py
@@ -0,0 +1,239 @@
+import sys
+delta_asso = [0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F]
+check_asso = [0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F]
+
+delta_values =[(0x00), (0x00), (0x00), (0x13), (0x04), (0xBF), (0xBF), (0xB9), (0xB9), (0x00), (0x10), (0xC3), (0xBF), (0xBF), (0xB9), (0xB9)]
+check_values = [(0x80), (0x80), (0x80), (0x80), (0xCF), (0xBF), (0xD5), (0xA6), (0xB5), (0x86), (0xD1), (0x80), (0xB1), (0x80), (0x91), (0x80)]
+
+
+valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+
+def safechr(i):
+    if i < 32:
+        return '.'
+    if i > 127:
+        return '?'
+    return chr(i)
+
+def safehex(x):
+    return "0x{0:2x}".format(x)
+
+def to_signed(x):
+    if(x >= 128):
+        return x - 256
+    return x
+
+def to_unsigned(x):
+    if(x < 0):
+        return x + 256
+    return x
+
+def sat(x, y):
+    x = to_signed(x)
+    y = to_signed(y)
+    z = x + y
+    if(z > 127):
+        return 127
+    if(z < -128):
+        return to_unsigned(-128)
+    return to_unsigned(z)
+
+def lookup(table, index):
+    print("looking up ", hex(index))
+    if(index >= 128):
+        return 0
+    return table[index&0xf]
+
+
+
+def quietlookup(table, index):
+    if(index >= 128):
+        return 0
+    return table[index&0xf]
+
+def process(src):
+    shifted = (src >> 3)%256
+    delta_hash = (lookup(delta_asso,src) + shifted + 1) >> 1
+    check_hash = (lookup(check_asso,src) + shifted + 1) >> 1
+    out = sat(lookup(delta_values,delta_hash), src)
+    chk = sat(lookup(check_values,check_hash), src)
+    mask = chk & 0x80
+    return (out, mask)
+
+def processquiet(src):
+    shifted = (src >> 3)%256
+    delta_hash = (quietlookup(delta_asso,src) + shifted + 1) >> 1
+    check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+    out = sat(quietlookup(delta_values,delta_hash), src)
+    chk = sat(quietlookup(check_values,check_hash), src)
+    mask = chk & 0x80
+    return (out, mask)
+
+def is_ok(i):
+    out, mask = processquiet(i)
+    if mask == 0:
+        return 1
+    return 0
+
+def computestring():
+    s = ""
+    for i in range(256):
+        out, mask = processquiet(i)
+        if(mask == 0):
+            s +=  safechr(i)
+    return s
+print(computestring() + " " + str(len(computestring())))
+
+def print_layout():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in range(16):
+        if check_hash in t:
+            off = quietlookup(check_values,check_hash)
+            print(hex(check_hash), hex(off), end="")
+            print("\t", " ".join(["   "+safechr(c) for c in t[check_hash]]))
+        else:
+            continue
+
+
+def is_valid():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        if check_hash in t:
+            array = t[check_hash]
+            i = 0
+            while i < len(array) and valid.find(chr(array[i])) == -1:
+                i += 1
+            while i < len(array) and valid.find(chr(array[i])) != -1:
+                i += 1
+            while i < len(array) and array[i] >= 128:
+                i += 1
+            if i < len(array):
+                return False
+        else:
+            continue
+    return True
+
+print_layout()
+print(is_valid())
+
+valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+print("----")
+def fun_adjust():
+    for zz in range(256):
+        check_asso[ord('-')&0xf] = zz
+        for yy in range(256):
+            check_asso[ord('_')&0xf] = yy
+            if(is_valid()):
+                print("----")
+                print_layout()
+                print(is_valid())
+                print("found")
+                return
+fun_adjust()
+            #sys.exit(0)
+
+def adjust(array, start, end, check_hash):
+    for j in range(256):
+        is_ok = True
+        for i in range(len(array)):
+            valid = (sat(j,array[i])&0x80 == 0) # sat(quietlookup(check_values,check_hash), src)
+            should_be_valid = (i>=start and i < end)
+            is_ok = is_ok and (valid == should_be_valid)
+        if(is_ok):
+            check_values[check_hash&0xf] = j
+            return
+    raise "unexpected"
+
+
+    chk = sat(quietlookup(check_values,check_hash), src)
+    mask = chk & 0x80
+
+def process():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        if check_hash in t:
+            array = t[check_hash]
+            i = 0
+            while i < len(array) and valid.find(chr(array[i])) == -1:
+                i += 1
+            if i < len(array) and valid.find(chr(array[i])) != -1:
+                start = i
+                while i < len(array) and valid.find(chr(array[i])) != -1:
+                    i += 1
+                end = i
+                adjust(array, start, end, check_hash)
+        else:
+            continue
+    return True
+print("process")
+process()
+print("string")
+print(computestring()+ " "+str(len(computestring())))
+
+for c in valid:
+    print(c,processquiet(ord(c)))
+
+def examine():
+    t={}
+    for i in valid:
+        src = ord(i)
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(delta_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        print(check_hash, t[check_hash])
+    return True
+examine()
+
+delta_values[10] += 1 
+
+delta_values[13] += 33 
+
+for c in valid:
+    print(c,processquiet(ord(c)))
+
+
+
+delta_asso = [0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F]
+check_asso = [0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F]
+
+delta_values =[(0x00), (0x00), (0x00), (0x13), (0x04), (0xBF), (0xBF), (0xB9), (0xB9), (0x00), (0x10), (0xC3), (0xBF), (0xBF), (0xB9), (0xB9)]
+check_values = [(0x80), (0x80), (0x80), (0x80), (0xCF), (0xBF), (0xD5), (0xA6), (0xB5), (0x86), (0xD1), (0x80), (0xB1), (0x80), (0x91), (0x80)]
+
+def casthex(v):
+    if(v >= 0x80):
+        return "uint8_t("+"0x{:X}".format(v)+")"
+    return "0x{:X}".format(v)
+def printme(c):
+    print(",".join([casthex(i) for i in c]))
+print("delta_asso")
+printme(delta_asso)
+print("check_asso")
+printme(check_asso)
+print("delta_values")
+printme(delta_values)
+print("check_values")
+printme(check_values)
\ No newline at end of file
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index d6886aa86..615535881 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -26,23 +26,35 @@
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
 
-__m256i lookup_pshufb_improved(const __m256i input) {
+template <bool base64_url>
+simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
   // credit: Wojciech Muła
   __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
   const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
   result =
       _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
-  const __m256i shift_LUT = _mm256_setr_epi8(
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
-
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  __m256i shift_LUT;
+  if (base64_url) {
+    shift_LUT = _mm256_setr_epi8(
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
+
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+  } else {
+    shift_LUT = _mm256_setr_epi8(
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
+
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  }
 
   result = _mm256_shuffle_epi8(shift_LUT, result);
   return _mm256_add_epi8(result, input);
 }
 
+template <base64_options options>
 size_t encode_base64(char *dst, const char *src, size_t srclen) {
   // credit: Wojciech Muła
   const uint8_t *input = (const uint8_t *)src;
@@ -110,18 +122,18 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input0));
+                        lookup_pshufb_improved<options == base64_url>(input0));
     out += 32;
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input1));
+                        lookup_pshufb_improved<options == base64_url>(input1));
     out += 32;
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input2));
+                        lookup_pshufb_improved<options == base64_url>(input2));
     out += 32;
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input3));
+                        lookup_pshufb_improved<options == base64_url>(input3));
     out += 32;
   }
   for (; i + 28 <= srclen; i += 24) {
@@ -145,11 +157,11 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m256i indices = _mm256_or_si256(t1, t3);
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(indices));
+                        lookup_pshufb_improved<options == base64_url>(indices));
     out += 32;
   }
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 
 static inline void compress(__m128i data, uint16_t mask, char *output) {
@@ -200,36 +212,79 @@ struct block64 {
   __m256i chunks[2];
 };
 
+template <bool base64_url>
 static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   const __m256i ascii_space_tbl =
       _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
                        0x0, 0x0, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
                        0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0x0, 0xd, 0x0, 0x0);
   // credit: aqrit
-  const __m256i delta_asso = _mm256_setr_epi8(
-      0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
-  const __m256i delta_values = _mm256_setr_epi8(
-      int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
-      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
-      int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
-      int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
-      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
-      int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
-      int8_t(0xB9), int8_t(0xB9));
-  const __m256i check_asso = _mm256_setr_epi8(
-      0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
-      0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-      0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
-  const __m256i check_values = _mm256_setr_epi8(
-      int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
-      int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
-      int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
-      int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
-      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
-      int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
-      int8_t(0x91), int8_t(0x80));
+  __m256i delta_asso;
+  if (base64_url) {
+    delta_asso =
+        _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
+                         0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+  } else {
+    delta_asso = _mm256_setr_epi8(
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  }
+
+  __m256i delta_values;
+  if (base64_url) {
+    delta_values = _mm256_setr_epi8(
+        0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
+        uint8_t(0xB9), 0x0, 0x10, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xBF),
+        uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x10, uint8_t(0xC3),
+        uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9));
+  } else {
+    delta_values = _mm256_setr_epi8(
+        int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
+        int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
+        int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+        int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+        int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
+        int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
+        int8_t(0xB9), int8_t(0xB9));
+  }
+  __m256i check_asso;
+
+  if (base64_url) {
+    check_asso =
+        _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
+                         0x7, 0xB, 0xB, 0xB, 0xF, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xB, 0xB, 0xF);
+  } else {
+
+    check_asso = _mm256_setr_epi8(
+        0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+        0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  }
+  __m256i check_values;
+  if (base64_url) {
+    check_values = _mm256_setr_epi8(
+        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
+        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
+        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80),
+        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
+        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
+        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80));
+  } else {
+    check_values = _mm256_setr_epi8(
+        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
+        int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
+        int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
+        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+        int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
+        int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
+        int8_t(0x91), int8_t(0x80));
+  }
   const __m256i shifted = _mm256_srli_epi32(*src, 3);
 
   const __m256i delta_hash =
@@ -250,10 +305,12 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   *src = out;
   return (uint32_t)mask;
 }
+
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   *error = 0;
-  uint64_t m0 = to_base64_mask(&b->chunks[0], error);
-  uint64_t m1 = to_base64_mask(&b->chunks[1], error);
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
   return m0 | (m1 << 32);
 }
 
@@ -329,8 +386,10 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
 }
 
 template <bool base64_url, typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -358,11 +417,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -496,7 +554,8 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 4d3f1951e..f11325864 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -799,7 +799,11 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
 }
 
 size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return encode_base64(output, input, length, options);
+  if(options & base64_url) {
+    return encode_base64<base64_url>(output, input, length);
+  } else {
+    return encode_base64<base64_default>(output, input, length);
+  }
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 8b1882ca8..a6e3908fa 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -31,14 +31,17 @@ struct block64 {
   __m512i chunks[1];
 };
 
-size_t encode_base64(char *dst, const char *src, size_t srclen) {
+template <bool base64_url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
   // credit: Wojciech Muła
-
   const uint8_t *input = (const uint8_t *)src;
 
   uint8_t *out = (uint8_t *)dst;
   static const char *lookup_tbl =
-      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+      base64_url
+          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
   const __m512i shuffle_input = _mm512_setr_epi32(
       0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
@@ -57,27 +60,48 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
     out += 64;
   }
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   __m512i input = b->chunks[0];
   const __m512i ascii_space_tbl = _mm512_set_epi8(
       0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 0, 0, 10, 9,
       0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0,
       32, 0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
-  __m512i lookup0 = _mm512_set_epi8(
-      -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
-      52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
-      -128, -128, -128, -64, -128, -128, -128, -128, -128, -128, -128, -128,
-      -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -64, -128,
-      -128, -64, -64, -128, -128, -128, -128, -128, -128, -128, -128, -64);
-  __m512i lookup1 = _mm512_set_epi8(
-      -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41,
-      40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, -128,
-      -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14,
-      13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  __m512i lookup0;
+  if (base64_url) {
+    lookup0 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
+        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
+  } else {
+    lookup0 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
+        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
+  }
+  __m512i lookup1;
+  if (base64_url) {
+    lookup1 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  } else {
+    lookup1 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  }
+
   const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
   const __m512i combined = _mm512_or_si512(translated, input);
   const __mmask64 mask = _mm512_movepi8_mask(combined);
@@ -110,7 +134,8 @@ static inline void load_block(block64 *b, const char16_t *src) {
   __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
   __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
   __m512i p = _mm512_packus_epi16(m1, m2);
-  b->chunks[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
+  b->chunks[0] =
+      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
 }
 
 static inline void base64_decode(char *out, __m512i str) {
@@ -138,8 +163,10 @@ static inline void base64_decode_block(char *out, block64 *b) {
 }
 
 template <bool base64_url, typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -164,11 +191,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -287,7 +313,8 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index 8aa9cf886..356159808 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1386,7 +1386,11 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
 }
 
 size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return encode_base64(output, input, length, options);
+  if(options & base64_url) {
+    return encode_base64<true>(output, input, length, options);
+  } else {
+    return encode_base64<false>(output, input, length, options);
+  }
 }
 
 } // namespace SIMDUTF_IMPLEMENTATION
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index 14565397b..e95e5f331 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -787,8 +787,8 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length, base64_options options) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length, options);
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
@@ -799,8 +799,12 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  if(options == base64_url) {
+    return encode_base64<base64_url>(output, input, length);
+  } else {
+    return encode_base64<base64_default>(output, input, length);
+  }
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index 2fc986acc..6966fc864 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -25,8 +25,7 @@
  * Nick Kopp. 2013. Base64 Encoding on a GPU.
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
-
-__m128i lookup_pshufb_improved(const __m128i input) {
+template <bool base64_url> __m128i lookup_pshufb_improved(const __m128i input) {
   // credit: Wojciech Muła
   // reduce  0..51 -> 0
   //        52..61 -> 1 .. 10
@@ -40,9 +39,16 @@ __m128i lookup_pshufb_improved(const __m128i input) {
   const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input);
   result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));
 
-  const __m128i shift_LUT = _mm_setr_epi8(
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  __m128i shift_LUT;
+  if (base64_url) {
+    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+  } else {
+    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  }
 
   // read shift
   result = _mm_shuffle_epi8(shift_LUT, result);
@@ -50,6 +56,7 @@ __m128i lookup_pshufb_improved(const __m128i input) {
   return _mm_add_epi8(result, input);
 }
 
+template <base64_options options>
 size_t encode_base64(char *dst, const char *src, size_t srclen) {
   // credit: Wojciech Muła
   // SSE (lookup: pshufb improved unrolled)
@@ -101,19 +108,19 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m128i input3 = _mm_or_si128(t1_3, t3_3);
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input0));
+                     lookup_pshufb_improved<options & base64_url>(input0));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input1));
+                     lookup_pshufb_improved<options & base64_url>(input1));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input2));
+                     lookup_pshufb_improved<options & base64_url>(input2));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input3));
+                     lookup_pshufb_improved<options & base64_url>(input3));
     out += 16;
   }
   for (; i + 16 <= srclen; i += 12) {
@@ -153,12 +160,12 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m128i indices = _mm_or_si128(t1, t3);
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(indices));
+                     lookup_pshufb_improved<options & base64_url>(indices));
     out += 16;
   }
 
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 static inline void compress(__m128i data, uint16_t mask, char *output) {
   if (mask == 0) {
@@ -198,27 +205,59 @@ struct block64 {
   __m128i chunks[4];
 };
 
+template <bool base64_url>
 static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   const __m128i ascii_space_tbl =
       _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
                     0x0, 0xd, 0x0, 0x0);
   // credit: aqrit
-  const __m128i delta_asso =
-      _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
-                    0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
-  const __m128i delta_values =
-      _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
-                    int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
-                    int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
-                    int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
-  const __m128i check_asso =
-      _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                    0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
-  const __m128i check_values =
-      _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
-                    int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
-                    int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
-                    int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
+  __m128i delta_asso;
+  if (base64_url) {
+    delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
+                               0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+  } else {
+
+    delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                               0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  }
+  __m128i delta_values;
+  if (base64_url) {
+    delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
+                                 0x0, 0x10, uint8_t(0xC3), uint8_t(0xBF),
+                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9));
+  } else {
+
+    delta_values =
+        _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+                      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+                      int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+                      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
+  }
+  __m128i check_asso;
+  if (base64_url) {
+    check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                               0x3, 0x7, 0xB, 0xB, 0xB, 0xF);
+  } else {
+
+    check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                               0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  }
+  __m128i check_values;
+  if (base64_url) {
+    check_values = _mm_setr_epi8(
+        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
+        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
+        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80));
+  } else {
+
+    check_values =
+        _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+                      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+                      int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+                      int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
+  }
   const __m128i shifted = _mm_srli_epi32(*src, 3);
 
   const __m128i delta_hash =
@@ -239,12 +278,14 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   *src = out;
   return (uint16_t)mask;
 }
+
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   *error = 0;
-  uint64_t m0 = to_base64_mask(&b->chunks[0], error);
-  uint64_t m1 = to_base64_mask(&b->chunks[1], error);
-  uint64_t m2 = to_base64_mask(&b->chunks[2], error);
-  uint64_t m3 = to_base64_mask(&b->chunks[3], error);
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
+  uint64_t m2 = to_base64_mask<base64_url>(&b->chunks[2], error);
+  uint64_t m3 = to_base64_mask<base64_url>(&b->chunks[3], error);
   return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
 }
 
@@ -339,8 +380,10 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
 }
 
 template <bool base64_url, typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -368,11 +411,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -507,7 +549,8 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index 6465811a6..19c0ef947 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -8,6 +8,9 @@
 #include <tests/helpers/test.h>
 #include <tests/helpers/transcode_test_base.h>
 
+// We may disable base64url tests by commenting out this next line.
+#define SIMDUTF_BASE64URL_TESTS 1
+
 using random_generator = std::mt19937;
 static random_generator::result_type seed = 42;
 
@@ -153,6 +156,7 @@ TEST(encode_base64_cases) {
   }
 }
 
+#if SIMDUTF_BASE64URL_TESTS
 
 TEST(encode_base64url_cases) {
   std::vector<std::pair<std::string, std::string>> cases = {
@@ -171,6 +175,11 @@ TEST(encode_base64url_cases) {
     size_t s = implementation.binary_to_base64(p.first.data(), p.first.size(),
                                                buffer.data(), simdutf::base64_url);
     ASSERT_EQUAL(s, p.second.size());
+    if(std::string(buffer.data(), buffer.size()) != p.second) {
+      printf("difference:\n");
+      printf(" %.*s\n", (int)s, buffer.data());
+      printf(" %.*s\n", (int)s, p.second.data());
+    }
     ASSERT_TRUE(std::string(buffer.data(), buffer.size()) == p.second);
   }
   printf(" -- ");
@@ -203,6 +212,8 @@ TEST(encode_base64url_cases) {
   }
 }
 
+#endif
+
 TEST(encode_base64_cases_16) {
   std::vector<std::pair<std::string, std::u16string>> cases = {
       {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
@@ -243,6 +254,7 @@ TEST(encode_base64_cases_16) {
   }
 }
 
+#if SIMDUTF_BASE64URL_TESTS
 
 TEST(encode_base64url_cases_16) {
   std::vector<std::pair<std::string, std::u16string>> cases = {
@@ -284,6 +296,8 @@ TEST(encode_base64url_cases_16) {
   }
 }
 
+#endif
+
 TEST(roundtrip_base64) {
   for (size_t len = 0; len < 2048; len++) {
     std::vector<char> source(len, 0);
@@ -368,6 +382,7 @@ TEST(roundtrip_base64_16) {
 }
 
 
+#if SIMDUTF_BASE64URL_TESTS
 
 TEST(roundtrip_base64url) {
   for (size_t len = 0; len < 2048; len++) {
@@ -451,6 +466,7 @@ TEST(roundtrip_base64url_16) {
     }
   }
 }
+#endif
 
 TEST(doomed_base64_roundtrip) {
   for (size_t len = 0; len < 2048; len++) {

From 21717c466d91f9b361311264dc87de74d9f24eb7 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 29 Mar 2024 23:37:23 -0400
Subject: [PATCH 19/49] solved based64url

---
 scripts/base64/sse.py       | 27 ++++++++++++++++++++-------
 src/haswell/avx2_base64.cpp | 28 +++++++++++++---------------
 src/westmere/sse_base64.cpp | 16 ++++++++--------
 3 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/scripts/base64/sse.py b/scripts/base64/sse.py
index d50cfd9f6..25369c754 100644
--- a/scripts/base64/sse.py
+++ b/scripts/base64/sse.py
@@ -217,12 +217,6 @@ def examine():
 
 
 
-delta_asso = [0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F]
-check_asso = [0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F]
-
-delta_values =[(0x00), (0x00), (0x00), (0x13), (0x04), (0xBF), (0xBF), (0xB9), (0xB9), (0x00), (0x10), (0xC3), (0xBF), (0xBF), (0xB9), (0xB9)]
-check_values = [(0x80), (0x80), (0x80), (0x80), (0xCF), (0xBF), (0xD5), (0xA6), (0xB5), (0x86), (0xD1), (0x80), (0xB1), (0x80), (0x91), (0x80)]
-
 def casthex(v):
     if(v >= 0x80):
         return "uint8_t("+"0x{:X}".format(v)+")"
@@ -236,4 +230,23 @@ def printme(c):
 print("delta_values")
 printme(delta_values)
 print("check_values")
-printme(check_values)
\ No newline at end of file
+printme(check_values)
+
+def processverbose(src):
+    print("processing ", hex(src))
+    shifted = (src >> 3)%256
+    print("shifted ", hex(shifted))
+    delta_hash = (lookup(delta_asso,src) + shifted + 1) >> 1
+    print("delta_hash ", hex(delta_hash))
+    check_hash = (lookup(check_asso,src) + shifted + 1) >> 1
+    print("check_hash ", hex(check_hash))
+    out = sat(lookup(delta_values,delta_hash), src)
+    print("out ", hex(out))
+    chk = sat(lookup(check_values,check_hash), src)
+    print("chk ", hex(chk))
+
+    mask = chk & 0x80
+    return (out, mask)
+processverbose(ord('-'))
+
+print(computestring()+ " "+str(len(computestring())))
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 615535881..4205f8b86 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -236,10 +236,10 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   if (base64_url) {
     delta_values = _mm256_setr_epi8(
         0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
-        uint8_t(0xB9), 0x0, 0x10, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xBF),
+        uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
         uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
-        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x10, uint8_t(0xC3),
-        uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9));
+        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
+        uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
   } else {
     delta_values = _mm256_setr_epi8(
         int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
@@ -255,8 +255,8 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   if (base64_url) {
     check_asso =
         _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
-                         0x7, 0xB, 0xB, 0xB, 0xF, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
-                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xB, 0xB, 0xF);
+                         0x7, 0xB, 0x6, 0xB, 0x12, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0x6, 0xB, 0x12);
   } else {
 
     check_asso = _mm256_setr_epi8(
@@ -267,14 +267,13 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   __m256i check_values;
   if (base64_url) {
     check_values = _mm256_setr_epi8(
-        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
-        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
-        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80),
-        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
-        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
-        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80));
+        0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0xCF),
+        uint8_t(0xBF), uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5),
+        uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80), uint8_t(0xB0),
+        uint8_t(0x80), 0x0, 0x0, 0x0, uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD3),
+        uint8_t(0xA6), uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD0),
+        uint8_t(0x80), uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0);
   } else {
     check_values = _mm256_setr_epi8(
         int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
@@ -286,12 +285,10 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
         int8_t(0x91), int8_t(0x80));
   }
   const __m256i shifted = _mm256_srli_epi32(*src, 3);
-
   const __m256i delta_hash =
       _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
   const __m256i check_hash =
       _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
-
   const __m256i out =
       _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
   const __m256i chk =
@@ -420,6 +417,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
+        printf("ERROOOROOROROR\n");
         while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index 6966fc864..c6a8e4dbc 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -224,8 +224,8 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   if (base64_url) {
     delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
                                  uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
-                                 0x0, 0x10, uint8_t(0xC3), uint8_t(0xBF),
-                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9));
+                                 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF),
+                                 uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
   } else {
 
     delta_values =
@@ -237,7 +237,7 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   __m128i check_asso;
   if (base64_url) {
     check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
-                               0x3, 0x7, 0xB, 0xB, 0xB, 0xF);
+                               0x3, 0x7, 0xB, 0x6, 0xB, 0x12);
   } else {
 
     check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
@@ -245,11 +245,11 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   }
   __m128i check_values;
   if (base64_url) {
-    check_values = _mm_setr_epi8(
-        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
-        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
-        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80));
+    check_values = _mm_setr_epi8(0x0, uint8_t(0x80), uint8_t(0x80),
+                                 uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
+                                 uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5),
+                                 uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80),
+                                 uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0);
   } else {
 
     check_values =

From c96ac907ab282c045b8ef3a8e75aeddd094b26fd Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 21 Mar 2024 20:16:11 -0400
Subject: [PATCH 20/49] completing the base64 implementation.

---
 README.md                             |  51 ++++++++++-
 include/simdutf/error.h               |   1 +
 include/simdutf/implementation.h      | 120 ++++++++++++++++++++++++--
 src/arm64/arm_base64.cpp              |  16 +++-
 src/arm64/implementation.cpp          |   8 ++
 src/fallback/implementation.cpp       |  18 ++++
 src/haswell/implementation.cpp        |   8 ++
 src/icelake/implementation.cpp        |   9 ++
 src/implementation.cpp                |  62 +++++++++++++
 src/ppc64/implementation.cpp          |  17 ++++
 src/rvv/implementation.cpp            |  18 ++++
 src/scalar/base64.h                   | 119 +++++++++++++++++++++++--
 src/simdutf.cpp                       |   3 +-
 src/simdutf/arm64/implementation.h    |   2 +
 src/simdutf/fallback/implementation.h |   2 +
 src/simdutf/haswell/implementation.h  |   2 +
 src/simdutf/icelake/implementation.h  |   2 +
 src/simdutf/ppc64/implementation.h    |   2 +
 src/simdutf/rvv/implementation.h      |   2 +
 src/simdutf/westmere/implementation.h |   2 +
 src/westmere/implementation.cpp       |   8 ++
 21 files changed, 451 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index d18baab52..147842460 100644
--- a/README.md
+++ b/README.md
@@ -1583,7 +1583,9 @@ we prune spaces, we may need to adjust the result size afterword.
 std::vector<char> buffer(simdutf::maximal_binary_length_from_base64(base64.data(), base64.size()));
 simdutf::result r = simdutf::base64_to_binary(base64.data(), base64.size(), buffer.data());
 if(r.error) {
-  // We have some error, r.count tells you where the error was encountered in the input
+  // We have some error, r.count tells you where the error was encountered in the input if
+  // the error is INVALID_BASE64_CHARACTER. If the error is BASE64_INPUT_REMAINDER, then
+  // a single valid base64 remained, and r.count contains the number of bytes decoded.
 } else {
   buffer.resize(r.count); // resize the buffer according to actual number of bytes
 }
@@ -1604,10 +1606,21 @@ The specification of our base64 functions is as follows:
  *
  * @param input         the base64 input to process
  * @param length        the length of the base64 input in bytes
- * @return number of base64 bytes
+ * @return maximal number of binary bytes
  */
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
 
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input         the base64 input to process in UTF-16 (native endianess)
+ * @param length        the length of the base64 input in 16-bit units
+ * @return maximal number of binary bytes
+ */
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept;
+
 /**
  * Convert a base64 input to a binary ouput.
  *
@@ -1618,10 +1631,14 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * See https://infra.spec.whatwg.org/#forgiving-base64-decode
  *
  * This function will fail in case of invalid input. There are two possible reasons for
- * failure: the input is contains a number of base64 characters that when divided by 4, leaves
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
  * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
  * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
  * If you fail to provide that much space, the function may cause a buffer overflow.
  *
@@ -1653,6 +1670,34 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  */
 size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
 
+/**
+ * Convert a base64 input to a binary ouput.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are two possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
+ * If you fail to provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input         the base64 string to process in UTF-16 (native endianess)
+ * @param length        the length of the string in 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result utf16_base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+
 ```
 
 
diff --git a/include/simdutf/error.h b/include/simdutf/error.h
index 0090ff1d6..a65303ce3 100644
--- a/include/simdutf/error.h
+++ b/include/simdutf/error.h
@@ -16,6 +16,7 @@ enum error_code {
                 // there must be no surrogate at all (Latin1)
   INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid base64 string.
   BASE64_INPUT_REMAINDER, // The base64 input terminates with a single character, excluding padding (=).
+  OUTPUT_BUFFER_TOO_SMALL, // The provided buffer is too small.
   OTHER         // Not related to validation/transcoding.
 };
 
diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index 27cb6027b..aafa9ff97 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1380,7 +1380,6 @@ simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t le
  */
 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length);
 
-
 /**
  * Provide the maximal binary length in bytes given the base64 input.
  * In general, if the input contains ASCII spaces, the result will be less than
@@ -1388,10 +1387,21 @@ simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t leng
  *
  * @param input         the base64 input to process
  * @param length        the length of the base64 input in bytes
- * @return number of base64 bytes
+ * @return maximal number of binary bytes
  */
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
 
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input         the base64 input to process, in ASCII stored as 16-bit units
+ * @param length        the length of the base64 input in 16-bit units
+ * @return maximal number of binary bytes
+ */
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept;
+
 /**
  * Convert a base64 input to a binary ouput.
  *
@@ -1402,10 +1412,14 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * See https://infra.spec.whatwg.org/#forgiving-base64-decode
  *
  * This function will fail in case of invalid input. There are two possible reasons for
- * failure: the input is contains a number of base64 characters that when divided by 4, leaves
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
  * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
  * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
  * If you fail to provide that much space, the function may cause a buffer overflow.
  *
@@ -1437,6 +1451,67 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  */
 size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
 
+/**
+ * Convert a base64 input to a binary ouput.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are two possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
+ * If you fail to provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 16-bit units
+ * @param length        the length of the string in 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+
+/**
+ * Convert a base64 input to a binary ouput.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are three possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a singler remainder character (BASE64_INPUT_REMAINDER), the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * When the error is OUTPUT_BUFFER_TOO_SMALL, then r.count contains the location in the input
+ * where we stopped decoding.
+ *
+ * In all case, the outlen parameter is modified to contain the number of bytes
+ * that have been written/decoded.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how mnay bytes were written.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
+
 /**
  * An implementation of simdutf for a particular CPU architecture.
  *
@@ -2504,10 +2579,21 @@ class implementation {
    *
    * @param input         the base64 input to process
    * @param length        the length of the base64 input in bytes
-   * @return number of base64 bytes
+   * @return maximal number of binary bytes
    */
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept = 0;
 
+  /**
+   * Provide the maximal binary length in bytes given the base64 input.
+   * In general, if the input contains ASCII spaces, the result will be less than
+   * the maximum length.
+   *
+   * @param input         the base64 input to process, in ASCII stored as 16-bit units
+   * @param length        the length of the base64 input in 16-bit units
+   * @return maximal number of binary bytes
+   */
+  simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept = 0;
+
   /**
    * Convert a base64 input to a binary ouput.
    *
@@ -2518,7 +2604,7 @@ class implementation {
    * See https://infra.spec.whatwg.org/#forgiving-base64-decode
    *
    * This function will fail in case of invalid input. There are two possible reasons for
-   * failure: the input is contains a number of base64 characters that when divided by 4, leaves
+   * failure: the input contains a number of base64 characters that when divided by 4, leaves
    * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
    * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
    *
@@ -2532,6 +2618,30 @@ class implementation {
    */
   simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept = 0;
 
+  /**
+   * Convert a base64 input to a binary ouput.
+   *
+   * This function follows the WHATWG forgiving-base64 format, which means that it will
+   * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+   * equal signs at the end) or an unpadded input (without any equal signs at the end).
+   *
+   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+   *
+   * This function will fail in case of invalid input. There are two possible reasons for
+   * failure: the input contains a number of base64 characters that when divided by 4, leaves
+   * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+   * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+   *
+   * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
+   * If you fail to provide that much space, the function may cause a buffer overflow.
+   *
+   * @param input         the base64 string to process, in ASCII stored as 16-bit units
+   * @param length        the length of the string in 16-bit units
+   * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+   */
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept = 0;
+
   /**
    * Provide the base64 length in bytes given the length of a binary input.
    *
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 2113a2cec..565b83746 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -210,6 +210,13 @@ void load_block(block64 *b, const char *src) {
   b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
 }
 
+void load_block(block64 *b, const char16_t *src) {
+  b->chunks[0] = vld2q_u8(reinterpret_cast<const uint8_t *>(src)).val[0];
+  b->chunks[1] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 16).val[0];
+  b->chunks[2] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 32).val[0];
+  b->chunks[3] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 48).val[0];
+}
+
 // decode 64 bytes and output 48 bytes
 void base64_decode_block(char *out, const char *src) {
   uint8x16x4_t str = vld4q_u8((uint8_t *)src);
@@ -222,7 +229,8 @@ void base64_decode_block(char *out, const char *src) {
   vst3q_u8((uint8_t *)out, outvec);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <typename char_type>
+result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -232,15 +240,15 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
       equalsigns = 2;
     }
   }
-  const char *const srcinit = src;
+  const char_type *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const char_type *const srcend = src + srclen;
 
   constexpr size_t block_size = 10;
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const char_type *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
diff --git a/src/arm64/implementation.cpp b/src/arm64/implementation.cpp
index ff02797f8..f8d6a566a 100644
--- a/src/arm64/implementation.cpp
+++ b/src/arm64/implementation.cpp
@@ -843,6 +843,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return compress_decode_base64(output, input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/fallback/implementation.cpp b/src/fallback/implementation.cpp
index 8bf24a1fc..c469dbbef 100644
--- a/src/fallback/implementation.cpp
+++ b/src/fallback/implementation.cpp
@@ -362,6 +362,24 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return scalar::base64::base64_tail_decode(output, input, length);
 }
 
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 78d00a6ab..733f83b62 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -786,6 +786,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return compress_decode_base64(output, input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index 035c77a50..dae4f0dfd 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1372,6 +1372,15 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return compress_decode_base64(output, input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/implementation.cpp b/src/implementation.cpp
index bd76c4075..253cf52d9 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -31,6 +31,8 @@ std::string toBinaryString(T b) {
 
 #include "scalar/utf8.h"
 #include "scalar/utf16.h"
+#include "scalar/utf32.h"
+#include "scalar/base64.h"
 
 namespace simdutf {
 bool implementation::supported_by_runtime_system() const {
@@ -460,6 +462,14 @@ class detect_best_supported_implementation_on_first_use final : public implement
     return set_best()->base64_to_binary(input, length, output);
   }
 
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept override {
+    return set_best()->maximal_binary_length_from_base64(input, length);
+  }
+
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output);
+  }
+
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept override {
     return set_best()->base64_length_from_binary(length);
   }
@@ -816,6 +826,15 @@ class unsupported_implementation final : public implementation {
     return result(error_code::OTHER, 0);
   }
 
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused result base64_to_binary(const char16_t *, size_t, char*) const noexcept override {
+    return result(error_code::OTHER, 0);
+  }
+
+
   simdutf_warn_unused size_t base64_length_from_binary(size_t) const noexcept override {
     return 0;
   }
@@ -1274,6 +1293,49 @@ simdutf_warn_unused result base64_to_binary(const char * input, size_t length, c
   return get_default_implementation()->base64_to_binary(input, length, output);
 }
 
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept {
+  return get_default_implementation()->maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output);
+}
+
+template <typename chartype>
+simdutf_warn_unused result base64_to_binary_safe(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
+  static_assert(std::is_same_v<chartype, char> || std::is_same_v<chartype, char16_t>, "Only char and char16_t are supported.");
+  // The implementation could be nicer, but we expect that most times, the user
+  // will provide us with a buffer that is large enough.
+  size_t max_length = maximal_binary_length_from_base64(input, length);
+  if(outlen >= max_length) {
+    return base64_to_binary(input, length, output);
+  }
+  // The output buffer is maybe too small. We will decode a truncated version of the input.
+  size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
+  size_t safe_input = base64_length_from_binary(outlen3);
+  result r = base64_to_binary(input, safe_input, output);
+  if(r.error == error_code::INVALID_BASE64_CHARACTER) { return r; }
+  size_t offset = (r.error == error_code::BASE64_INPUT_REMAINDER) ? 1 :
+    ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1);
+  size_t output_index = r.count - (r.count % 3);
+  size_t input_index = safe_input;
+  while(offset > 0) {
+    char c = input[--input_index];
+    if(c == '=' || c == '\n' || c == '\r' || c == '\t' || c == ' ') {
+      offset--;
+    }
+  }
+  size_t remaining_out = outlen - output_index;
+  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, input + input_index, length - input_index);
+  outlen = output_index + remaining_out;
+  if(r.error == error_code::INVALID_BASE64_CHARACTER) {
+    r.count += input_index;
+  } else {
+    r.count = output_index;
+  }
+  return r;
+}
+
 simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
   return get_default_implementation()->base64_length_from_binary(length);
 }
diff --git a/src/ppc64/implementation.cpp b/src/ppc64/implementation.cpp
index 8390e01a3..161ae19d9 100644
--- a/src/ppc64/implementation.cpp
+++ b/src/ppc64/implementation.cpp
@@ -299,6 +299,23 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
   return scalar::base64::base64_to_binary(input, length, output);
 }
 
diff --git a/src/rvv/implementation.cpp b/src/rvv/implementation.cpp
index 63f1283c1..7dda20c8a 100644
--- a/src/rvv/implementation.cpp
+++ b/src/rvv/implementation.cpp
@@ -95,6 +95,24 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return scalar::base64::base64_tail_decode(output, input, length);
 }
 
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index ff7368314..ec2002618 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -9,12 +9,12 @@ namespace scalar {
 namespace {
 namespace base64 {
 
-// Returns true upon success. The destination buffer must be large enough and is
-// incremented by the number of bytes written and src is incremented by the number of bytes read.
+// Returns true upon success. The destination buffer must be large enough.
 // This functions assumes that the padding (=) has been removed.
-result base64_tail_decode(char *dst, const char *src, size_t length) {
-  const char *srcend = src + length;
-  const char *srcinit = src;
+template <class char_type>
+result base64_tail_decode(char *dst, const char_type *src, size_t length) {
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
   const char *dstinit = dst;
 
   uint32_t x;
@@ -34,7 +34,7 @@ result base64_tail_decode(char *dst, const char *src, size_t length) {
     idx = 0;
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
-      char c = *src;
+      char_type c = *src;
       uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
@@ -92,6 +92,108 @@ result base64_tail_decode(char *dst, const char *src, size_t length) {
   }
 }
 
+// like base64_tail_decode, but it will not write past the end of the ouput buffer.
+// outlen is modified to reflect the number of bytes written.
+template <class char_type>
+result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length) {
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
+  const char *dstinit = dst;
+  const char *dstend = dst + outlen;
+
+  uint32_t x;
+  size_t idx;
+  uint8_t buffer[4];
+  while (true) {
+    while (src + 4 <= srcend &&
+           (x = tables::base64::d0[uint8_t(src[0])] | tables::base64::d1[uint8_t(src[1])] |
+                tables::base64::d2[uint8_t(src[2])] | tables::base64::d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+      if(match_system(endianness::BIG)) {
+        x = scalar::utf32::swap_bytes(x);
+      }
+      if(dst + 3 > dstend) {
+        outlen = size_t(dst - dstinit);
+        return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+      }
+      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
+      dst += 3;
+      src += 4;
+    }
+    idx = 0;
+    // we need at least four characters.
+    while (idx < 4 && src < srcend) {
+      char c = *src;
+      uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
+      buffer[idx] = uint8_t(code);
+      if (code <= 63) {
+        idx++;
+      } else if (code > 64) {
+        outlen = size_t(dst - dstinit);
+        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      }
+      src++;
+    }
+    if (idx != 4) {
+      if (idx == 2) {
+        if(dst == dstend) {
+          outlen = size_t(dst - dstinit);
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+        }
+        uint32_t triple =
+            (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
+        if(match_system(endianness::BIG)) {
+          triple <<= 8;
+          std::memcpy(dst, &triple, 1);
+        } else {
+          triple = scalar::utf32::swap_bytes(triple);
+          triple >>= 8;
+          std::memcpy(dst, &triple, 1);
+        }
+        dst += 1;
+
+      } else if (idx == 3) {
+        if(dst + 2 >= dstend) {
+          outlen = size_t(dst - dstinit);
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+        }
+        uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
+                          (uint32_t(buffer[1]) << 2 * 6) +
+                          (uint32_t(buffer[2]) << 1 * 6);
+        if(match_system(endianness::BIG)) {
+          triple <<= 8;
+          std::memcpy(dst, &triple, 2);
+        } else {
+          triple = scalar::utf32::swap_bytes(triple);
+          triple >>= 8;
+          std::memcpy(dst, &triple, 2);
+        }
+        dst += 2;
+      } else if (idx == 1) {
+        outlen = size_t(dst - dstinit);
+        return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)};
+      }
+      outlen = size_t(dst - dstinit);
+      return {SUCCESS, size_t(dst - dstinit)};
+    }
+    if(dst + 3 >= dstend) {
+      outlen = size_t(dst - dstinit);
+      return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+    }
+    uint32_t triple =
+        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
+        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
+    if(match_system(endianness::BIG)) {
+      triple <<= 8;
+      std::memcpy(dst, &triple, 3);
+    } else {
+      triple = scalar::utf32::swap_bytes(triple);
+      triple >>= 8;
+      std::memcpy(dst, &triple, 3);
+    }
+    dst += 3;
+  }
+}
+
 // Returns the number of bytes written. The destination buffer must be large
 // enough. It will add padding (=) if needed.
 size_t tail_encode_base64(char *dst, const char *src, size_t srclen) {
@@ -128,7 +230,8 @@ size_t tail_encode_base64(char *dst, const char *src, size_t srclen) {
   return (size_t)(out - dst);
 }
 
-simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept {
+template <class char_type>
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char_type * input, size_t length) noexcept {
   // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode
   size_t padding = 0;
   if(length > 0) {
@@ -140,7 +243,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
     }
   }
   size_t actual_length = length - padding;
-  if(actual_length % 4 == 0) {
+  if(actual_length % 4 <= 1) {
     return actual_length / 4 * 3;
   }
   // if we have a valid input, then the remainder must be 2 or 3 adding one or two extra bytes.
diff --git a/src/simdutf.cpp b/src/simdutf.cpp
index 26ca712dd..fa889290c 100644
--- a/src/simdutf.cpp
+++ b/src/simdutf.cpp
@@ -1,10 +1,11 @@
 #include "simdutf.h"
+// We include base64_tables once.
+#include "tables/base64_tables.h"
 #include "implementation.cpp"
 #include "encoding_types.cpp"
 #include "error.cpp"
 // The large tables should be included once and they
 // should not depend on a kernel.
-#include "tables/base64_tables.h"
 #include "tables/utf8_to_utf16_tables.h"
 #include "tables/utf16_to_utf8_tables.h"
 // End of tables.
diff --git a/src/simdutf/arm64/implementation.h b/src/simdutf/arm64/implementation.h
index b686be9fe..5e0d89ace 100644
--- a/src/simdutf/arm64/implementation.h
+++ b/src/simdutf/arm64/implementation.h
@@ -91,6 +91,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/fallback/implementation.h b/src/simdutf/fallback/implementation.h
index 14d14cb42..c8dfc2037 100644
--- a/src/simdutf/fallback/implementation.h
+++ b/src/simdutf/fallback/implementation.h
@@ -94,6 +94,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/haswell/implementation.h b/src/simdutf/haswell/implementation.h
index c75e4a5e7..79969941b 100644
--- a/src/simdutf/haswell/implementation.h
+++ b/src/simdutf/haswell/implementation.h
@@ -93,6 +93,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/icelake/implementation.h b/src/simdutf/icelake/implementation.h
index 175b34040..4638bf9b9 100644
--- a/src/simdutf/icelake/implementation.h
+++ b/src/simdutf/icelake/implementation.h
@@ -93,6 +93,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/ppc64/implementation.h b/src/simdutf/ppc64/implementation.h
index f1df43a4c..7fd324493 100644
--- a/src/simdutf/ppc64/implementation.h
+++ b/src/simdutf/ppc64/implementation.h
@@ -71,6 +71,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/simdutf/rvv/implementation.h b/src/simdutf/rvv/implementation.h
index f95dcf2ab..56f02362d 100644
--- a/src/simdutf/rvv/implementation.h
+++ b/src/simdutf/rvv/implementation.h
@@ -95,6 +95,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, size_t len) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 private:
diff --git a/src/simdutf/westmere/implementation.h b/src/simdutf/westmere/implementation.h
index 4d992a49b..190693783 100644
--- a/src/simdutf/westmere/implementation.h
+++ b/src/simdutf/westmere/implementation.h
@@ -91,6 +91,8 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
   simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
   size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
 };
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index d428e0084..a491818c1 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -787,6 +787,14 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return compress_decode_base64(output, input, length);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+  return compress_decode_base64(output, input, length);
+}
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }

From 106e18ca1f0b0a2d1765c633b61fda204bb1303d Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 21 Mar 2024 22:26:13 -0400
Subject: [PATCH 21/49] adding ppc64

---
 .github/workflows/aarch64.yml |  2 +-
 .github/workflows/ppc64le.yml | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/ppc64le.yml

diff --git a/.github/workflows/aarch64.yml b/.github/workflows/aarch64.yml
index a94eb8eed..b54e2afa0 100644
--- a/.github/workflows/aarch64.yml
+++ b/.github/workflows/aarch64.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: uraimo/run-on-arch-action@v2
+      - uses: uraimo/run-on-arch-action@v4
         name: Test
         id: runcmd
         with:
diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml
new file mode 100644
index 000000000..c0c773928
--- /dev/null
+++ b/.github/workflows/ppc64le.yml
@@ -0,0 +1,28 @@
+name: Ubuntu aarch64 (GCC 11)
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: uraimo/run-on-arch-action@v4
+        name: Test
+        id: runcmd
+        with:
+          arch: ppc64le
+          githubToken: ${{ github.token }}
+          distro: ubuntu_latest
+          install: |
+            apt-get update -q -y
+            apt-get install -y cmake make g++
+          run: |
+            cmake -DCMAKE_BUILD_TYPE=Release -B build
+            cmake --build build -j=2

From d1c9cbcfde35b1bc8d8d0e288b7570dc38e92c22 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Thu, 21 Mar 2024 23:03:50 -0400
Subject: [PATCH 22/49] saving

---
 src/haswell/avx2_base64.cpp        |  3 ++-
 src/icelake/icelake_base64.inl.cpp | 13 +++++++++----
 src/implementation.cpp             |  3 ++-
 src/westmere/sse_base64.cpp        |  3 ++-
 4 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 870d36f6f..c1151d174 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -315,7 +315,8 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 24, buffer, 24);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 74ea110a4..e09b117fd 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -106,6 +106,10 @@ static inline void load_block(block64 *b, const char *src) {
   b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
 }
 
+static inline void load_block(block64 *b, const char16_t *src) {
+  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+}shit
+
 static inline void base64_decode(char *out, __m512i str) {
   const __m512i merge_ab_and_bc =
       _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
@@ -130,7 +134,8 @@ static inline void base64_decode_block(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -140,16 +145,16 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
       equalsigns = 2;
     }
   }
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   // figure out why block_size == 2 is sometimes best???
   constexpr size_t block_size = 6;
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
diff --git a/src/implementation.cpp b/src/implementation.cpp
index 253cf52d9..b266e1d26 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -1,6 +1,7 @@
 #include "simdutf.h"
 #include <initializer_list>
 #include <climits>
+#include <type_traits>
 
 // Useful for debugging purposes
 namespace simdutf {
@@ -1303,7 +1304,7 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
 
 template <typename chartype>
 simdutf_warn_unused result base64_to_binary_safe(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
-  static_assert(std::is_same_v<chartype, char> || std::is_same_v<chartype, char16_t>, "Only char and char16_t are supported.");
+  static_assert(std::is_same<chartype, char>::value || std::is_same<chartype, char16_t>::value, "Only char and char16_t are supported.");
   // The implementation could be nicer, but we expect that most times, the user
   // will provide us with a buffer that is large enough.
   size_t max_length = maximal_binary_length_from_base64(input, length);
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index f2f4d7211..7ef6fd4b5 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -323,7 +323,8 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 36, buffer, 12);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;

From 86067981e824fa08cc32cc0bb0477c140ee9bd7e Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 21 Mar 2024 23:07:14 -0400
Subject: [PATCH 23/49] saturated.

---
 src/arm64/arm_base64.cpp          | 14 ++++++++++----
 src/simdutf/arm64/simd16-inl.h    |  2 +-
 src/simdutf/haswell/simd16-inl.h  |  2 +-
 src/simdutf/westmere/simd16-inl.h |  2 +-
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 565b83746..239176d97 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -210,11 +210,17 @@ void load_block(block64 *b, const char *src) {
   b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
 }
 
+inline uint8x16_t load_satured(const uint16_t * data) {
+    uint16x8_t in1 = vld1q_u16(data);
+    uint16x8_t in2 = vld1q_u16(data+8);
+    return vqmovn_high_u16(vqmovn_u16(in1), in2);
+}
+
 void load_block(block64 *b, const char16_t *src) {
-  b->chunks[0] = vld2q_u8(reinterpret_cast<const uint8_t *>(src)).val[0];
-  b->chunks[1] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 16).val[0];
-  b->chunks[2] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 32).val[0];
-  b->chunks[3] = vld2q_u8(reinterpret_cast<const uint8_t *>(src) + 48).val[0];
+  b->chunks[0] = load_satured(reinterpret_cast<const uint16_t *>(src));
+  b->chunks[1] = load_satured(reinterpret_cast<const uint16_t *>(src) + 16);
+  b->chunks[2] = load_satured(reinterpret_cast<const uint16_t *>(src) + 32);
+  b->chunks[3] = load_satured(reinterpret_cast<const uint16_t *>(src) + 48);
 }
 
 // decode 64 bytes and output 48 bytes
diff --git a/src/simdutf/arm64/simd16-inl.h b/src/simdutf/arm64/simd16-inl.h
index 66d1168b7..32734c0ab 100644
--- a/src/simdutf/arm64/simd16-inl.h
+++ b/src/simdutf/arm64/simd16-inl.h
@@ -156,7 +156,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
   simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
   simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
 
-  // Pack with the unsigned saturation  two uint16_t code units into single uint8_t vector
+  // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     return vqmovn_high_u16(vqmovn_u16(v0), v1);
   }
diff --git a/src/simdutf/haswell/simd16-inl.h b/src/simdutf/haswell/simd16-inl.h
index 04c1b7fe0..964ff4ebd 100644
--- a/src/simdutf/haswell/simd16-inl.h
+++ b/src/simdutf/haswell/simd16-inl.h
@@ -140,7 +140,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
     return _mm256_shuffle_epi8(*this, swap);
   }
 
-  // Pack with the unsigned saturation two uint16_t code units into single uint8_t vector
+  // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
     //       we have to shuffle lanes in order to produce bytes in the
diff --git a/src/simdutf/westmere/simd16-inl.h b/src/simdutf/westmere/simd16-inl.h
index bbcca0776..694d93d22 100644
--- a/src/simdutf/westmere/simd16-inl.h
+++ b/src/simdutf/westmere/simd16-inl.h
@@ -146,7 +146,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
     return _mm_shuffle_epi8(*this, swap);
   }
 
-  // Pack with the unsigned saturation  two uint16_t code units into single uint8_t vector
+  // Pack with the unsigned saturation of two uint16_t code units into single uint8_t vector
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     return _mm_packus_epi16(v0, v1);
   }

From e7eae70329db3f4e1a4ef22d0ff30399905ab049 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 22 Mar 2024 00:26:52 -0400
Subject: [PATCH 24/49] finishing...

---
 README.md                          | 46 ++++++++++++++++++++++++++++--
 src/haswell/avx2_base64.cpp        | 19 ++++++++++--
 src/icelake/icelake_base64.inl.cpp |  7 +++--
 src/westmere/sse_base64.cpp        | 21 ++++++++++++--
 4 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 147842460..244aaf0dc 100644
--- a/README.md
+++ b/README.md
@@ -1591,11 +1591,19 @@ if(r.error) {
 }
 ```
 
+In some instances, you may want to limit the size of the output further when decoding base64.
+For this purpose, you may use the `base64_to_binary_safe` functions.
+
+In other instances, you may receive your base64 inputs in 16-bit units (e.g., from UTF-16 strings):
+we have function overloads for these cases as well.
+
 Some users may want to decode the base64 inputs in chunks, especially when doing
 file or networking programming. These users should see `tools/fastbase64.cpp`, a command-line
 utility designed for as an example. It reads and writes base64 files using chunks of at most
 a few tens of kilobytes.
 
+
+
 The specification of our base64 functions is as follows:
 
 ```C++
@@ -1615,7 +1623,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * In general, if the input contains ASCII spaces, the result will be less than
  * the maximum length.
  *
- * @param input         the base64 input to process in UTF-16 (native endianess)
+ * @param input         the base64 input to process in 16-bit units
  * @param length        the length of the base64 input in 16-bit units
  * @return maximal number of binary bytes
  */
@@ -1696,7 +1704,41 @@ size_t binary_to_base64(const char * input, size_t length, char* output) noexcep
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result utf16_base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+
+
+/**
+ * Convert a base64 input to a binary ouput.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are three possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a singler remainder character (BASE64_INPUT_REMAINDER), the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * When the error is OUTPUT_BUFFER_TOO_SMALL, then r.count contains the location in the input
+ * where we stopped decoding.
+ *
+ * In all case, the outlen parameter is modified to contain the number of bytes
+ * that have been written/decoded.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how mnay bytes were written.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
 
 ```
 
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index c1151d174..6eed08481 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -276,6 +276,19 @@ static inline void load_block(block64 *b, const char *src) {
       _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
 }
 
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+  __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
+  __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+  __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
+  __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
+  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x13);
+  __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
+  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x13);
+  b->chunks[0] = _mm256_packus_epi16(m1p, m2p);
+  b->chunks[1] = _mm256_packus_epi16(m3p, m4p);
+}
+
 static inline void base64_decode(char *out, __m256i str) {
   // credit: aqrit
   const __m256i pack_shuffle =
@@ -329,16 +342,16 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   char *end_of_safe_64byte_zone =
       (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
 
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   constexpr size_t block_size = 6;
   static_assert(block_size >= 2, "block_size must be at least two");
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index e09b117fd..20399ef9b 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -107,8 +107,11 @@ static inline void load_block(block64 *b, const char *src) {
 }
 
 static inline void load_block(block64 *b, const char16_t *src) {
-  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
-}shit
+  __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 64));
+  __m512i p = _mm512_packus_epi16(m1, m2);
+  b->chunks[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
+}
 
 static inline void base64_decode(char *out, __m512i str) {
   const __m512i merge_ab_and_bc =
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index 7ef6fd4b5..ef57f5184 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -274,6 +274,21 @@ static inline void load_block(block64 *b, const char *src) {
   b->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
 }
 
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m128i m1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+  __m128i m2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 8));
+  __m128i m3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
+  __m128i m4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 24));
+  __m128i m5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
+  __m128i m6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 40));
+  __m128i m7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
+  __m128i m8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 56));
+  b->chunks[0] = _mm_packus_epi16(m1, m2);
+  b->chunks[1] = _mm_packus_epi16(m3, m4);
+  b->chunks[2] = _mm_packus_epi16(m5, m6);
+  b->chunks[3] = _mm_packus_epi16(m7, m8);
+}
+
 static inline void base64_decode(char *out, __m128i str) {
   // credit: aqrit
 
@@ -337,16 +352,16 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   char *end_of_safe_64byte_zone =
       (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
 
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   constexpr size_t block_size = 6;
   static_assert(block_size >= 2, "block should of size 2 or more");
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);

From 9262b4b372a900cb31bf41debff59e4c22717f86 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Wed, 27 Mar 2024 00:42:47 -0400
Subject: [PATCH 25/49] various fixes

---
 README.md                          |  12 +-
 include/simdutf/implementation.h   |  11 +-
 src/haswell/avx2_base64.cpp        |   4 +-
 src/icelake/icelake_base64.inl.cpp |   2 +-
 src/implementation.cpp             |  37 +-
 src/scalar/base64.h                |  15 +-
 tests/base64_tests.cpp             | 622 ++++++++++++++++++++++++-----
 tests/helpers/test.h               |  13 +-
 8 files changed, 581 insertions(+), 135 deletions(-)

diff --git a/README.md b/README.md
index 244aaf0dc..1216404a6 100644
--- a/README.md
+++ b/README.md
@@ -1592,7 +1592,9 @@ if(r.error) {
 ```
 
 In some instances, you may want to limit the size of the output further when decoding base64.
-For this purpose, you may use the `base64_to_binary_safe` functions.
+For this purpose, you may use the `base64_to_binary_safe` functions. The functions may also
+be useful if you seek to decode the input into segments having a maximal capacity.
+See our function specifications for more details.
 
 In other instances, you may receive your base64 inputs in 16-bit units (e.g., from UTF-16 strings):
 we have function overloads for these cases as well.
@@ -1602,8 +1604,6 @@ file or networking programming. These users should see `tools/fastbase64.cpp`, a
 utility designed for as an example. It reads and writes base64 files using chunks of at most
 a few tens of kilobytes.
 
-
-
 The specification of our base64 functions is as follows:
 
 ```C++
@@ -1732,10 +1732,10 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  * that have been written/decoded.
  *
  * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
- * @param length        the length of the string in 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
- * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how mnay bytes were written.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful. Note that the return convention of base64_to_binary_safe differs from base64_to_binary.
  */
 simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
 simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index aafa9ff97..fbf20f8fc 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1497,17 +1497,12 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
  * r.count contains the number of bytes decoded.
  *
- * When the error is OUTPUT_BUFFER_TOO_SMALL, then r.count contains the location in the input
- * where we stopped decoding.
- *
- * In all case, the outlen parameter is modified to contain the number of bytes
- * that have been written/decoded.
  *
  * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
- * @param length        the length of the string in 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
- * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how mnay bytes were written.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful.
  */
 simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
 simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 6eed08481..1f222b3b8 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -282,9 +282,9 @@ static inline void load_block(block64 *b, const char16_t *src) {
   __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
   __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
   __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
-  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x13);
+  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
   __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
-  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x13);
+  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
   b->chunks[0] = _mm256_packus_epi16(m1p, m2p);
   b->chunks[1] = _mm256_packus_epi16(m3p, m4p);
 }
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 20399ef9b..a7ff0c091 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -108,7 +108,7 @@ static inline void load_block(block64 *b, const char *src) {
 
 static inline void load_block(block64 *b, const char16_t *src) {
   __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
-  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 64));
+  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
   __m512i p = _mm512_packus_epi16(m1, m2);
   b->chunks[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
 }
diff --git a/src/implementation.cpp b/src/implementation.cpp
index b266e1d26..e964ce565 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -1303,13 +1303,16 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
 }
 
 template <typename chartype>
-simdutf_warn_unused result base64_to_binary_safe(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
+simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
   static_assert(std::is_same<chartype, char>::value || std::is_same<chartype, char16_t>::value, "Only char and char16_t are supported.");
   // The implementation could be nicer, but we expect that most times, the user
   // will provide us with a buffer that is large enough.
   size_t max_length = maximal_binary_length_from_base64(input, length);
   if(outlen >= max_length) {
-    return base64_to_binary(input, length, output);
+    // fast path
+    result r = base64_to_binary(input, length, output);
+    if(r.error != error_code::INVALID_BASE64_CHARACTER) { outlen = r.count; r.count = length; }
+    return r;
   }
   // The output buffer is maybe too small. We will decode a truncated version of the input.
   size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
@@ -1320,23 +1323,37 @@ simdutf_warn_unused result base64_to_binary_safe(const chartype * input, size_t
     ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1);
   size_t output_index = r.count - (r.count % 3);
   size_t input_index = safe_input;
-  while(offset > 0) {
-    char c = input[--input_index];
+  while(offset > 0 && input_index > 0) {
+    chartype c = input[--input_index];
     if(c == '=' || c == '\n' || c == '\r' || c == '\t' || c == ' ') {
+      // skipping
+    } else {
       offset--;
     }
   }
   size_t remaining_out = outlen - output_index;
-  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, input + input_index, length - input_index);
-  outlen = output_index + remaining_out;
-  if(r.error == error_code::INVALID_BASE64_CHARACTER) {
-    r.count += input_index;
-  } else {
-    r.count = output_index;
+  const chartype * tail_input = input + input_index;
+  size_t tail_length = length - input_index;
+  if(tail_length > 0 && tail_input[tail_length - 1] == '=') {
+    tail_length--;
+    if(tail_length > 0 && tail_input[tail_length - 1] == '=') {
+      tail_length--;
+    }
   }
+  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, tail_input, tail_length);
+  outlen = output_index + remaining_out;
+  r.count += input_index;
   return r;
 }
 
+
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept {
+  return base64_to_binary_safe_impl<char>(input, length, output, outlen);
+}
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept {
+  return base64_to_binary_safe_impl<char16_t>(input, length, output, outlen);
+}
+
 simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
   return get_default_implementation()->base64_length_from_binary(length);
 }
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index ec2002618..7a19087fe 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -41,6 +41,8 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
         idx++;
       } else if (code > 64) {
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
       }
       src++;
     }
@@ -94,6 +96,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
 
 // like base64_tail_decode, but it will not write past the end of the ouput buffer.
 // outlen is modified to reflect the number of bytes written.
+// This functions assumes that the padding (=) has been removed.
 template <class char_type>
 result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length) {
   const char_type *srcend = src + length;
@@ -120,9 +123,11 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       src += 4;
     }
     idx = 0;
+    const char_type *srccur = src;
+
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
-      char c = *src;
+      char_type c = *src;
       uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
@@ -130,6 +135,8 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       } else if (code > 64) {
         outlen = size_t(dst - dstinit);
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
       }
       src++;
     }
@@ -137,7 +144,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       if (idx == 2) {
         if(dst == dstend) {
           outlen = size_t(dst - dstinit);
-          return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
         }
         uint32_t triple =
             (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
@@ -154,7 +161,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       } else if (idx == 3) {
         if(dst + 2 >= dstend) {
           outlen = size_t(dst - dstinit);
-          return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
         }
         uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
                           (uint32_t(buffer[1]) << 2 * 6) +
@@ -177,7 +184,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
     }
     if(dst + 3 >= dstend) {
       outlen = size_t(dst - dstinit);
-      return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
     }
     uint32_t triple =
         (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index 6263a4c8a..48b7b9e6c 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -11,35 +11,164 @@
 using random_generator = std::mt19937;
 static random_generator::result_type seed = 42;
 
+const uint8_t to_base64_value[] = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62,  255,
+    255, 255, 63,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+
+template <typename char_type>
+size_t add_space(std::vector<char_type> &v, std::mt19937 &gen) {
+  const static std::array<char_type, 4> space = {' ', '\t', '\n', '\r'};
+  int padding = 0;
+  if (v.size() > 0 && v[v.size() - 1] == '=') {
+    padding++;
+    if (v.size() > 0 && v[v.size() - 1] == '=') {
+      padding++;
+    }
+  }
+  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
+  size_t i = index_dist(gen);
+  std::uniform_int_distribution<int> char_dist(0, 3);
+  v.insert(v.begin() + i, space[char_dist(gen)]);
+  return i;
+}
+
+template <typename char_type>
+size_t add_garbage(std::vector<char_type> &v, std::mt19937 &gen) {
+  int padding = 0;
+  if (v.size() > 0 && v[v.size() - 1] == '=') {
+    padding++;
+    if (v.size() > 0 && v[v.size() - 1] == '=') {
+      padding++;
+    }
+  }
+  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
+  size_t i = index_dist(gen);
+  std::uniform_int_distribution<int> char_dist(
+      0, (1 << (sizeof(char_type) * 8)) - 1);
+  uint8_t c = char_dist(gen);
+  while (uint8_t(c) == c && to_base64_value[uint8_t(c)] != 255) {
+    c = char_dist(gen);
+  }
+  v.insert(v.begin() + i, c);
+  return i;
+}
+
 TEST(decode_base64_cases) {
   std::vector<std::vector<char>> cases = {{0x53, 0x53}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
 
-  for(size_t i = 0; i < cases.size(); i++) {
-    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(cases[i].data(), cases[i].size()));
-    simdutf::result r = implementation.base64_to_binary(cases[i].data(), cases[i].size(), buffer.data());
-    ASSERT_EQUAL(r.error,codes[i]);
+  for (size_t i = 0; i < cases.size(); i++) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        cases[i].data(), cases[i].size()));
+    simdutf::result r = implementation.base64_to_binary(
+        cases[i].data(), cases[i].size(), buffer.data());
+    ASSERT_EQUAL(r.error, codes[i]);
     ASSERT_EQUAL(r.count, counts[i]);
   }
 }
 
 TEST(encode_base64_cases) {
-  std::vector<std::pair<std::string,std::string>> cases = {
-    {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
-    {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
-    {"123456", "MTIzNDU2"},
-    {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"}};
+  std::vector<std::pair<std::string, std::string>> cases = {
+      {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", "MTIzNDU2"},
+      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
-
-  for(std::pair<std::string,std::string> p : cases) {
-    std::vector<char> buffer(implementation.base64_length_from_binary(p.first.size()));
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(
+        implementation.base64_length_from_binary(p.first.size()));
     ASSERT_EQUAL(buffer.size(), p.second.size());
-    size_t s = implementation.binary_to_base64(p.first.data(),p.first.size(), buffer.data());
+    size_t s = implementation.binary_to_base64(p.first.data(), p.first.size(),
+                                               buffer.data());
     ASSERT_EQUAL(s, p.second.size());
     ASSERT_TRUE(std::string(buffer.data(), buffer.size()) == p.second);
   }
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data());
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" --  ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
+TEST(encode_base64_cases_16) {
+  std::vector<std::pair<std::string, std::u16string>> cases = {
+      {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", u"R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", u"MTIzNDU2"},
+      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"}};
+  std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
+  std::vector<size_t> counts = {1};
+  printf(" -- ");
+
+  for (std::pair<std::string, std::u16string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data());
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" -- ");
+  for (std::pair<std::string, std::u16string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
 }
 
 TEST(roundtrip_base64) {
@@ -61,16 +190,17 @@ TEST(roundtrip_base64) {
           implementation.base64_to_binary(buffer.data(), size, back.data());
       ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
       ASSERT_EQUAL(r.count, len);
-      if(back != source) {
+      if (back != source) {
         printf("=====input size %zu\n", len);
-        for(size_t i = 0; i < len; i++) {
-          if(back[i] != source[i]) {
-            std::cerr << "Mismatch at position " << i << " trial " << trial << std::endl;
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
           }
           printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
         }
         printf("=====base64 size %zu\n", size);
-        for(size_t i = 0; i < size; i++) {
+        for (size_t i = 0; i < size; i++) {
           printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
         }
       }
@@ -79,59 +209,49 @@ TEST(roundtrip_base64) {
   }
 }
 
-const uint8_t to_base64_value[] = {
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62,  255,
-    255, 255, 63,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
-    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
-    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
-    25,  255, 255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  32,  33,
-    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
-    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255};
-
-size_t add_space(std::vector<char> &v, std::mt19937 &gen) {
-  const static std::array<char, 4> space = {' ', '\t', '\n', '\r'};
-  int padding = 0;
-  if (v.size() > 0 && v[v.size() - 1] == '=') {
-    padding++;
-    if (v.size() > 0 && v[v.size() - 1] == '=') {
-      padding++;
-    }
-  }
-  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
-  size_t i = index_dist(gen);
-  std::uniform_int_distribution<int> char_dist(0, 3);
-  v.insert(v.begin() + i, space[char_dist(gen)]);
-  return i;
-}
+TEST(roundtrip_base64_16) {
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
 
-size_t add_garbage(std::vector<char> &v, std::mt19937 &gen) {
-  int padding = 0;
-  if (v.size() > 0 && v[v.size() - 1] == '=') {
-    padding++;
-    if (v.size() > 0 && v[v.size() - 1] == '=') {
-      padding++;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      buffer.resize(size);
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer16.data(), size, back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) {
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
     }
   }
-  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
-  size_t i = index_dist(gen);
-  std::uniform_int_distribution<int> char_dist(0, 255);
-  uint8_t c = char_dist(gen);
-  while(to_base64_value[uint8_t(c)] != 255) {
-    c = char_dist(gen);
-  }
-  v.insert(v.begin() + i, c);
-  return i;
 }
 
 TEST(doomed_base64_roundtrip) {
@@ -151,8 +271,13 @@ TEST(doomed_base64_roundtrip) {
       size_t location = add_garbage(buffer, gen);
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::INVALID_BASE64_CHARACTER);
+      ASSERT_EQUAL(r.count, location);
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
       ASSERT_EQUAL(r.error, simdutf::error_code::INVALID_BASE64_CHARACTER);
       ASSERT_EQUAL(r.count, location);
     }
@@ -175,10 +300,49 @@ TEST(doomed_truncated_base64_roundtrip) {
       buffer.resize(size - 3);
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, (size - 4) / 4 * 3);
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, buffer.size());
+    }
+  }
+}
+
+TEST(doomed_truncated_base64_roundtrip_16) {
+  for (size_t len = 1; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      buffer.resize(size - 3);
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+          buffer16.data(), buffer16.size()));
       simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+          buffer16.data(), buffer16.size(), back.data());
       ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
-      ASSERT_EQUAL(r.count, (size-4)/4*3);
+      ASSERT_EQUAL(r.count, (size - 4) / 4 * 3);
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer16.data(), buffer16.size(),
+                                         back.data(), back_length);
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, buffer16.size());
     }
   }
 }
@@ -200,21 +364,278 @@ TEST(roundtrip_base64_with_spaces) {
       for (size_t i = 0; i < 5; i++) {
         add_space(buffer, gen);
       }
-
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
       ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
 
       back.resize(
           r.count); // resize the buffer according to actual number of bytes
       ASSERT_EQUAL(r.count, len);
       ASSERT_TRUE(back == source);
+      back.resize(back.capacity());
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
+
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+
+      back.resize(
+          back_length); // resize the buffer according to actual number of bytes
+      ASSERT_EQUAL(r.count, buffer.size());
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
+TEST(roundtrip_base64_16_with_spaces) {
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      buffer.resize(size);
+      for (size_t i = 0; i < 5; i++) {
+        add_space(buffer, gen);
+      }
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r = implementation.base64_to_binary(
+          buffer16.data(), buffer16.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) {
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64) {
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+            buffer.data(), buffer.size()));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit();
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Now let us decode the rest !!!
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(r.count);
+        ASSERT_EQUAL(second_length + limited_length, len);
+
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
     }
   }
 }
 
+TEST(aborted_safe_roundtrip_base64_16) {
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      std::vector<char16_t> buffer16;
+
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::vector<char> back(len);
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        buffer16.resize(buffer.size());
+        for (size_t i = 0; i < buffer.size(); i++) {
+          buffer16[i] = buffer[i];
+        }
+        ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit();
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Now let us decode the rest !!!
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(r.count);
+        ASSERT_EQUAL(second_length + limited_length, len);
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64_with_spaces) {
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        for (size_t i = 0; i < 5; i++) {
+          add_space(buffer, gen);
+        }
+        std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+            buffer.data(), buffer.size()));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit();
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Now let us decode the rest !!!
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(r.count);
+        ASSERT_EQUAL(second_length + limited_length, len);
+
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64_16_with_spaces) {
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      std::vector<char16_t> buffer16;
+
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::vector<char> back(len);
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        for (size_t i = 0; i < 5; i++) {
+          add_space(buffer, gen);
+        }
+        buffer16.resize(buffer.size());
+        for (size_t i = 0; i < buffer.size(); i++) {
+          buffer16[i] = buffer[i];
+        }
+        ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit();
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Now let us decode the rest !!!
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(r.count);
+        ASSERT_EQUAL(second_length + limited_length, len);
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
 
 TEST(streaming_base64_roundtrip) {
   size_t len = 2048;
@@ -226,25 +647,25 @@ TEST(streaming_base64_roundtrip) {
   for (size_t i = 0; i < len; i++) {
     source[i] = byte_generator(gen);
   }
-  size_t size = implementation.binary_to_base64(
-          source.data(), source.size(), buffer.data());
+  size_t size = implementation.binary_to_base64(source.data(), source.size(),
+                                                buffer.data());
   buffer.resize(size);
   for (size_t window = 16; window <= 2048; window += 7) {
     // build a buffer with enough space to receive the decoded base64
     std::vector<char> back(len);
     size_t outpos = 0;
-    for(size_t pos = 0; pos < buffer.size(); pos += window) {
+    for (size_t pos = 0; pos < buffer.size(); pos += window) {
       size_t count = std::min(window, buffer.size() - pos);
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data() + pos, count, back.data() + outpos);
+      simdutf::result r = simdutf::base64_to_binary(buffer.data() + pos, count,
+                                                    back.data() + outpos);
       ASSERT_TRUE(r.error != simdutf::error_code::INVALID_BASE64_CHARACTER);
-      if(count + pos == buffer.size()) {
+      if (count + pos == buffer.size()) {
         // We must check that the last call to base64_to_binary did not
         // end with an BASE64_INPUT_REMAINDER error.
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
       } else {
         size_t tail_bytes_to_reprocess = 0;
-        if(r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+        if (r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
           tail_bytes_to_reprocess = 1;
         } else {
           tail_bytes_to_reprocess = (r.count % 3) == 0 ? 0 : (r.count % 3) + 1;
@@ -259,31 +680,34 @@ TEST(streaming_base64_roundtrip) {
   }
 }
 
-
 TEST(readme_test) {
   size_t len = 2048;
   std::vector<char> base64(len, 'a');
-  std::vector<char> back((len+3)/4*3);
+  std::vector<char> back((len + 3) / 4 * 3);
   size_t outpos = 0;
   size_t window = 512;
-  for(size_t pos = 0; pos < base64.size(); pos += window) {
+  for (size_t pos = 0; pos < base64.size(); pos += window) {
     // how many base64 characters we can process in this iteration
     size_t count = std::min(window, base64.size() - pos);
-    simdutf::result r = simdutf::base64_to_binary(
-        base64.data() + pos, count, back.data() + outpos);
-    if(r.error == simdutf::error_code::INVALID_BASE64_CHARACTER) {
-      std::cerr << "Invalid base64 character at position " << pos + r.count << std::endl;
+    simdutf::result r = simdutf::base64_to_binary(base64.data() + pos, count,
+                                                  back.data() + outpos);
+    if (r.error == simdutf::error_code::INVALID_BASE64_CHARACTER) {
+      std::cerr << "Invalid base64 character at position " << pos + r.count
+                << std::endl;
       return;
     }
-    // If we arrived at the end of the base64 input, we must check that the number
-    // of characters processed is a multiple of 4, or that we have a remainder of 0, 2 or 3.
-    if(count + pos == base64.size() && r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
-      std::cerr << "The base64 input contained an invalid number of characters " << std::endl;
+    // If we arrived at the end of the base64 input, we must check that the
+    // number of characters processed is a multiple of 4, or that we have a
+    // remainder of 0, 2 or 3.
+    if (count + pos == base64.size() &&
+        r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+      std::cerr << "The base64 input contained an invalid number of characters "
+                << std::endl;
     }
-    // If we are not at then end, we may have to reprocess either 1, 2 or 3 bytes, and
-    // to drop the last 0, 2 or 3 bytes decoded.
+    // If we are not at then end, we may have to reprocess either 1, 2 or 3
+    // bytes, and to drop the last 0, 2 or 3 bytes decoded.
     size_t tail_bytes_to_reprocess = 0;
-    if(r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+    if (r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
       tail_bytes_to_reprocess = 1;
     } else {
       tail_bytes_to_reprocess = (r.count % 3) == 0 ? 0 : (r.count % 3) + 1;
@@ -300,9 +724,9 @@ int main(int argc, char *argv[]) {
   if (argc == 2) {
     try {
       seed = std::stoi(argv[1]);
-    } catch (const std::exception& e) {
-        printf("%s\n", e.what());
-        return EXIT_FAILURE;
+    } catch (const std::exception &e) {
+      printf("%s\n", e.what());
+      return EXIT_FAILURE;
     }
   }
   return simdutf::test::main(argc, argv);
diff --git a/tests/helpers/test.h b/tests/helpers/test.h
index 1d7d20238..5c28a299c 100644
--- a/tests/helpers/test.h
+++ b/tests/helpers/test.h
@@ -41,11 +41,12 @@ void name(const simdutf::implementation& impl) {            \
 static simdutf::test::register_test test_register_##name(#name, name); \
 void test_impl_##name(const simdutf::implementation& implementation)
 
-#define ASSERT_EQUAL(a, b) {                                      \
-  const auto expr = (a);                                          \
-  if (expr != b) {                                                \
-    std::cout << "\nExpected " << expr << " to be " << b << ".\n";\
-    printf("%s \n",#a);                                           \
+#define ASSERT_EQUAL(a, b) {                                                   \
+  const auto expr = (a);                                                       \
+  if (expr != b) {                                                             \
+    std::cout << "\nExpected " << expr << " to be " << b << ".\n";             \
+    printf("%s \n",#a);                                                        \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__); \
     exit(1);                                                      \
   }                                                               \
 }
@@ -54,6 +55,7 @@ void test_impl_##name(const simdutf::implementation& implementation)
   const bool expr = (cond);                                 \
   if (!expr) {                                              \
     printf("expected %s to be true, it's false\n", #cond);  \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__); \
     exit(1);                                                \
   }                                                         \
 }
@@ -62,6 +64,7 @@ void test_impl_##name(const simdutf::implementation& implementation)
   const bool expr = !(cond);                                \
   if (!expr) {                                              \
     printf("expected %s to be false, it's true\n", #cond);  \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__); \
     exit(1);                                                \
   }                                                         \
 }

From 3444f4e4cbfb4d34c490714dcd5e9df7d3eddad4 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 12:12:12 -0400
Subject: [PATCH 26/49] Implemented bun benchmark

---
 benchmarks/base64/CMakeLists.txt       |  4 +-
 benchmarks/base64/benchmark_base64.cpp | 87 ++++++++++++++++++++++----
 2 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/benchmarks/base64/CMakeLists.txt b/benchmarks/base64/CMakeLists.txt
index a866b9609..d00fc855e 100644
--- a/benchmarks/base64/CMakeLists.txt
+++ b/benchmarks/base64/CMakeLists.txt
@@ -9,6 +9,8 @@ CPMAddPackage(
 
 add_executable(benchmark_base64 benchmark_base64.cpp)
 
-target_link_libraries(benchmark_base64 PUBLIC simdutf)
+set_property(TARGET benchmark_base64 PROPERTY CXX_STANDARD 17)
+set_property(TARGET benchmark_base64 PROPERTY CXX_STANDARD_REQUIRED ON)
+
 target_link_libraries(benchmark_base64 PUBLIC base64)
 target_link_libraries(benchmark_base64 PUBLIC simdutf::benchmarks::benchmark)
diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index 579d1e198..ed41a9406 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -10,8 +10,8 @@
 #include <vector>
 
 #include "libbase64.h"
-#include "simdutf.h"
 #include "node_base64.h"
+#include "simdutf.h"
 
 #include "event_counter.h"
 #include <atomic>
@@ -34,7 +34,7 @@ bool is_space(char c) {
 // This is for reference only, do not use this function in production
 // system.
 int base64_decode_skip_spaces(const char *src, size_t srclen, char *out,
-                                     size_t *outlen) {
+                              size_t *outlen) {
   struct base64_state state;
   base64_stream_decode_init(&state, 0);
   const char *srcend = src + srclen;
@@ -65,7 +65,7 @@ int base64_decode_skip_spaces(const char *src, size_t srclen, char *out,
   return !state.bytes;
 }
 
-enum : uint8_t { roundtrip = 0, decode = 1, encode = 2 };
+enum : uint8_t { roundtrip = 0, decode = 1, encode = 2, bun = 3 };
 
 event_collector collector;
 
@@ -277,14 +277,14 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
                      }
                    }));
     }
-    pretty_print(data.size(), volume, "node",
-                 bench([&data, &buffer1, &buffer2]() {
-                   for (const std::vector<char> &source : data) {
-                     int result = node::base64_decode(buffer1.data(), buffer1.size(),
-                                    source.data(), source.size());
-                     (void) result;
-                   }
-                 }));
+    pretty_print(
+        data.size(), volume, "node", bench([&data, &buffer1, &buffer2]() {
+          for (const std::vector<char> &source : data) {
+            int result = node::base64_decode(buffer1.data(), buffer1.size(),
+                                             source.data(), source.size());
+            (void)result;
+          }
+        }));
     for (auto &e : simdutf::get_available_implementations()) {
       if (!e->supported_by_runtime_system()) {
         continue;
@@ -316,7 +316,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
     printf("# encode\n");
     volatile size_t base64_size;
     pretty_print(data.size(), volume, "libbase64",
-                 bench([&data, &buffer1, &buffer2, &base64_size]() {
+                 bench([&data, &buffer1, &base64_size]() {
                    for (const std::vector<char> &source : data) {
                      size_t outlen;
                      base64_encode(source.data(), source.size(), buffer1.data(),
@@ -329,7 +329,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
         continue;
       }
       pretty_print(data.size(), volume, "simdutf::" + e->name(),
-                   bench([&data, &buffer1, &buffer2, &e, &base64_size]() {
+                   bench([&data, &buffer1, &e, &base64_size]() {
                      for (const std::vector<char> &source : data) {
                        base64_size = e->binary_to_base64(
                            source.data(), source.size(), buffer1.data());
@@ -341,6 +341,62 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
   }
 }
 
+int bench_bun() {
+  /**
+   * See
+   * https://github.com/oven-sh/bun/blob/main/bench/snippets/buffer-to-string.mjs
+   *
+   * const bigBuffer = Buffer.from("hello world".repeat(10000));
+   * const converted = bigBuffer.toString("base64");
+   * const uuid = crypto.randomBytes(16);
+   *
+   * bench(`Buffer(${bigBuffer.byteLength}).toString('base64')`, () => {
+   * return bigBuffer.toString("base64");
+   * });
+   *
+   * bench(`Buffer(${uuid.byteLength}).toString('base64')`, () => {
+   *  return uuid.toString("base64");
+   * });
+   */
+  printf("# benching bun (essentially an encoding bench)\n");
+  std::string bigBuffer = "hello world";
+  bigBuffer.reserve(10000 * bigBuffer.size());
+  for (size_t i = 1; i < 10000; i++) {
+    bigBuffer += "hello world";
+  }
+  std::string crypto;
+  for (size_t i = 0; i < 16; i++) {
+    crypto += rand();
+  }
+  std::vector<std::pair<std::string, std::string>> tests = {
+      {"big hello world", bigBuffer}, {"random 16 bytes", crypto}};
+  // Could be nicer with C++20
+  for (auto & i : tests) {
+    printf("# %s\n", i.first.c_str());
+    std::string source = i.second;
+    volatile size_t base64_size;
+    std::vector<char> buffer1(simdutf::base64_length_from_binary(source.size()));
+    pretty_print(1, source.size(), "libbase64",
+                 bench([&source, &buffer1, &base64_size]() {
+                   size_t outlen;
+                   base64_encode(source.data(), source.size(), buffer1.data(),
+                                 &outlen, 0);
+                   base64_size = outlen;
+                 }));
+    for (auto &e : simdutf::get_available_implementations()) {
+      if (!e->supported_by_runtime_system()) {
+        continue;
+      }
+      pretty_print(1, source.size(), "simdutf::" + e->name(),
+                   bench([&source, &buffer1, &e, &base64_size]() {
+                     base64_size = e->binary_to_base64(
+                         source.data(), source.size(), buffer1.data());
+                   }));
+    }
+  }
+  return EXIT_SUCCESS;
+}
+
 int main(int argc, char **argv) {
   printf("# current system detected as %s.\n",
          simdutf::get_active_implementation()->name().c_str());
@@ -363,10 +419,15 @@ int main(int argc, char **argv) {
       mode = encode;
     } else if ((arg == "-r") || (arg == "--roundtrip")) {
       mode = roundtrip;
+    } else if ((arg == "-b") || (arg == "--bun")) {
+      mode = bun;
     } else {
       arguments.push_back(std::move(arg));
     }
   }
+  if (bun) {
+    return bench_bun();
+  }
   auto return_value = EXIT_SUCCESS;
   std::vector<std::vector<char>> input;
   printf("# loading files: ");

From 6949b2c6b297ae2d49272cce2a9a48c12c6fdabd Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 12:13:35 -0400
Subject: [PATCH 27/49] Obvious fix.

---
 benchmarks/base64/benchmark_base64.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index ed41a9406..d1078cf07 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -425,7 +425,7 @@ int main(int argc, char **argv) {
       arguments.push_back(std::move(arg));
     }
   }
-  if (bun) {
+  if (mode == bun) {
     return bench_bun();
   }
   auto return_value = EXIT_SUCCESS;

From 381945b1315e861d0a66f1df64a1f9c5a53189ae Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 12:14:44 -0400
Subject: [PATCH 28/49] documentation

---
 benchmarks/base64/benchmark_base64.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index d1078cf07..b280e42cc 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -110,10 +110,12 @@ std::vector<char> read_file(const char *filename,
 void show_help() {
   printf("Usage: benchmark_base64 [options] file1 [file2 ...]\n");
   printf("Options:\n");
-  printf("  -h, --help     Show this help message and exit\n");
-  printf("  -d, --decode   Decode the input file\n");
-  printf("  -e, --encode   Encode the input file\n");
+  printf("  -h, --help        Show this help message and exit\n");
+  printf("  -d, --decode      Decode the input file\n");
+  printf("  -e, --encode      Encode the input file\n");
   printf("  -r, --roundtrip   Roundtrip the input file\n");
+  printf("  -b, --bun         Bun benchmark\n");
+
   printf(" See https://github.com/lemire/base64data for test data.\n");
 }
 void pretty_print(size_t, size_t bytes, std::string name, event_aggregate agg) {

From 7b304d3cd21efb484a814ba9dcf42ff6808f6e73 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 14:30:47 -0400
Subject: [PATCH 29/49] adding libbase64 competitor

---
 benchmarks/base64/benchmark_base64.cpp | 24 +++++++++++--
 benchmarks/base64/libbase64_spaces.h   | 50 ++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 3 deletions(-)
 create mode 100644 benchmarks/base64/libbase64_spaces.h

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index b280e42cc..22eaf8dc5 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -10,7 +10,9 @@
 #include <vector>
 
 #include "libbase64.h"
+#include "libbase64_spaces.h"
 #include "node_base64.h"
+
 #include "simdutf.h"
 
 #include "event_counter.h"
@@ -261,7 +263,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
     bool spaces = contains_spaces(data);
     if (spaces) {
       printf("# the base64 data contains spaces, so we cannot use straigth "
-             "libbase64::base64_decode\n");
+             "libbase64::base64_decode directly\n");
     } else {
       pretty_print(data.size(), volume, "libbase64",
                    bench([&data, &buffer1, &buffer2]() {
@@ -279,6 +281,21 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
                      }
                    }));
     }
+    pretty_print(
+        data.size(), volume, "libbase64_space_decode",
+        bench([&data, &buffer1, &buffer2]() {
+          for (const std::vector<char> &source : data) {
+
+            size_t outlen;
+            bool ok = libbase64_space_decode(source.data(), source.size(),
+                                             buffer1.data(), &outlen);
+            if (!ok) {
+              std::cerr << "Error: "
+                        << " failed to decode base64 " << std::endl;
+              throw std::runtime_error("Error: failed to decode base64 ");
+            }
+          }
+        }));
     pretty_print(
         data.size(), volume, "node", bench([&data, &buffer1, &buffer2]() {
           for (const std::vector<char> &source : data) {
@@ -373,11 +390,12 @@ int bench_bun() {
   std::vector<std::pair<std::string, std::string>> tests = {
       {"big hello world", bigBuffer}, {"random 16 bytes", crypto}};
   // Could be nicer with C++20
-  for (auto & i : tests) {
+  for (auto &i : tests) {
     printf("# %s\n", i.first.c_str());
     std::string source = i.second;
     volatile size_t base64_size;
-    std::vector<char> buffer1(simdutf::base64_length_from_binary(source.size()));
+    std::vector<char> buffer1(
+        simdutf::base64_length_from_binary(source.size()));
     pretty_print(1, source.size(), "libbase64",
                  bench([&source, &buffer1, &base64_size]() {
                    size_t outlen;
diff --git a/benchmarks/base64/libbase64_spaces.h b/benchmarks/base64/libbase64_spaces.h
new file mode 100644
index 000000000..6e68c2caf
--- /dev/null
+++ b/benchmarks/base64/libbase64_spaces.h
@@ -0,0 +1,50 @@
+
+// https://github.com/aklomp/base64/blob/b20a31a997e0b48274fa09e58b65ee9202531e4f/bin/base64.c#L392
+static inline size_t libbase64_find_space(const char *p, const size_t avail) {
+  for (size_t len = 0; len < avail; len++) {
+    if (p[len] == '\n' || p[len] == '\r' || p[len] == ' ' || p[len] == '\t') {
+      return len;
+    }
+  }
+
+  return avail;
+}
+
+// Inspired by
+// https://github.com/aklomp/base64/blob/b20a31a997e0b48274fa09e58b65ee9202531e4f/bin/base64.c#L405
+
+static bool libbase64_space_decode(const char *start, size_t avail, char *outbuf,
+                   size_t *outlen) {
+  struct base64_state state;
+  *outlen = 0;
+
+  // Initialize the decoder's state structure.
+  base64_stream_decode_init(&state, 0);
+
+  while (avail > 0) {
+    size_t len = libbase64_find_space(start, avail);
+    if (len == 0) {
+      start++;
+      avail--;
+      continue;
+    }
+
+    // Decode the chunk into the raw buffer.
+    size_t outlen = 0;
+    if (base64_stream_decode(&state, start, len, outbuf, &outlen) == 0) {
+      // decoding error
+      return false;
+    }
+
+    // Update the output buffer pointer and total size.
+    outbuf += outlen;
+    outlen += outlen;
+    if(avail == len) {
+      break;
+    }
+
+    start += len + 1;
+    avail -= len + 1;
+  }
+  return true;
+}
\ No newline at end of file

From f51ffdfa61cecb0863d944497b72562626591384 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 15:06:29 -0400
Subject: [PATCH 30/49] more documentation.

---
 README.md              | 24 ++++++++++++++++++++++++
 tests/base64_tests.cpp | 30 +++++++++++++++++++++++++-----
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 1216404a6..e78038ab0 100644
--- a/README.md
+++ b/README.md
@@ -1594,6 +1594,30 @@ if(r.error) {
 In some instances, you may want to limit the size of the output further when decoding base64.
 For this purpose, you may use the `base64_to_binary_safe` functions. The functions may also
 be useful if you seek to decode the input into segments having a maximal capacity.
+
+
+```C++
+  size_t len = 72; // for simplicity we chose len divisible by 3
+  std::vector<char> base64(len, 'a'); // we want to decode 'aaaaa....'
+  std::vector<char> back((len + 3) / 4 * 3);
+  size_t limited_length = back.size() / 2; // Intentionally too small
+  // We proceed to decode half:
+  simdutf::result r = simdutf::base64_to_binary_safe(
+            base64.data(), base64.size(), back.data(), limited_length);
+  assert(r.error == simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+  // We decoded r.count base64 bytes to limited_length bytes
+  // Now let us decode the rest !!!
+  size_t input_index = r.count;
+  size_t limited_length2 = back.size();
+  r = simdutf::base64_to_binary_safe(base64.data() + input_index,
+                                           base64.size() - input_index,
+                                           back.data(), limited_length2);
+  assert(r.error == simdutf::error_code::SUCCESS);
+  // We decoded r.count base64 bytes to limited_length2 bytes
+  // We are done
+  assert(limited_length2 + limited_length == (len + 3) / 4 * 3);
+```
+
 See our function specifications for more details.
 
 In other instances, you may receive your base64 inputs in 16-bit units (e.g., from UTF-16 strings):
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index 48b7b9e6c..e04dee0a1 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -472,7 +472,7 @@ TEST(aborted_safe_roundtrip_base64) {
                                            buffer.size() - input_index,
                                            back.data(), second_length);
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
-        back.resize(r.count);
+        back.resize(second_length);
         ASSERT_EQUAL(second_length + limited_length, len);
 
         for (size_t i = 0; i < second_length; i++) {
@@ -524,7 +524,7 @@ TEST(aborted_safe_roundtrip_base64_16) {
                                            buffer.size() - input_index,
                                            back.data(), second_length);
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
-        back.resize(r.count);
+        back.resize(second_length);
         ASSERT_EQUAL(second_length + limited_length, len);
         for (size_t i = 0; i < second_length; i++) {
           ASSERT_EQUAL(source[i + limited_length], back[i]);
@@ -572,9 +572,8 @@ TEST(aborted_safe_roundtrip_base64_with_spaces) {
                                            buffer.size() - input_index,
                                            back.data(), second_length);
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
-        back.resize(r.count);
+        back.resize(second_length);
         ASSERT_EQUAL(second_length + limited_length, len);
-
         for (size_t i = 0; i < second_length; i++) {
           ASSERT_EQUAL(source[i + limited_length], back[i]);
         }
@@ -627,7 +626,7 @@ TEST(aborted_safe_roundtrip_base64_16_with_spaces) {
                                            buffer.size() - input_index,
                                            back.data(), second_length);
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
-        back.resize(r.count);
+        back.resize(second_length);
         ASSERT_EQUAL(second_length + limited_length, len);
         for (size_t i = 0; i < second_length; i++) {
           ASSERT_EQUAL(source[i + limited_length], back[i]);
@@ -720,6 +719,27 @@ TEST(readme_test) {
   back.resize(outpos);
 }
 
+TEST(readme_safe) {
+  size_t len = 72;
+  std::vector<char> base64(len, 'a');
+  std::vector<char> back((len + 3) / 4 * 3);
+  size_t limited_length = back.size() / 2; // Intentionally too small
+  simdutf::result r = simdutf::base64_to_binary_safe(
+            base64.data(), base64.size(), back.data(), limited_length);
+  ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+
+  // We decoded 'limited_length' bytes to back.
+  // Now let us decode the rest !!!
+  size_t input_index = r.count;
+  size_t limited_length2 = back.size();
+  r = simdutf::base64_to_binary_safe(base64.data() + input_index,
+                                           base64.size() - input_index,
+                                           back.data(), limited_length2);
+  ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+  back.resize(limited_length2);
+  ASSERT_EQUAL(limited_length2 + limited_length, (len + 3) / 4 * 3);
+}
+
 int main(int argc, char *argv[]) {
   if (argc == 2) {
     try {

From 3d87826adf5d9c224749a0f9cff04ead6b004ed6 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 27 Mar 2024 22:45:18 -0400
Subject: [PATCH 31/49] base64url (first steps)

---
 README.md                             |  34 ++--
 include/simdutf/implementation.h      |  30 ++-
 scripts/base64/table.py               |  42 ++++
 src/arm64/arm_base64.cpp              |  15 +-
 src/arm64/implementation.cpp          |  12 +-
 src/fallback/implementation.cpp       |  12 +-
 src/haswell/avx2_base64.cpp           |  13 +-
 src/haswell/implementation.cpp        |  12 +-
 src/icelake/icelake_base64.inl.cpp    |  13 +-
 src/icelake/implementation.cpp        |  12 +-
 src/implementation.cpp                |  42 ++--
 src/ppc64/implementation.cpp          |  12 +-
 src/rvv/implementation.cpp            |  12 +-
 src/scalar/base64.h                   |  63 +++---
 src/simdutf/arm64/implementation.h    |   6 +-
 src/simdutf/fallback/implementation.h |   6 +-
 src/simdutf/haswell/implementation.h  |   6 +-
 src/simdutf/icelake/implementation.h  |   6 +-
 src/simdutf/ppc64/implementation.h    |   6 +-
 src/simdutf/rvv/implementation.h      |   6 +-
 src/simdutf/westmere/implementation.h |   6 +-
 src/tables/base64_tables.h            | 265 +++++++++++++++++++++++++-
 src/westmere/implementation.cpp       |  12 +-
 src/westmere/sse_base64.cpp           |  13 +-
 tests/base64_tests.cpp                |  20 ++
 25 files changed, 519 insertions(+), 157 deletions(-)
 create mode 100644 scripts/base64/table.py

diff --git a/README.md b/README.md
index e78038ab0..be0cff736 100644
--- a/README.md
+++ b/README.md
@@ -1631,6 +1631,14 @@ a few tens of kilobytes.
 The specification of our base64 functions is as follows:
 
 ```C++
+
+// base64_options are used to specify the base64 encoding options.
+using base64_options = uint64_t;
+enum : base64_options {
+  base64_default = 0, /* standard base64 format */
+  base64_url = 1 /* base64url format*/
+};
+
 /**
  * Provide the maximal binary length in bytes given the base64 input.
  * In general, if the input contains ASCII spaces, the result will be less than
@@ -1647,7 +1655,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * In general, if the input contains ASCII spaces, the result will be less than
  * the maximum length.
  *
- * @param input         the base64 input to process in 16-bit units
+ * @param input         the base64 input to process, in ASCII stored as 16-bit units
  * @param length        the length of the base64 input in 16-bit units
  * @return maximal number of binary bytes
  */
@@ -1677,9 +1685,10 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * in
  * @param input         the base64 string to process
  * @param length        the length of the string in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Provide the base64 length in bytes given the length of a binary input.
@@ -1698,9 +1707,10 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  * @param input         the binary to process
  * @param length        the length of the input in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return number of written bytes, will be equal to base64_length_from_binary(length)
  */
-size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
+size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Convert a base64 input to a binary ouput.
@@ -1723,13 +1733,13 @@ size_t binary_to_base64(const char * input, size_t length, char* output) noexcep
  * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
  * If you fail to provide that much space, the function may cause a buffer overflow.
  *
- * @param input         the base64 string to process in UTF-16 (native endianess)
+ * @param input         the base64 string to process, in ASCII stored as 16-bit units
  * @param length        the length of the string in 16-bit units
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
-
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
 
 /**
  * Convert a base64 input to a binary ouput.
@@ -1749,20 +1759,16 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
  * r.count contains the number of bytes decoded.
  *
- * When the error is OUTPUT_BUFFER_TOO_SMALL, then r.count contains the location in the input
- * where we stopped decoding.
- *
- * In all case, the outlen parameter is modified to contain the number of bytes
- * that have been written/decoded.
  *
  * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
  * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
  * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful. Note that the return convention of base64_to_binary_safe differs from base64_to_binary.
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful.
  */
-simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
-simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
 
 ```
 
diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index fbf20f8fc..a6784ba3a 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1380,6 +1380,13 @@ simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t le
  */
 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length);
 
+// base64_options are used to specify the base64 encoding options.
+using base64_options = uint64_t;
+enum : base64_options {
+  base64_default = 0, /* standard base64 format */
+  base64_url = 1 /* base64url format*/
+};
+
 /**
  * Provide the maximal binary length in bytes given the base64 input.
  * In general, if the input contains ASCII spaces, the result will be less than
@@ -1426,9 +1433,10 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * in
  * @param input         the base64 string to process
  * @param length        the length of the string in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Provide the base64 length in bytes given the length of a binary input.
@@ -1447,9 +1455,10 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  * @param input         the binary to process
  * @param length        the length of the input in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return number of written bytes, will be equal to base64_length_from_binary(length)
  */
-size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
+size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Convert a base64 input to a binary ouput.
@@ -1475,9 +1484,10 @@ size_t binary_to_base64(const char * input, size_t length, char* output) noexcep
  * @param input         the base64 string to process, in ASCII stored as 16-bit units
  * @param length        the length of the string in 16-bit units
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output)  noexcept;
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
 
 /**
  * Convert a base64 input to a binary ouput.
@@ -1502,10 +1512,11 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
  * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful.
  */
-simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept;
-simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
 
 /**
  * An implementation of simdutf for a particular CPU architecture.
@@ -2609,9 +2620,10 @@ class implementation {
    * @param input         the base64 string to process
    * @param length        the length of the string in bytes
    * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
    * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
    */
-  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept = 0;
+  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
   /**
    * Convert a base64 input to a binary ouput.
@@ -2633,9 +2645,10 @@ class implementation {
    * @param input         the base64 string to process, in ASCII stored as 16-bit units
    * @param length        the length of the string in 16-bit units
    * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
    * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
    */
-  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept = 0;
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
   /**
    * Provide the base64 length in bytes given the length of a binary input.
@@ -2654,9 +2667,10 @@ class implementation {
    * @param input         the binary to process
    * @param length        the length of the input in bytes
    * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
    * @return number of written bytes, will be equal to base64_length_from_binary(length)
    */
-  virtual size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept = 0;
+  virtual size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
 
 protected:
diff --git a/scripts/base64/table.py b/scripts/base64/table.py
new file mode 100644
index 000000000..ff83aa316
--- /dev/null
+++ b/scripts/base64/table.py
@@ -0,0 +1,42 @@
+import base64
+#default:
+#t=[255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]
+#baseur::
+t=[255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]
+def formula(a, b, c, d):
+    if(a >= 64 or b >= 64 or c >= 64 or d >= 64):
+        return 0x01ffffff
+    z =[ ((a * 4) + (b // 16))% 256, ((b * 16) % 256 + (c // 4))% 256 , ((c * 64) % 256 + d)% 256 ]
+    return z[0] + (z[1] << 8) + (z[2] << 16)
+
+acc = "const uint32_t d0[256] = {"
+for i in range(256):
+    a = formula(t[i], 0, 0, 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d1[256] = {"
+for i in range(256):
+    a = formula(0, t[i], 0, 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d2[256] = {"
+for i in range(256):
+    a = formula(0, 0, t[i], 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d3[256] = {"
+for i in range(256):
+    a = formula(0, 0, 0, t[i])
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
\ No newline at end of file
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 239176d97..58d885e17 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -26,7 +26,7 @@
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
 
-size_t encode_base64(char *dst, const char *src, size_t srclen) {
+size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options options) {
   // credit: Wojciech Muła
   uint8_t *out = (uint8_t *)dst;
   constexpr static uint8_t source_table[64] = {
@@ -55,7 +55,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     vst4q_u8(out, result);
     out += 64;
   }
-  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
 
   return size_t((char *)out - dst);
 }
@@ -236,7 +236,8 @@ void base64_decode_block(char *out, const char *src) {
 }
 
 template <typename char_type>
-result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
+result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -265,7 +266,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
         src -= 64;
 
         while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -302,7 +303,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
   int last_block = (int)((bufferptr - buffer_start) % 64);
   if (last_block != 0 && srcend - src + last_block >= 64) {
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -346,7 +347,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -387,7 +388,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/arm64/implementation.cpp b/src/arm64/implementation.cpp
index f8d6a566a..75fc037b5 100644
--- a/src/arm64/implementation.cpp
+++ b/src/arm64/implementation.cpp
@@ -839,24 +839,24 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return encode_base64(output, input, length, options);
 }
 
 
diff --git a/src/fallback/implementation.cpp b/src/fallback/implementation.cpp
index c469dbbef..f7c7d9321 100644
--- a/src/fallback/implementation.cpp
+++ b/src/fallback/implementation.cpp
@@ -349,7 +349,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -359,7 +359,7 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 
@@ -367,7 +367,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -377,15 +377,15 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::tail_encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::tail_encode_base64(output, input, length, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 1f222b3b8..12954b60c 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -149,7 +149,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     out += 32;
   }
   return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
 }
 
 static inline void compress(__m128i data, uint16_t mask, char *output) {
@@ -329,7 +329,8 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
 }
 
 template <typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -361,7 +362,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
       if (error) {
         src -= 64;
         while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -407,7 +408,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   if (last_block != 0 && srcend - src + last_block >= 64) {
 
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -455,7 +456,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -495,7 +496,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 733f83b62..8f24b7e2a 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -782,24 +782,24 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return encode_base64(output, input, length, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index a7ff0c091..94eb4feb8 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -58,7 +58,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     out += 64;
   }
   return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
 }
 
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
@@ -138,7 +138,8 @@ static inline void base64_decode_block(char *out, block64 *b) {
 }
 
 template <typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -167,7 +168,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
       if (error) {
         src -= 64;
         while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -203,7 +204,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   if (last_block != 0 && srcend - src + last_block >= 64) {
 
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -245,7 +246,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -286,7 +287,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index dae4f0dfd..183de14bc 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1368,16 +1368,16 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 
@@ -1385,8 +1385,8 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return encode_base64(output, input, length, options);
 }
 
 } // namespace SIMDUTF_IMPLEMENTATION
diff --git a/src/implementation.cpp b/src/implementation.cpp
index e964ce565..48a14e386 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -459,24 +459,24 @@ class detect_best_supported_implementation_on_first_use final : public implement
     return set_best()->maximal_binary_length_from_base64(input, length);
   }
 
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept override {
-    return set_best()->base64_to_binary(input, length, output);
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output, options);
   }
 
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept override {
     return set_best()->maximal_binary_length_from_base64(input, length);
   }
 
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept override {
-    return set_best()->base64_to_binary(input, length, output);
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output, options);
   }
 
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept override {
     return set_best()->base64_length_from_binary(length);
   }
 
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept override {
-    return set_best()->binary_to_base64(input, length, output);
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->binary_to_base64(input, length, output, options);
   }
 
   simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
@@ -823,7 +823,7 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
-  simdutf_warn_unused result base64_to_binary(const char *, size_t, char*) const noexcept override {
+  simdutf_warn_unused result base64_to_binary(const char *, size_t, char*, base64_options) const noexcept override {
     return result(error_code::OTHER, 0);
   }
 
@@ -831,7 +831,7 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
-  simdutf_warn_unused result base64_to_binary(const char16_t *, size_t, char*) const noexcept override {
+  simdutf_warn_unused result base64_to_binary(const char16_t *, size_t, char*, base64_options) const noexcept override {
     return result(error_code::OTHER, 0);
   }
 
@@ -840,7 +840,7 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
-  size_t binary_to_base64(const char *, size_t, char*) const noexcept override {
+  size_t binary_to_base64(const char *, size_t, char*, base64_options) const noexcept override {
     return 0;
   }
 
@@ -1290,34 +1290,34 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
   return get_default_implementation()->maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept {
-  return get_default_implementation()->base64_to_binary(input, length, output);
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output, options);
 }
 
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept {
   return get_default_implementation()->maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) noexcept {
-  return get_default_implementation()->base64_to_binary(input, length, output);
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output, options);
 }
 
 template <typename chartype>
-simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, size_t length, char* output, size_t& outlen) noexcept {
+simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
   static_assert(std::is_same<chartype, char>::value || std::is_same<chartype, char16_t>::value, "Only char and char16_t are supported.");
   // The implementation could be nicer, but we expect that most times, the user
   // will provide us with a buffer that is large enough.
   size_t max_length = maximal_binary_length_from_base64(input, length);
   if(outlen >= max_length) {
     // fast path
-    result r = base64_to_binary(input, length, output);
+    result r = base64_to_binary(input, length, output, options);
     if(r.error != error_code::INVALID_BASE64_CHARACTER) { outlen = r.count; r.count = length; }
     return r;
   }
   // The output buffer is maybe too small. We will decode a truncated version of the input.
   size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
   size_t safe_input = base64_length_from_binary(outlen3);
-  result r = base64_to_binary(input, safe_input, output);
+  result r = base64_to_binary(input, safe_input, output, options);
   if(r.error == error_code::INVALID_BASE64_CHARACTER) { return r; }
   size_t offset = (r.error == error_code::BASE64_INPUT_REMAINDER) ? 1 :
     ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1);
@@ -1340,18 +1340,18 @@ simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, si
       tail_length--;
     }
   }
-  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, tail_input, tail_length);
+  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, tail_input, tail_length, options);
   outlen = output_index + remaining_out;
   r.count += input_index;
   return r;
 }
 
 
-simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen) noexcept {
-  return base64_to_binary_safe_impl<char>(input, length, output, outlen);
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
+  return base64_to_binary_safe_impl<char>(input, length, output, outlen, options);
 }
-simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen) noexcept {
-  return base64_to_binary_safe_impl<char16_t>(input, length, output, outlen);
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
+  return base64_to_binary_safe_impl<char16_t>(input, length, output, outlen, options);
 }
 
 simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
diff --git a/src/ppc64/implementation.cpp b/src/ppc64/implementation.cpp
index 161ae19d9..f33444d41 100644
--- a/src/ppc64/implementation.cpp
+++ b/src/ppc64/implementation.cpp
@@ -298,7 +298,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -308,23 +308,23 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return scalar::base64::base64_to_binary(input, length, output);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::base64_to_binary(input, length, output, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::binary_to_base64(input, length, output);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::binary_to_base64(input, length, output, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/rvv/implementation.cpp b/src/rvv/implementation.cpp
index 7dda20c8a..7b4ecf96b 100644
--- a/src/rvv/implementation.cpp
+++ b/src/rvv/implementation.cpp
@@ -82,7 +82,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -92,7 +92,7 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 
@@ -100,7 +100,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -110,15 +110,15 @@ simdutf_warn_unused result implementation::base64_to_binary(const char16_t * inp
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::tail_encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::tail_encode_base64(output, input, length, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index 7a19087fe..3e3b617bb 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -12,7 +12,13 @@ namespace base64 {
 // Returns true upon success. The destination buffer must be large enough.
 // This functions assumes that the padding (=) has been removed.
 template <class char_type>
-result base64_tail_decode(char *dst, const char_type *src, size_t length) {
+result base64_tail_decode(char *dst, const char_type *src, size_t length, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url) ? tables::base64::base64_url::d2 : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3;
+
   const char_type *srcend = src + length;
   const char_type *srcinit = src;
   const char *dstinit = dst;
@@ -22,8 +28,8 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
   uint8_t buffer[4];
   while (true) {
     while (src + 4 <= srcend &&
-           (x = tables::base64::d0[uint8_t(src[0])] | tables::base64::d1[uint8_t(src[1])] |
-                tables::base64::d2[uint8_t(src[2])] | tables::base64::d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
       if(match_system(endianness::BIG)) {
         x = scalar::utf32::swap_bytes(x);
       }
@@ -35,7 +41,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
       char_type c = *src;
-      uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
+      uint8_t code = to_base64[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
         idx++;
@@ -98,7 +104,13 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length) {
 // outlen is modified to reflect the number of bytes written.
 // This functions assumes that the padding (=) has been removed.
 template <class char_type>
-result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length) {
+result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url) ? tables::base64::base64_url::d2 : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3;
+
   const char_type *srcend = src + length;
   const char_type *srcinit = src;
   const char *dstinit = dst;
@@ -109,8 +121,8 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
   uint8_t buffer[4];
   while (true) {
     while (src + 4 <= srcend &&
-           (x = tables::base64::d0[uint8_t(src[0])] | tables::base64::d1[uint8_t(src[1])] |
-                tables::base64::d2[uint8_t(src[2])] | tables::base64::d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
       if(match_system(endianness::BIG)) {
         x = scalar::utf32::swap_bytes(x);
       }
@@ -128,7 +140,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
       char_type c = *src;
-      uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
+      uint8_t code = to_base64[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
         idx++;
@@ -203,35 +215,38 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
 
 // Returns the number of bytes written. The destination buffer must be large
 // enough. It will add padding (=) if needed.
-size_t tail_encode_base64(char *dst, const char *src, size_t srclen) {
+size_t tail_encode_base64(char *dst, const char *src, size_t srclen, base64_options options) {
+  const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0 : tables::base64::base64_default::e0;
+  const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1 : tables::base64::base64_default::e1;
+  const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2 : tables::base64::base64_default::e2;
   char *out = dst;
   size_t i = 0;
   uint8_t t1, t2, t3;
   for (; i + 2 < srclen; i += 3) {
-    t1 = (uint8_t)src[i];
-    t2 = (uint8_t)src[i + 1];
-    t3 = (uint8_t)src[i + 2];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
-    *out++ = tables::base64::e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
-    *out++ = tables::base64::e2[t3];
+    t1 = uint8_t(src[i]);
+    t2 = uint8_t(src[i + 1]);
+    t3 = uint8_t(src[i + 2]);
+    *out++ = e0[t1];
+    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
+    *out++ = e2[t3];
   }
   switch (srclen - i) {
   case 0:
     break;
   case 1:
-    t1 = (uint8_t)src[i];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[(t1 & 0x03) << 4];
+    t1 = uint8_t(src[i]);
+    *out++ = e0[t1];
+    *out++ = e1[(t1 & 0x03) << 4];
     *out++ = '=';
     *out++ = '=';
     break;
   default: /* case 2 */
-    t1 = (uint8_t)src[i];
-    t2 = (uint8_t)src[i + 1];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
-    *out++ = tables::base64::e2[(t2 & 0x0F) << 2];
+    t1 = uint8_t(src[i]);
+    t2 = uint8_t(src[i + 1]);
+    *out++ = e0[t1];
+    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = e2[(t2 & 0x0F) << 2];
     *out++ = '=';
   }
   return (size_t)(out - dst);
diff --git a/src/simdutf/arm64/implementation.h b/src/simdutf/arm64/implementation.h
index 5e0d89ace..1d6fbd423 100644
--- a/src/simdutf/arm64/implementation.h
+++ b/src/simdutf/arm64/implementation.h
@@ -90,11 +90,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace arm64
diff --git a/src/simdutf/fallback/implementation.h b/src/simdutf/fallback/implementation.h
index c8dfc2037..40fdcc246 100644
--- a/src/simdutf/fallback/implementation.h
+++ b/src/simdutf/fallback/implementation.h
@@ -93,11 +93,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 } // namespace fallback
 } // namespace simdutf
diff --git a/src/simdutf/haswell/implementation.h b/src/simdutf/haswell/implementation.h
index 79969941b..f3eb7e4db 100644
--- a/src/simdutf/haswell/implementation.h
+++ b/src/simdutf/haswell/implementation.h
@@ -92,11 +92,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace haswell
diff --git a/src/simdutf/icelake/implementation.h b/src/simdutf/icelake/implementation.h
index 4638bf9b9..495a05a59 100644
--- a/src/simdutf/icelake/implementation.h
+++ b/src/simdutf/icelake/implementation.h
@@ -92,11 +92,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace icelake
diff --git a/src/simdutf/ppc64/implementation.h b/src/simdutf/ppc64/implementation.h
index 7fd324493..ee0c7dcd4 100644
--- a/src/simdutf/ppc64/implementation.h
+++ b/src/simdutf/ppc64/implementation.h
@@ -70,11 +70,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace ppc64
diff --git a/src/simdutf/rvv/implementation.h b/src/simdutf/rvv/implementation.h
index 56f02362d..d4e668581 100644
--- a/src/simdutf/rvv/implementation.h
+++ b/src/simdutf/rvv/implementation.h
@@ -94,11 +94,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, size_t len) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 private:
   const bool _supports_zvbb;
 
diff --git a/src/simdutf/westmere/implementation.h b/src/simdutf/westmere/implementation.h
index 190693783..d10dfb433 100644
--- a/src/simdutf/westmere/implementation.h
+++ b/src/simdutf/westmere/implementation.h
@@ -90,11 +90,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace westmere
diff --git a/src/tables/base64_tables.h b/src/tables/base64_tables.h
index a0f997733..f835f141b 100644
--- a/src/tables/base64_tables.h
+++ b/src/tables/base64_tables.h
@@ -7,6 +7,7 @@ namespace simdutf {
 namespace {
 namespace tables {
 namespace base64 {
+namespace base64_default {
 
 const char e0[256] = {
     'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
@@ -68,8 +69,6 @@ const char e2[256] = {
     'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
     '/'};
 
-/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN CPUS */
-
 const uint32_t d0[256] = {
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
@@ -249,6 +248,247 @@ const uint32_t d3[256] = {
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+} // namespace base64_default
+
+namespace base64_url {
+
+const char e0[256] = {
+    'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
+    'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
+    'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
+    'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
+    'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
+    'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
+    'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
+    'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
+    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
+    'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
+    'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
+    'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
+    't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
+    'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
+    '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
+    '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
+    '8', '8', '8', '8', '9', '9', '9', '9', '-', '-', '-', '-', '_', '_', '_',
+    '_'};
+
+const char e1[256] = {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
+    '_'};
+
+const char e2[256] = {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
+    '_'};
+
+const uint32_t d0[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff,
+    0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
+    0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
+    0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
+    0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
+    0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
+    0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc,
+    0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
+    0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
+    0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
+    0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
+    0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d1[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff,
+    0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
+    0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
+    0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
+    0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
+    0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
+    0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003,
+    0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
+    0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
+    0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
+    0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
+    0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d2[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff,
+    0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
+    0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
+    0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
+    0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
+    0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
+    0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00,
+    0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
+    0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
+    0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
+    0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
+    0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d3[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff,
+    0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
+    0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
+    0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
+    0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
+    0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
+    0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000,
+    0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
+    0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
+    0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
+    0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
+    0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+} // namespace base64_url
 const uint64_t thintable_epi8[256] = {
     0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
     0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
@@ -388,6 +628,27 @@ const uint8_t to_base64_value[] = {
     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     255};
+
+const uint8_t to_base64_url_value[] = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    62,  255, 255, 52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 63,  255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+
 } // namespace base64
 } // namespace tables
 } // unnamed namespace
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index a491818c1..c995df881 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -783,16 +783,16 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length, base64_options options) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length, options);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return compress_decode_base64(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index ef57f5184..c72118c7b 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -158,7 +158,7 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
   }
 
   return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
 }
 static inline void compress(__m128i data, uint16_t mask, char *output) {
   if (mask == 0) {
@@ -339,7 +339,8 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
 }
 
 template <typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -371,7 +372,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
       if (error) {
         src -= 64;
         while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -416,7 +417,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
   int last_block = (int)((bufferptr - buffer_start) % 64);
   if (last_block != 0 && srcend - src + last_block >= 64) {
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -464,7 +465,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -506,7 +507,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index e04dee0a1..a22dd18d0 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -31,6 +31,26 @@ const uint8_t to_base64_value[] = {
     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     255};
 
+
+const uint8_t to_base64url_value[] = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  255,
+    62, 255, 255,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 63, 255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
 template <typename char_type>
 size_t add_space(std::vector<char_type> &v, std::mt19937 &gen) {
   const static std::array<char_type, 4> space = {' ', '\t', '\n', '\r'};

From c72079ce7f70663f64308897e16beb90e7b88d5c Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 28 Mar 2024 11:56:01 -0400
Subject: [PATCH 32/49] working through

---
 scripts/base64/neon_decode.py      | 33 ++++++++++++++++++++++++++++++
 src/arm64/arm_base64.cpp           | 16 +++++++++++----
 src/arm64/implementation.cpp       |  4 ++--
 src/haswell/avx2_base64.cpp        |  4 ++--
 src/haswell/implementation.cpp     |  4 ++--
 src/icelake/icelake_base64.inl.cpp |  4 ++--
 src/icelake/implementation.cpp     |  4 ++--
 src/westmere/implementation.cpp    |  4 ++--
 src/westmere/sse_base64.cpp        |  4 ++--
 9 files changed, 59 insertions(+), 18 deletions(-)
 create mode 100644 scripts/base64/neon_decode.py

diff --git a/scripts/base64/neon_decode.py b/scripts/base64/neon_decode.py
new file mode 100644
index 000000000..6ce185cb5
--- /dev/null
+++ b/scripts/base64/neon_decode.py
@@ -0,0 +1,33 @@
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
+spaces=' \t\n\r'
+lut_lo = [0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4]
+lut_hi = [0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20]
+roll = [0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0]
+def decode(s):
+    low = s & 0xf
+    high = s >> 4
+    m = lut_lo[low] & lut_hi[high]
+    if(m > 0x3):
+        return (m, None)
+    if s == 0x2f:
+        off = roll[high - 1]
+    else:
+        off = roll[high]
+    return (m,(s + off)&0xff)
+
+for i in range(256):
+    m,d = decode(i)
+    if d is None:
+        assert t.find(chr(i)) == -1
+        assert spaces.find(chr(i)) == -1
+        continue
+    if m == 0:
+        assert d >= 0
+        # we must have a base64 element
+        v = t.find(chr(i))
+        #print(i, chr(i), v, d)
+        assert v == d
+    else:
+        # we must have a space
+        v = spaces.find(chr(i))
+        assert v >= 0
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 58d885e17..698a3e723 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -36,8 +36,15 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options o
       '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
       'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
   };
+  constexpr static uint8_t source_table_url[64] = {
+      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
+      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
+      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
+      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
+      'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
+  };
   const uint8x16_t v3f = vdupq_n_u8(0x3f);
-  const uint8x16x4_t table = vld4q_u8(source_table);
+  const uint8x16x4_t table = vld4q_u8((options&base64_url) ? source_table_url : source_table);
   size_t i = 0;
   for (; i + 16 * 3 <= srclen; i += 16 * 3) {
     const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
@@ -94,6 +101,7 @@ struct block64 {
   uint8x16_t chunks[4];
 };
 static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
+template <bool base64_url>
 uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t v0f = vdupq_n_u8(0xf);
 
@@ -235,9 +243,9 @@ void base64_decode_block(char *out, const char *src) {
   vst3q_u8((uint8_t *)out, outvec);
 }
 
-template <typename char_type>
+template <bool base64_url, typename char_type>
 result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -261,7 +269,7 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, ba
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
 
diff --git a/src/arm64/implementation.cpp b/src/arm64/implementation.cpp
index 75fc037b5..e0a35f071 100644
--- a/src/arm64/implementation.cpp
+++ b/src/arm64/implementation.cpp
@@ -840,7 +840,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
@@ -848,7 +848,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 12954b60c..d6886aa86 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -328,9 +328,9 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 24, buffer, 24);
 }
 
-template <typename chartype>
+template <bool base64_url, typename chartype>
 result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 8f24b7e2a..4d3f1951e 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -783,7 +783,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
@@ -791,7 +791,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 94eb4feb8..8b1882ca8 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -137,9 +137,9 @@ static inline void base64_decode_block(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
 }
 
-template <typename chartype>
+template <bool base64_url, typename chartype>
 result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index 183de14bc..8aa9cf886 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1369,7 +1369,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
@@ -1377,7 +1377,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index c995df881..14565397b 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -784,7 +784,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length, base64_options options) const noexcept {
@@ -792,7 +792,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
-  return compress_decode_base64(output, input, length, options);
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index c72118c7b..2fc986acc 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -338,9 +338,9 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 36, buffer, 12);
 }
 
-template <typename chartype>
+template <bool base64_url, typename chartype>
 result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;

From 200b6bc153511ba657cf9b08d0e283fc06364366 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 28 Mar 2024 20:34:13 -0400
Subject: [PATCH 33/49] implemented base64url for ARM.

---
 benchmarks/base64/benchmark_base64.cpp |  32 ++++-
 scripts/base64/neon_decode.py          | 119 ++++++++++++++++
 src/arm64/arm_base64.cpp               | 143 +++++++++++++------
 tests/base64_tests.cpp                 | 182 ++++++++++++++++++++++++-
 4 files changed, 435 insertions(+), 41 deletions(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index 22eaf8dc5..3fb475d58 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -67,7 +67,7 @@ int base64_decode_skip_spaces(const char *src, size_t srclen, char *out,
   return !state.bytes;
 }
 
-enum : uint8_t { roundtrip = 0, decode = 1, encode = 2, bun = 3 };
+enum : uint8_t { roundtrip = 0, decode = 1, encode = 2, bun = 3, roundtripurl = 4 };
 
 event_collector collector;
 
@@ -116,6 +116,7 @@ void show_help() {
   printf("  -d, --decode      Decode the input file\n");
   printf("  -e, --encode      Encode the input file\n");
   printf("  -r, --roundtrip   Roundtrip the input file\n");
+  printf("  --roundtripurl    Roundtrip the input file (URL)\n");
   printf("  -b, --bun         Bun benchmark\n");
 
   printf(" See https://github.com/lemire/base64data for test data.\n");
@@ -212,6 +213,33 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
   printf("# number of inputs: %zu\n", data.size());
 
   switch (mode) {
+
+  case roundtripurl: {
+    printf("# roundtrip (url)\n");
+    for (auto &e : simdutf::get_available_implementations()) {
+      if (!e->supported_by_runtime_system()) {
+        continue;
+      }
+      pretty_print(data.size(), volume, "simdutf::" + e->name(),
+                   bench([&data, &buffer1, &buffer2, &e]() {
+                     for (const std::vector<char> &source : data) {
+                       size_t base64_size = e->binary_to_base64(
+                           source.data(), source.size(), buffer1.data(), simdutf::base64_url);
+                       auto err = e->base64_to_binary(
+                           buffer1.data(), base64_size, buffer2.data(), simdutf::base64_url);
+                       if (err.error) {
+                         std::cerr << "Error:  at position " << err.count
+                                   << std::endl;
+                       } else if (err.count != source.size()) {
+                         std::cerr << "Error: " << err.count
+                                   << " bytes decoded, expected "
+                                   << source.size() << std::endl;
+                       }
+                     }
+                   }));
+    }
+    break;
+  }
   case roundtrip: {
     printf("# roundtrip\n");
     pretty_print(
@@ -439,6 +467,8 @@ int main(int argc, char **argv) {
       mode = encode;
     } else if ((arg == "-r") || (arg == "--roundtrip")) {
       mode = roundtrip;
+    } else if (arg == "--roundtripurl") {
+      mode = roundtripurl;
     } else if ((arg == "-b") || (arg == "--bun")) {
       mode = bun;
     } else {
diff --git a/scripts/base64/neon_decode.py b/scripts/base64/neon_decode.py
index 6ce185cb5..88e5cab95 100644
--- a/scripts/base64/neon_decode.py
+++ b/scripts/base64/neon_decode.py
@@ -31,3 +31,122 @@ def decode(s):
         # we must have a space
         v = spaces.find(chr(i))
         assert v >= 0
+
+
+
+
+## 0x2d is '-' in base64
+## 0x5f is '_' in base64
+
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
+spaces=' \t\n\r'
+
+#3 numbers
+#4-6 letters
+#5-7 letters
+
+#0x2d
+#0x5f
+
+lut_lo = [0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb0]
+lut_hi = [0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20]
+roll = [0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0]
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
+spaces=' \t\n\r' ## ['0x20', '0x9', '0xa', '0xd']
+
+
+lut_lo = [0x0 for i in range(16)]
+lut_hi = [0x0 for i in range(16)]
+#roll = [0 for i in range(16)]
+
+#0x00 are forbidden except for \t \n \r which go to one
+lut_hi[0] = 0x11
+#for c in '\t\n\r':
+#    lut_lo[ord(c) & 0xf] = 0x1
+for z in range(16):
+    if '\t\n\r'.find(chr(z)) != -1:
+        lut_lo[z & 0xf] = 0x1 # allowed
+    else:
+        lut_lo[z] = 0x10 # forbidden
+#0x10 and 0x80 all forbidden
+lut_hi[0x1] = 0x20
+for z in range(0x8, 16):
+    lut_hi[z] = 0x20
+#lut_hi[0x8] = 0x20
+
+for z in range(16):
+    lut_lo[z] |= 0x20
+
+#0x20 selective
+lut_hi[0x2] = 0x42
+for z in range(16):
+    if z == 0:
+        lut_lo[z] |= 0x2
+    elif z != 0xd:
+        lut_lo[z] |= 0x40
+
+
+#0x30 numbers
+lut_hi[0x3] = 0x80
+for z in range(10,16):
+    lut_lo[z] |= 0x80
+
+#0x40, 0x60 letters
+lut_hi[0x4] = 0x8
+lut_hi[0x6] = 0x8
+lut_lo[0] |= 0x8
+
+#0x7 letters
+#0x5 letters
+lut_hi[0x5] |= 0x4
+lut_hi[0x7] |= 0x4
+for i in range(0xb,16):
+    lut_lo[i] |= 0x4
+
+
+
+
+
+def decode(s):
+    low = s & 0xf
+    high = s >> 4
+    m = lut_lo[low] & lut_hi[high]
+    is_underscore = s == 0x5f
+    if(is_underscore):
+        m = 0
+        high = 0
+    if(m > 0x3):
+        return (m, None)
+    if s == 0x2d:
+        off = roll[high - 1]
+    else:
+        off = roll[high]
+    return (m,(s + off)&0xff)
+print(",".join([hex(c) for c in lut_lo]))
+print(",".join([hex(c) for c in lut_hi]))
+print(",".join([hex(c) for c in roll]))
+
+#for c in spaces:
+#    print(hex(ord(c)),decode(ord(c)))
+
+#import sys
+#sys.exit(0)
+
+for i in range(256):
+    m,d = decode(i)
+    #print(hex(i), m, d, chr(i))
+    if d is None:
+        assert t.find(chr(i)) == -1
+        assert spaces.find(chr(i)) == -1
+        continue
+    if m == 0:
+        assert d >= 0
+        # we must have a base64 element
+        v = t.find(chr(i))
+        if(v != d): 
+            print(hex(i), chr(i), v, d)
+        #assert v == d
+    else:
+        # we must have a space
+        v = spaces.find(chr(i))
+        assert v >= 0
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 698a3e723..877d9bdb7 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -26,7 +26,8 @@
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
 
-size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options options) {
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
   // credit: Wojciech Muła
   uint8_t *out = (uint8_t *)dst;
   constexpr static uint8_t source_table[64] = {
@@ -44,7 +45,8 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options o
       'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
   };
   const uint8x16_t v3f = vdupq_n_u8(0x3f);
-  const uint8x16x4_t table = vld4q_u8((options&base64_url) ? source_table_url : source_table);
+  const uint8x16x4_t table =
+      vld4q_u8((options & base64_url) ? source_table_url : source_table);
   size_t i = 0;
   for (; i + 16 * 3 <= srclen; i += 16 * 3) {
     const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
@@ -62,7 +64,8 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options o
     vst4q_u8(out, result);
     out += 64;
   }
-  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
+  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i,
+                                            options);
 
   return size_t((char *)out - dst);
 }
@@ -101,10 +104,22 @@ struct block64 {
   uint8x16_t chunks[4];
 };
 static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
-template <bool base64_url>
-uint64_t to_base64_mask(block64 *b, bool *error) {
+template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t v0f = vdupq_n_u8(0xf);
 
+  uint8x16_t underscore0, underscore1, underscore2, underscore3;
+  if (base64_url) {
+    underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f));
+    underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f));
+    underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f));
+    underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f));
+  } else {
+    (void)underscore0;
+    (void)underscore1;
+    (void)underscore2;
+    (void)underscore3;
+  }
+
   uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
   uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
   uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
@@ -114,31 +129,62 @@ uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
   uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4);
   uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4);
+  uint8x16_t lut_lo;
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t lut_lo =
-      simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                              0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4);
+  if (base64_url) {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xf4, 0xf4, 0xa5, 0xf4, 0xf4);
+  } else {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4);
+  }
 #else
-  const uint8x16_t lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                             0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4};
+  if (base64_url) {
+    lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+              0x70, 0x61, 0xe1, 0xf4, 0xf4, 0xa5, 0xf4, 0xf4};
+  } else {
+    lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+              0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4};
+  }
 #endif
   uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
   uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
   uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
   uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
+  uint8x16_t lut_hi;
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t lut_hi =
-      simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, 0x20,
-                              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  if (base64_url) {
+    lut_hi =
+        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  } else {
+    lut_hi =
+        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  }
 #else
-  const uint8x16_t lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
-                             0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  if (base64_url) {
+    lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  } else {
+    lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  }
 #endif
   uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
   uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
   uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2);
   uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3);
 
+  if (base64_url) {
+    hi0 = vbicq_u8(hi0, underscore0);
+    hi1 = vbicq_u8(hi1, underscore1);
+    hi2 = vbicq_u8(hi2, underscore2);
+    hi3 = vbicq_u8(hi3, underscore3);
+  }
+
   uint8_t checks =
       vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)),
                          vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3))));
@@ -169,23 +215,41 @@ uint64_t to_base64_mask(block64 *b, bool *error) {
   }
   // This is the transformation step that can be done while we are waiting for
   // sum0
+  uint8x16_t roll_lut;
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t roll_lut =
-      simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0,
-                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  if (base64_url) {
+    roll_lut =
+        simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  } else {
+    roll_lut =
+        simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  }
 #else
-  const uint8x16_t roll_lut = {0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
-                               0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  if (base64_url) {
+    roll_lut = {0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                0x0,  0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  } else {
+    roll_lut = {0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  }
 #endif
-  uint8x16_t v2f = vdupq_n_u8(0x2f);
-  uint8x16_t roll0 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], v2f), hi_nibbles0));
-  uint8x16_t roll1 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], v2f), hi_nibbles1));
-  uint8x16_t roll2 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], v2f), hi_nibbles2));
-  uint8x16_t roll3 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], v2f), hi_nibbles3));
+  uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f);
+  if (base64_url) {
+    hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0);
+    hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1);
+    hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2);
+    hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3);
+  }
+  uint8x16_t roll0 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0));
+  uint8x16_t roll1 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1));
+  uint8x16_t roll2 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2));
+  uint8x16_t roll3 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3));
   b->chunks[0] = vaddq_u8(b->chunks[0], roll0);
   b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
   b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
@@ -218,10 +282,10 @@ void load_block(block64 *b, const char *src) {
   b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
 }
 
-inline uint8x16_t load_satured(const uint16_t * data) {
-    uint16x8_t in1 = vld1q_u16(data);
-    uint16x8_t in2 = vld1q_u16(data+8);
-    return vqmovn_high_u16(vqmovn_u16(in1), in2);
+inline uint8x16_t load_satured(const uint16_t *data) {
+  uint16x8_t in1 = vld1q_u16(data);
+  uint16x8_t in2 = vld1q_u16(data + 8);
+  return vqmovn_high_u16(vqmovn_u16(in1), in2);
 }
 
 void load_block(block64 *b, const char16_t *src) {
@@ -244,8 +308,10 @@ void base64_decode_block(char *out, const char *src) {
 }
 
 template <bool base64_url, typename char_type>
-result compress_decode_base64(char *dst, const char_type *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+result compress_decode_base64(char *dst, const char_type *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -270,11 +336,11 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, ba
       src += 64;
       bool error = false;
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+      if(badcharmask)
       if (error) {
         src -= 64;
 
-        while (src < srcend &&
-               to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -396,7 +462,8 @@ result compress_decode_base64(char *dst, const char_type *src, size_t srclen, ba
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index a22dd18d0..6465811a6 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -109,7 +109,8 @@ TEST(encode_base64_cases) {
       {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
       {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
       {"123456", "MTIzNDU2"},
-      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"}};
+      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", "IVJ+SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c+fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
   printf(" -- ");
@@ -152,12 +153,63 @@ TEST(encode_base64_cases) {
   }
 }
 
+
+TEST(encode_base64url_cases) {
+  std::vector<std::pair<std::string, std::string>> cases = {
+      {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", "MTIzNDU2"},
+      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", "IVJ-SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c-fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
+  std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
+  std::vector<size_t> counts = {1};
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(
+        implementation.base64_length_from_binary(p.first.size()));
+    ASSERT_EQUAL(buffer.size(), p.second.size());
+    size_t s = implementation.binary_to_base64(p.first.data(), p.first.size(),
+                                               buffer.data(), simdutf::base64_url);
+    ASSERT_EQUAL(s, p.second.size());
+    ASSERT_TRUE(std::string(buffer.data(), buffer.size()) == p.second);
+  }
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data(), simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" --  ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length, simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
 TEST(encode_base64_cases_16) {
   std::vector<std::pair<std::string, std::u16string>> cases = {
       {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
       {"GeeksforGeeks", u"R2Vla3Nmb3JHZWVrcw=="},
       {"123456", u"MTIzNDU2"},
-      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"}};
+      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", u"IVJ+SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c+fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
   printf(" -- ");
@@ -191,6 +243,47 @@ TEST(encode_base64_cases_16) {
   }
 }
 
+
+TEST(encode_base64url_cases_16) {
+  std::vector<std::pair<std::string, std::u16string>> cases = {
+      {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", u"R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", u"MTIzNDU2"},
+      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", u"IVJ-SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c-fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
+  std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
+  std::vector<size_t> counts = {1};
+  printf(" -- ");
+
+  for (std::pair<std::string, std::u16string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data(), simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" -- ");
+  for (std::pair<std::string, std::u16string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length, simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
 TEST(roundtrip_base64) {
   for (size_t len = 0; len < 2048; len++) {
     std::vector<char> source(len, 0);
@@ -274,6 +367,91 @@ TEST(roundtrip_base64_16) {
   }
 }
 
+
+
+TEST(roundtrip_base64url) {
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data(), simdutf::base64_url);
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer.data(), size, back.data(), simdutf::base64_url);
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) {
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
+TEST(roundtrip_base64url_16) {
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data(), simdutf::base64_url);
+      buffer.resize(size);
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer16.data(), size, back.data(), simdutf::base64_url);
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) {
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
 TEST(doomed_base64_roundtrip) {
   for (size_t len = 0; len < 2048; len++) {
     std::vector<char> source(len, 0);

From 4971bc27358b32e0286c8c1f319e58d5cf184f54 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Thu, 28 Mar 2024 20:39:11 -0400
Subject: [PATCH 34/49] documentation.

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index be0cff736..63cbc326b 100644
--- a/README.md
+++ b/README.md
@@ -56,8 +56,8 @@ This library provide fast Unicode functions such as
 - From an UTF-16LE/BE string, compute the size of the UTF-32 equivalent string (equivalent to UTF-16 character counting),
 - UTF-8 and UTF-16LE/BE character counting,
 - UTF-16 endianness change (UTF16-LE/BE to UTF-16-BE/LE),
-- [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary,
-- Binary to base64.
+- [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) (with or without URL encoding) to binary,
+- Binary to base64 (with or without URL encoding).
 
 The functions are accelerated using SIMD instructions (e.g., ARM NEON, SSE, AVX, AVX-512, RISC-V Vector Extension, etc.). When your strings contain hundreds of characters, we can often transcode them at speeds exceeding a billion characters per second. You should expect high speeds not only with English strings (ASCII) but also Chinese, Japanese, Arabic, and so forth. We handle the full character range (including, for example, emojis).
 
@@ -1568,7 +1568,7 @@ void change_endianness_utf16(const char16_t * input, size_t length, char16_t * o
 Base64
 -----
 
-We also support converting from [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary, and back. In particular, you can convert base64 inputs which contain ASCII spaces to binary.
+We also support converting from [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary, and back. In particular, you can convert base64 inputs which contain ASCII spaces to binary. We also support the base64 URL encoding alternative.
 
 Converting binary data to base64 always succeeds and is relatively simple:
 ```C++

From c729247714ebad8035d659fe557be5149f95608b Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 29 Mar 2024 22:58:55 -0400
Subject: [PATCH 35/49] prototype base64url

---
 scripts/base64/avx512.py           |  79 ++++++++++
 scripts/base64/sse.py              | 239 +++++++++++++++++++++++++++++
 src/haswell/avx2_base64.cpp        | 151 ++++++++++++------
 src/haswell/implementation.cpp     |   6 +-
 src/icelake/icelake_base64.inl.cpp |  73 ++++++---
 src/icelake/implementation.cpp     |   6 +-
 src/westmere/implementation.cpp    |  12 +-
 src/westmere/sse_base64.cpp        | 119 +++++++++-----
 tests/base64_tests.cpp             |  16 ++
 9 files changed, 588 insertions(+), 113 deletions(-)
 create mode 100644 scripts/base64/avx512.py
 create mode 100644 scripts/base64/sse.py

diff --git a/scripts/base64/avx512.py b/scripts/base64/avx512.py
new file mode 100644
index 000000000..b09265cc8
--- /dev/null
+++ b/scripts/base64/avx512.py
@@ -0,0 +1,79 @@
+lookup_0 = [0 for i in range(64)]
+lookup_1 = [0 for i in range(64)]
+for i in range(64):
+    lookup_0[i] = 0x80
+    lookup_1[i] = 0x80
+lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+for i in range(64):
+    val = ord(lookup[i])
+    bit6 = val & 0x40
+    bits05 = val & 0x3f
+    if bit6:
+        lookup_1[bits05] = i
+    else:
+        lookup_0[bits05] = i
+allowed = "\t\r\n "
+for z in allowed:
+    lookup_0[ord(z)] = 0xff
+def sign8(x):
+    if x >= 128:
+        return x - 256
+    return x
+lookup_0.reverse()
+lookup_1.reverse()
+print("lookup0:")
+print(", ".join([str(sign8(i)) for i in lookup_0]))
+print("lookup1:")
+print(", ".join([str(sign8(i)) for i in lookup_1]))
+lookup = [0 for i in range(64)]
+output = 0
+for ifrom in range(16):
+    lookup[ifrom*4 + 0] = output + 3
+    lookup[ifrom*4 + 1] = output + 2
+    lookup[ifrom*4 + 2] = output + 1
+    lookup[ifrom*4 + 3] = output + 0
+    output += 4
+lookup.reverse()
+print("reverse:")
+print(", ".join([str(i) for i in lookup]))
+
+print("====")
+
+lookup_0 = [0 for i in range(64)]
+lookup_1 = [0 for i in range(64)]
+for i in range(64):
+    lookup_0[i] = 0x80
+    lookup_1[i] = 0x80
+lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+for i in range(64):
+    val = ord(lookup[i])
+    bit6 = val & 0x40
+    bits05 = val & 0x3f
+    if bit6:
+        lookup_1[bits05] = i
+    else:
+        lookup_0[bits05] = i
+allowed = "\0\t\r\n "
+for z in allowed:
+    lookup_0[ord(z)] = 0xff
+def sign8(x):
+    if x >= 128:
+        return x - 256
+    return x
+lookup_0.reverse()
+lookup_1.reverse()
+print("lookup0:")
+print(", ".join([str(sign8(i)) for i in lookup_0]))
+print("lookup1:")
+print(", ".join([str(sign8(i)) for i in lookup_1]))
+lookup = [0 for i in range(64)]
+output = 0
+for ifrom in range(16):
+    lookup[ifrom*4 + 0] = output + 3
+    lookup[ifrom*4 + 1] = output + 2
+    lookup[ifrom*4 + 2] = output + 1
+    lookup[ifrom*4 + 3] = output + 0
+    output += 4
+lookup.reverse()
+print("reverse:")
+print(", ".join([str(i) for i in lookup]))
\ No newline at end of file
diff --git a/scripts/base64/sse.py b/scripts/base64/sse.py
new file mode 100644
index 000000000..d50cfd9f6
--- /dev/null
+++ b/scripts/base64/sse.py
@@ -0,0 +1,239 @@
+import sys
+delta_asso = [0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F]
+check_asso = [0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F]
+
+delta_values =[(0x00), (0x00), (0x00), (0x13), (0x04), (0xBF), (0xBF), (0xB9), (0xB9), (0x00), (0x10), (0xC3), (0xBF), (0xBF), (0xB9), (0xB9)]
+check_values = [(0x80), (0x80), (0x80), (0x80), (0xCF), (0xBF), (0xD5), (0xA6), (0xB5), (0x86), (0xD1), (0x80), (0xB1), (0x80), (0x91), (0x80)]
+
+
+valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+
+def safechr(i):
+    if i < 32:
+        return '.'
+    if i > 127:
+        return '?'
+    return chr(i)
+
+def safehex(x):
+    return "0x{0:2x}".format(x)
+
+def to_signed(x):
+    if(x >= 128):
+        return x - 256
+    return x
+
+def to_unsigned(x):
+    if(x < 0):
+        return x + 256
+    return x
+
+def sat(x, y):
+    x = to_signed(x)
+    y = to_signed(y)
+    z = x + y
+    if(z > 127):
+        return 127
+    if(z < -128):
+        return to_unsigned(-128)
+    return to_unsigned(z)
+
+def lookup(table, index):
+    print("looking up ", hex(index))
+    if(index >= 128):
+        return 0
+    return table[index&0xf]
+
+
+
+def quietlookup(table, index):
+    if(index >= 128):
+        return 0
+    return table[index&0xf]
+
+def process(src):
+    shifted = (src >> 3)%256
+    delta_hash = (lookup(delta_asso,src) + shifted + 1) >> 1
+    check_hash = (lookup(check_asso,src) + shifted + 1) >> 1
+    out = sat(lookup(delta_values,delta_hash), src)
+    chk = sat(lookup(check_values,check_hash), src)
+    mask = chk & 0x80
+    return (out, mask)
+
+def processquiet(src):
+    shifted = (src >> 3)%256
+    delta_hash = (quietlookup(delta_asso,src) + shifted + 1) >> 1
+    check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+    out = sat(quietlookup(delta_values,delta_hash), src)
+    chk = sat(quietlookup(check_values,check_hash), src)
+    mask = chk & 0x80
+    return (out, mask)
+
+def is_ok(i):
+    out, mask = processquiet(i)
+    if mask == 0:
+        return 1
+    return 0
+
+def computestring():
+    s = ""
+    for i in range(256):
+        out, mask = processquiet(i)
+        if(mask == 0):
+            s +=  safechr(i)
+    return s
+print(computestring() + " " + str(len(computestring())))
+
+def print_layout():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in range(16):
+        if check_hash in t:
+            off = quietlookup(check_values,check_hash)
+            print(hex(check_hash), hex(off), end="")
+            print("\t", " ".join(["   "+safechr(c) for c in t[check_hash]]))
+        else:
+            continue
+
+
+def is_valid():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        if check_hash in t:
+            array = t[check_hash]
+            i = 0
+            while i < len(array) and valid.find(chr(array[i])) == -1:
+                i += 1
+            while i < len(array) and valid.find(chr(array[i])) != -1:
+                i += 1
+            while i < len(array) and array[i] >= 128:
+                i += 1
+            if i < len(array):
+                return False
+        else:
+            continue
+    return True
+
+print_layout()
+print(is_valid())
+
+valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+print("----")
+def fun_adjust():
+    for zz in range(256):
+        check_asso[ord('-')&0xf] = zz
+        for yy in range(256):
+            check_asso[ord('_')&0xf] = yy
+            if(is_valid()):
+                print("----")
+                print_layout()
+                print(is_valid())
+                print("found")
+                return
+fun_adjust()
+            #sys.exit(0)
+
+def adjust(array, start, end, check_hash):
+    for j in range(256):
+        is_ok = True
+        for i in range(len(array)):
+            valid = (sat(j,array[i])&0x80 == 0) # sat(quietlookup(check_values,check_hash), src)
+            should_be_valid = (i>=start and i < end)
+            is_ok = is_ok and (valid == should_be_valid)
+        if(is_ok):
+            check_values[check_hash&0xf] = j
+            return
+    raise "unexpected"
+
+
+    chk = sat(quietlookup(check_values,check_hash), src)
+    mask = chk & 0x80
+
+def process():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        if check_hash in t:
+            array = t[check_hash]
+            i = 0
+            while i < len(array) and valid.find(chr(array[i])) == -1:
+                i += 1
+            if i < len(array) and valid.find(chr(array[i])) != -1:
+                start = i
+                while i < len(array) and valid.find(chr(array[i])) != -1:
+                    i += 1
+                end = i
+                adjust(array, start, end, check_hash)
+        else:
+            continue
+    return True
+print("process")
+process()
+print("string")
+print(computestring()+ " "+str(len(computestring())))
+
+for c in valid:
+    print(c,processquiet(ord(c)))
+
+def examine():
+    t={}
+    for i in valid:
+        src = ord(i)
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(delta_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        print(check_hash, t[check_hash])
+    return True
+examine()
+
+delta_values[10] += 1 
+
+delta_values[13] += 33 
+
+for c in valid:
+    print(c,processquiet(ord(c)))
+
+
+
+delta_asso = [0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F]
+check_asso = [0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F]
+
+delta_values =[(0x00), (0x00), (0x00), (0x13), (0x04), (0xBF), (0xBF), (0xB9), (0xB9), (0x00), (0x10), (0xC3), (0xBF), (0xBF), (0xB9), (0xB9)]
+check_values = [(0x80), (0x80), (0x80), (0x80), (0xCF), (0xBF), (0xD5), (0xA6), (0xB5), (0x86), (0xD1), (0x80), (0xB1), (0x80), (0x91), (0x80)]
+
+def casthex(v):
+    if(v >= 0x80):
+        return "uint8_t("+"0x{:X}".format(v)+")"
+    return "0x{:X}".format(v)
+def printme(c):
+    print(",".join([casthex(i) for i in c]))
+print("delta_asso")
+printme(delta_asso)
+print("check_asso")
+printme(check_asso)
+print("delta_values")
+printme(delta_values)
+print("check_values")
+printme(check_values)
\ No newline at end of file
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index d6886aa86..615535881 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -26,23 +26,35 @@
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
 
-__m256i lookup_pshufb_improved(const __m256i input) {
+template <bool base64_url>
+simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
   // credit: Wojciech Muła
   __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
   const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
   result =
       _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
-  const __m256i shift_LUT = _mm256_setr_epi8(
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
-
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  __m256i shift_LUT;
+  if (base64_url) {
+    shift_LUT = _mm256_setr_epi8(
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
+
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+  } else {
+    shift_LUT = _mm256_setr_epi8(
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
+
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  }
 
   result = _mm256_shuffle_epi8(shift_LUT, result);
   return _mm256_add_epi8(result, input);
 }
 
+template <base64_options options>
 size_t encode_base64(char *dst, const char *src, size_t srclen) {
   // credit: Wojciech Muła
   const uint8_t *input = (const uint8_t *)src;
@@ -110,18 +122,18 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input0));
+                        lookup_pshufb_improved<options == base64_url>(input0));
     out += 32;
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input1));
+                        lookup_pshufb_improved<options == base64_url>(input1));
     out += 32;
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input2));
+                        lookup_pshufb_improved<options == base64_url>(input2));
     out += 32;
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input3));
+                        lookup_pshufb_improved<options == base64_url>(input3));
     out += 32;
   }
   for (; i + 28 <= srclen; i += 24) {
@@ -145,11 +157,11 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m256i indices = _mm256_or_si256(t1, t3);
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(indices));
+                        lookup_pshufb_improved<options == base64_url>(indices));
     out += 32;
   }
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 
 static inline void compress(__m128i data, uint16_t mask, char *output) {
@@ -200,36 +212,79 @@ struct block64 {
   __m256i chunks[2];
 };
 
+template <bool base64_url>
 static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   const __m256i ascii_space_tbl =
       _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
                        0x0, 0x0, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
                        0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0x0, 0xd, 0x0, 0x0);
   // credit: aqrit
-  const __m256i delta_asso = _mm256_setr_epi8(
-      0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
-  const __m256i delta_values = _mm256_setr_epi8(
-      int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
-      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
-      int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
-      int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
-      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
-      int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
-      int8_t(0xB9), int8_t(0xB9));
-  const __m256i check_asso = _mm256_setr_epi8(
-      0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
-      0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-      0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
-  const __m256i check_values = _mm256_setr_epi8(
-      int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
-      int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
-      int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
-      int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
-      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
-      int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
-      int8_t(0x91), int8_t(0x80));
+  __m256i delta_asso;
+  if (base64_url) {
+    delta_asso =
+        _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
+                         0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+  } else {
+    delta_asso = _mm256_setr_epi8(
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  }
+
+  __m256i delta_values;
+  if (base64_url) {
+    delta_values = _mm256_setr_epi8(
+        0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
+        uint8_t(0xB9), 0x0, 0x10, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xBF),
+        uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x10, uint8_t(0xC3),
+        uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9));
+  } else {
+    delta_values = _mm256_setr_epi8(
+        int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
+        int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
+        int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+        int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+        int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
+        int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
+        int8_t(0xB9), int8_t(0xB9));
+  }
+  __m256i check_asso;
+
+  if (base64_url) {
+    check_asso =
+        _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
+                         0x7, 0xB, 0xB, 0xB, 0xF, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xB, 0xB, 0xF);
+  } else {
+
+    check_asso = _mm256_setr_epi8(
+        0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+        0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  }
+  __m256i check_values;
+  if (base64_url) {
+    check_values = _mm256_setr_epi8(
+        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
+        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
+        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80),
+        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
+        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
+        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80));
+  } else {
+    check_values = _mm256_setr_epi8(
+        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
+        int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
+        int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
+        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+        int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
+        int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
+        int8_t(0x91), int8_t(0x80));
+  }
   const __m256i shifted = _mm256_srli_epi32(*src, 3);
 
   const __m256i delta_hash =
@@ -250,10 +305,12 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   *src = out;
   return (uint32_t)mask;
 }
+
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   *error = 0;
-  uint64_t m0 = to_base64_mask(&b->chunks[0], error);
-  uint64_t m1 = to_base64_mask(&b->chunks[1], error);
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
   return m0 | (m1 << 32);
 }
 
@@ -329,8 +386,10 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
 }
 
 template <bool base64_url, typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -358,11 +417,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -496,7 +554,8 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 4d3f1951e..f11325864 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -799,7 +799,11 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
 }
 
 size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return encode_base64(output, input, length, options);
+  if(options & base64_url) {
+    return encode_base64<base64_url>(output, input, length);
+  } else {
+    return encode_base64<base64_default>(output, input, length);
+  }
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 8b1882ca8..a6e3908fa 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -31,14 +31,17 @@ struct block64 {
   __m512i chunks[1];
 };
 
-size_t encode_base64(char *dst, const char *src, size_t srclen) {
+template <bool base64_url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
   // credit: Wojciech Muła
-
   const uint8_t *input = (const uint8_t *)src;
 
   uint8_t *out = (uint8_t *)dst;
   static const char *lookup_tbl =
-      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+      base64_url
+          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
   const __m512i shuffle_input = _mm512_setr_epi32(
       0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
@@ -57,27 +60,48 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
     out += 64;
   }
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   __m512i input = b->chunks[0];
   const __m512i ascii_space_tbl = _mm512_set_epi8(
       0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 0, 0, 10, 9,
       0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0,
       32, 0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
-  __m512i lookup0 = _mm512_set_epi8(
-      -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
-      52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
-      -128, -128, -128, -64, -128, -128, -128, -128, -128, -128, -128, -128,
-      -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -64, -128,
-      -128, -64, -64, -128, -128, -128, -128, -128, -128, -128, -128, -64);
-  __m512i lookup1 = _mm512_set_epi8(
-      -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41,
-      40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, -128,
-      -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14,
-      13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  __m512i lookup0;
+  if (base64_url) {
+    lookup0 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
+        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
+  } else {
+    lookup0 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
+        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
+  }
+  __m512i lookup1;
+  if (base64_url) {
+    lookup1 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  } else {
+    lookup1 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  }
+
   const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
   const __m512i combined = _mm512_or_si512(translated, input);
   const __mmask64 mask = _mm512_movepi8_mask(combined);
@@ -110,7 +134,8 @@ static inline void load_block(block64 *b, const char16_t *src) {
   __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
   __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
   __m512i p = _mm512_packus_epi16(m1, m2);
-  b->chunks[0] = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
+  b->chunks[0] =
+      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
 }
 
 static inline void base64_decode(char *out, __m512i str) {
@@ -138,8 +163,10 @@ static inline void base64_decode_block(char *out, block64 *b) {
 }
 
 template <bool base64_url, typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -164,11 +191,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -287,7 +313,8 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index 8aa9cf886..356159808 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1386,7 +1386,11 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
 }
 
 size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
-  return encode_base64(output, input, length, options);
+  if(options & base64_url) {
+    return encode_base64<true>(output, input, length, options);
+  } else {
+    return encode_base64<false>(output, input, length, options);
+  }
 }
 
 } // namespace SIMDUTF_IMPLEMENTATION
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index 14565397b..e95e5f331 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -787,8 +787,8 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length, base64_options options) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length, options);
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
 simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
@@ -799,8 +799,12 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t leng
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  if(options == base64_url) {
+    return encode_base64<base64_url>(output, input, length);
+  } else {
+    return encode_base64<base64_default>(output, input, length);
+  }
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index 2fc986acc..6966fc864 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -25,8 +25,7 @@
  * Nick Kopp. 2013. Base64 Encoding on a GPU.
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
-
-__m128i lookup_pshufb_improved(const __m128i input) {
+template <bool base64_url> __m128i lookup_pshufb_improved(const __m128i input) {
   // credit: Wojciech Muła
   // reduce  0..51 -> 0
   //        52..61 -> 1 .. 10
@@ -40,9 +39,16 @@ __m128i lookup_pshufb_improved(const __m128i input) {
   const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input);
   result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));
 
-  const __m128i shift_LUT = _mm_setr_epi8(
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  __m128i shift_LUT;
+  if (base64_url) {
+    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+  } else {
+    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  }
 
   // read shift
   result = _mm_shuffle_epi8(shift_LUT, result);
@@ -50,6 +56,7 @@ __m128i lookup_pshufb_improved(const __m128i input) {
   return _mm_add_epi8(result, input);
 }
 
+template <base64_options options>
 size_t encode_base64(char *dst, const char *src, size_t srclen) {
   // credit: Wojciech Muła
   // SSE (lookup: pshufb improved unrolled)
@@ -101,19 +108,19 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m128i input3 = _mm_or_si128(t1_3, t3_3);
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input0));
+                     lookup_pshufb_improved<options & base64_url>(input0));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input1));
+                     lookup_pshufb_improved<options & base64_url>(input1));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input2));
+                     lookup_pshufb_improved<options & base64_url>(input2));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input3));
+                     lookup_pshufb_improved<options & base64_url>(input3));
     out += 16;
   }
   for (; i + 16 <= srclen; i += 12) {
@@ -153,12 +160,12 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m128i indices = _mm_or_si128(t1, t3);
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(indices));
+                     lookup_pshufb_improved<options & base64_url>(indices));
     out += 16;
   }
 
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 static inline void compress(__m128i data, uint16_t mask, char *output) {
   if (mask == 0) {
@@ -198,27 +205,59 @@ struct block64 {
   __m128i chunks[4];
 };
 
+template <bool base64_url>
 static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   const __m128i ascii_space_tbl =
       _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
                     0x0, 0xd, 0x0, 0x0);
   // credit: aqrit
-  const __m128i delta_asso =
-      _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
-                    0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
-  const __m128i delta_values =
-      _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
-                    int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
-                    int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
-                    int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
-  const __m128i check_asso =
-      _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                    0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
-  const __m128i check_values =
-      _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
-                    int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
-                    int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
-                    int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
+  __m128i delta_asso;
+  if (base64_url) {
+    delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
+                               0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+  } else {
+
+    delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                               0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  }
+  __m128i delta_values;
+  if (base64_url) {
+    delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
+                                 0x0, 0x10, uint8_t(0xC3), uint8_t(0xBF),
+                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9));
+  } else {
+
+    delta_values =
+        _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+                      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+                      int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+                      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
+  }
+  __m128i check_asso;
+  if (base64_url) {
+    check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                               0x3, 0x7, 0xB, 0xB, 0xB, 0xF);
+  } else {
+
+    check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                               0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  }
+  __m128i check_values;
+  if (base64_url) {
+    check_values = _mm_setr_epi8(
+        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
+        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
+        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80));
+  } else {
+
+    check_values =
+        _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+                      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+                      int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+                      int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
+  }
   const __m128i shifted = _mm_srli_epi32(*src, 3);
 
   const __m128i delta_hash =
@@ -239,12 +278,14 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   *src = out;
   return (uint16_t)mask;
 }
+
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   *error = 0;
-  uint64_t m0 = to_base64_mask(&b->chunks[0], error);
-  uint64_t m1 = to_base64_mask(&b->chunks[1], error);
-  uint64_t m2 = to_base64_mask(&b->chunks[2], error);
-  uint64_t m3 = to_base64_mask(&b->chunks[3], error);
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
+  uint64_t m2 = to_base64_mask<base64_url>(&b->chunks[2], error);
+  uint64_t m3 = to_base64_mask<base64_url>(&b->chunks[3], error);
   return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
 }
 
@@ -339,8 +380,10 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
 }
 
 template <bool base64_url, typename chartype>
-result compress_decode_base64(char *dst, const chartype *src, size_t srclen, base64_options options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -368,11 +411,10 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               to_base64[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -507,7 +549,8 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen, bas
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index 6465811a6..19c0ef947 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -8,6 +8,9 @@
 #include <tests/helpers/test.h>
 #include <tests/helpers/transcode_test_base.h>
 
+// We may disable base64url tests by commenting out this next line.
+#define SIMDUTF_BASE64URL_TESTS 1
+
 using random_generator = std::mt19937;
 static random_generator::result_type seed = 42;
 
@@ -153,6 +156,7 @@ TEST(encode_base64_cases) {
   }
 }
 
+#if SIMDUTF_BASE64URL_TESTS
 
 TEST(encode_base64url_cases) {
   std::vector<std::pair<std::string, std::string>> cases = {
@@ -171,6 +175,11 @@ TEST(encode_base64url_cases) {
     size_t s = implementation.binary_to_base64(p.first.data(), p.first.size(),
                                                buffer.data(), simdutf::base64_url);
     ASSERT_EQUAL(s, p.second.size());
+    if(std::string(buffer.data(), buffer.size()) != p.second) {
+      printf("difference:\n");
+      printf(" %.*s\n", (int)s, buffer.data());
+      printf(" %.*s\n", (int)s, p.second.data());
+    }
     ASSERT_TRUE(std::string(buffer.data(), buffer.size()) == p.second);
   }
   printf(" -- ");
@@ -203,6 +212,8 @@ TEST(encode_base64url_cases) {
   }
 }
 
+#endif
+
 TEST(encode_base64_cases_16) {
   std::vector<std::pair<std::string, std::u16string>> cases = {
       {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
@@ -243,6 +254,7 @@ TEST(encode_base64_cases_16) {
   }
 }
 
+#if SIMDUTF_BASE64URL_TESTS
 
 TEST(encode_base64url_cases_16) {
   std::vector<std::pair<std::string, std::u16string>> cases = {
@@ -284,6 +296,8 @@ TEST(encode_base64url_cases_16) {
   }
 }
 
+#endif
+
 TEST(roundtrip_base64) {
   for (size_t len = 0; len < 2048; len++) {
     std::vector<char> source(len, 0);
@@ -368,6 +382,7 @@ TEST(roundtrip_base64_16) {
 }
 
 
+#if SIMDUTF_BASE64URL_TESTS
 
 TEST(roundtrip_base64url) {
   for (size_t len = 0; len < 2048; len++) {
@@ -451,6 +466,7 @@ TEST(roundtrip_base64url_16) {
     }
   }
 }
+#endif
 
 TEST(doomed_base64_roundtrip) {
   for (size_t len = 0; len < 2048; len++) {

From e32acc9c6cefe1a734a37581679ba52edf8c5ee2 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Fri, 29 Mar 2024 23:37:23 -0400
Subject: [PATCH 36/49] solved based64url

---
 scripts/base64/sse.py       | 27 ++++++++++++++++++++-------
 src/haswell/avx2_base64.cpp | 28 +++++++++++++---------------
 src/westmere/sse_base64.cpp | 16 ++++++++--------
 3 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/scripts/base64/sse.py b/scripts/base64/sse.py
index d50cfd9f6..25369c754 100644
--- a/scripts/base64/sse.py
+++ b/scripts/base64/sse.py
@@ -217,12 +217,6 @@ def examine():
 
 
 
-delta_asso = [0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F]
-check_asso = [0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F]
-
-delta_values =[(0x00), (0x00), (0x00), (0x13), (0x04), (0xBF), (0xBF), (0xB9), (0xB9), (0x00), (0x10), (0xC3), (0xBF), (0xBF), (0xB9), (0xB9)]
-check_values = [(0x80), (0x80), (0x80), (0x80), (0xCF), (0xBF), (0xD5), (0xA6), (0xB5), (0x86), (0xD1), (0x80), (0xB1), (0x80), (0x91), (0x80)]
-
 def casthex(v):
     if(v >= 0x80):
         return "uint8_t("+"0x{:X}".format(v)+")"
@@ -236,4 +230,23 @@ def printme(c):
 print("delta_values")
 printme(delta_values)
 print("check_values")
-printme(check_values)
\ No newline at end of file
+printme(check_values)
+
+def processverbose(src):
+    print("processing ", hex(src))
+    shifted = (src >> 3)%256
+    print("shifted ", hex(shifted))
+    delta_hash = (lookup(delta_asso,src) + shifted + 1) >> 1
+    print("delta_hash ", hex(delta_hash))
+    check_hash = (lookup(check_asso,src) + shifted + 1) >> 1
+    print("check_hash ", hex(check_hash))
+    out = sat(lookup(delta_values,delta_hash), src)
+    print("out ", hex(out))
+    chk = sat(lookup(check_values,check_hash), src)
+    print("chk ", hex(chk))
+
+    mask = chk & 0x80
+    return (out, mask)
+processverbose(ord('-'))
+
+print(computestring()+ " "+str(len(computestring())))
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 615535881..4205f8b86 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -236,10 +236,10 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   if (base64_url) {
     delta_values = _mm256_setr_epi8(
         0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
-        uint8_t(0xB9), 0x0, 0x10, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xBF),
+        uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
         uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
-        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x10, uint8_t(0xC3),
-        uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9));
+        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
+        uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
   } else {
     delta_values = _mm256_setr_epi8(
         int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
@@ -255,8 +255,8 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   if (base64_url) {
     check_asso =
         _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
-                         0x7, 0xB, 0xB, 0xB, 0xF, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
-                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xB, 0xB, 0xF);
+                         0x7, 0xB, 0x6, 0xB, 0x12, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0x6, 0xB, 0x12);
   } else {
 
     check_asso = _mm256_setr_epi8(
@@ -267,14 +267,13 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   __m256i check_values;
   if (base64_url) {
     check_values = _mm256_setr_epi8(
-        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
-        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
-        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80),
-        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
-        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
-        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80));
+        0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0xCF),
+        uint8_t(0xBF), uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5),
+        uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80), uint8_t(0xB0),
+        uint8_t(0x80), 0x0, 0x0, 0x0, uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD3),
+        uint8_t(0xA6), uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD0),
+        uint8_t(0x80), uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0);
   } else {
     check_values = _mm256_setr_epi8(
         int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
@@ -286,12 +285,10 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
         int8_t(0x91), int8_t(0x80));
   }
   const __m256i shifted = _mm256_srli_epi32(*src, 3);
-
   const __m256i delta_hash =
       _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
   const __m256i check_hash =
       _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
-
   const __m256i out =
       _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
   const __m256i chk =
@@ -420,6 +417,7 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
+        printf("ERROOOROOROROR\n");
         while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index 6966fc864..c6a8e4dbc 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -224,8 +224,8 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   if (base64_url) {
     delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
                                  uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
-                                 0x0, 0x10, uint8_t(0xC3), uint8_t(0xBF),
-                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9));
+                                 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF),
+                                 uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
   } else {
 
     delta_values =
@@ -237,7 +237,7 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   __m128i check_asso;
   if (base64_url) {
     check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
-                               0x3, 0x7, 0xB, 0xB, 0xB, 0xF);
+                               0x3, 0x7, 0xB, 0x6, 0xB, 0x12);
   } else {
 
     check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
@@ -245,11 +245,11 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   }
   __m128i check_values;
   if (base64_url) {
-    check_values = _mm_setr_epi8(
-        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD5), uint8_t(0xA6),
-        uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD1), uint8_t(0x80),
-        uint8_t(0xB1), uint8_t(0x80), uint8_t(0x91), uint8_t(0x80));
+    check_values = _mm_setr_epi8(0x0, uint8_t(0x80), uint8_t(0x80),
+                                 uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
+                                 uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5),
+                                 uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80),
+                                 uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0);
   } else {
 
     check_values =

From 9154818d50929b215e6555b6165f352155e22177 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Fri, 29 Mar 2024 23:50:38 -0400
Subject: [PATCH 37/49] fixing a missing func definition (bad signature)

---
 src/implementation.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/implementation.cpp b/src/implementation.cpp
index 48a14e386..6159898cb 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -1358,8 +1358,8 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
   return get_default_implementation()->base64_length_from_binary(length);
 }
 
-size_t binary_to_base64(const char * input, size_t length, char* output) noexcept {
-  return get_default_implementation()->binary_to_base64(input, length, output);
+size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) noexcept {
+  return get_default_implementation()->binary_to_base64(input, length, output, options);
 }
 
 simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {

From fd037f542a7f436a197608ac5ba6a116d7606076 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sat, 30 Mar 2024 00:03:11 -0400
Subject: [PATCH 38/49] no such thing as version 4 of uraimo/run-on-arch-action

---
 .github/workflows/aarch64.yml | 2 +-
 .github/workflows/ppc64le.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/aarch64.yml b/.github/workflows/aarch64.yml
index b54e2afa0..a94eb8eed 100644
--- a/.github/workflows/aarch64.yml
+++ b/.github/workflows/aarch64.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: uraimo/run-on-arch-action@v4
+      - uses: uraimo/run-on-arch-action@v2
         name: Test
         id: runcmd
         with:
diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml
index c0c773928..72d6c517b 100644
--- a/.github/workflows/ppc64le.yml
+++ b/.github/workflows/ppc64le.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: uraimo/run-on-arch-action@v4
+      - uses: uraimo/run-on-arch-action@v2
         name: Test
         id: runcmd
         with:

From 0de753a1b08a7123b5c625d0bc0a48293dcdaeb3 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sat, 30 Mar 2024 11:44:03 -0400
Subject: [PATCH 39/49] fixes

---
 .github/workflows/rvv-128-clang-17.yml | 15 ++++++-------
 README.md                              | 29 +++++++++++++++++++-------
 include/simdutf/implementation.h       | 24 +++++++++++++--------
 src/arm64/arm_base64.cpp               | 12 +++++------
 4 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/rvv-128-clang-17.yml b/.github/workflows/rvv-128-clang-17.yml
index 2289f6b58..44c34fbc6 100644
--- a/.github/workflows/rvv-128-clang-17.yml
+++ b/.github/workflows/rvv-128-clang-17.yml
@@ -1,12 +1,13 @@
 name: Ubuntu rvv VLEN=128 (clang 17)
 
-on:
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
+# Fails due to the inability to install packages
+#on:
+#  push:
+#    branches:
+#      - master
+#  pull_request:
+#    branches:
+#      - master
 
 jobs:
   build:
diff --git a/README.md b/README.md
index 63cbc326b..a4173af36 100644
--- a/README.md
+++ b/README.md
@@ -1605,15 +1605,19 @@ be useful if you seek to decode the input into segments having a maximal capacit
   simdutf::result r = simdutf::base64_to_binary_safe(
             base64.data(), base64.size(), back.data(), limited_length);
   assert(r.error == simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
-  // We decoded r.count base64 bytes to limited_length bytes
+  // We decoded r.count base64 8-bit units to limited_length bytes
   // Now let us decode the rest !!!
+  //
+  // We have read up to r.count in the input buffer and we have
+  // produced limited_length bytes.
+  //
   size_t input_index = r.count;
   size_t limited_length2 = back.size();
   r = simdutf::base64_to_binary_safe(base64.data() + input_index,
                                            base64.size() - input_index,
                                            back.data(), limited_length2);
   assert(r.error == simdutf::error_code::SUCCESS);
-  // We decoded r.count base64 bytes to limited_length2 bytes
+  // We decoded r.count base64 8-bit units to limited_length2 bytes
   // We are done
   assert(limited_length2 + limited_length == (len + 3) / 4 * 3);
 ```
@@ -1672,9 +1676,12 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * in
  *
  * This function will fail in case of invalid input. There are two possible reasons for
  * failure: the input contains a number of base64 characters that when divided by 4, leaves
- * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
+ * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard
+ * the output.
+ *
  * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
  * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
  * r.count contains the number of bytes decoded.
@@ -1723,7 +1730,7 @@ size_t binary_to_base64(const char * input, size_t length, char* output, base64_
  *
  * This function will fail in case of invalid input. There are two possible reasons for
  * failure: the input contains a number of base64 characters that when divided by 4, leaves
- * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
  * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
@@ -1737,7 +1744,7 @@ size_t binary_to_base64(const char * input, size_t length, char* output, base64_
  * @param length        the length of the string in 16-bit units
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
  * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful.
  */
 simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
 
@@ -1752,20 +1759,26 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  *
  * This function will fail in case of invalid input. There are three possible reasons for
  * failure: the input contains a number of base64 characters that when divided by 4, leaves
- * a singler remainder character (BASE64_INPUT_REMAINDER), the input contains a character
- * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer is too small (OUTPUT_BUFFER_TOO_SMALL).
+ * a single remainder character (BASE64_INPUT_REMAINDER), the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer 
+ * is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
+ * and the number of units processed, see description of the parameters and returned value.
  *
  * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
  * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
  * r.count contains the number of bytes decoded.
  *
+ * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard
+ * the output.
  *
  * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
  * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
  * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
  * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of units processed if successful.
  */
 simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
 simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index a6784ba3a..68009eb6d 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1420,7 +1420,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * in
  *
  * This function will fail in case of invalid input. There are two possible reasons for
  * failure: the input contains a number of base64 characters that when divided by 4, leaves
- * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
  * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
@@ -1471,7 +1471,7 @@ size_t binary_to_base64(const char * input, size_t length, char* output, base64_
  *
  * This function will fail in case of invalid input. There are two possible reasons for
  * failure: the input contains a number of base64 characters that when divided by 4, leaves
- * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
  * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
@@ -1485,7 +1485,7 @@ size_t binary_to_base64(const char * input, size_t length, char* output, base64_
  * @param length        the length of the string in 16-bit units
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
  * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful.
  */
 simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
 
@@ -1500,20 +1500,26 @@ simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t lengt
  *
  * This function will fail in case of invalid input. There are three possible reasons for
  * failure: the input contains a number of base64 characters that when divided by 4, leaves
- * a singler remainder character (BASE64_INPUT_REMAINDER), the input contains a character
- * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer is too small (OUTPUT_BUFFER_TOO_SMALL).
+ * a single remainder character (BASE64_INPUT_REMAINDER), the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
+ * is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
+ * and the number of units processed, see description of the parameters and returned value.
  *
  * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
  * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
  * r.count contains the number of bytes decoded.
  *
+ * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard
+ * the output.
  *
  * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
  * @param length        the length of the string in 8-bit or 16-bit units.
  * @param output        the pointer to buffer that can hold the conversion result.
  * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
  * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
- * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of units processed if successful.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of units processed if successful.
  */
 simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
 simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
@@ -2611,7 +2617,7 @@ class implementation {
    *
    * This function will fail in case of invalid input. There are two possible reasons for
    * failure: the input contains a number of base64 characters that when divided by 4, leaves
-   * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+   * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
    * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
    *
    * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
@@ -2636,7 +2642,7 @@ class implementation {
    *
    * This function will fail in case of invalid input. There are two possible reasons for
    * failure: the input contains a number of base64 characters that when divided by 4, leaves
-   * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+   * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
    * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
    *
    * You should call this function with a buffer that is at least maximal_binary_length_from_utf6_base64(input, length) bytes long.
@@ -2646,7 +2652,7 @@ class implementation {
    * @param length        the length of the string in 16-bit units
    * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
    * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
-   * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in 16-bit units) if any, or the number of bytes written if successful.
+   * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful.
    */
   simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 877d9bdb7..034da84df 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -142,10 +142,10 @@ template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
   }
 #else
   if (base64_url) {
-    lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
               0x70, 0x61, 0xe1, 0xf4, 0xf4, 0xa5, 0xf4, 0xf4};
   } else {
-    lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
               0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4};
   }
 #endif
@@ -166,10 +166,10 @@ template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
   }
 #else
   if (base64_url) {
-    lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
               0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
   } else {
-    lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
               0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
   }
 #endif
@@ -228,10 +228,10 @@ template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
   }
 #else
   if (base64_url) {
-    roll_lut = {0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+    roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
                 0x0,  0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
   } else {
-    roll_lut = {0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+    roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
                 0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
   }
 #endif

From ccdf51d11416e462a4f7d2c06e7f73dcd66b90bb Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sat, 30 Mar 2024 15:15:05 -0400
Subject: [PATCH 40/49] Update benchmarks/base64/benchmark_base64.cpp

Co-authored-by: Yagiz Nizipli <yagiz@nizipli.com>
---
 benchmarks/base64/benchmark_base64.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index 3fb475d58..bc8538bd7 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -116,7 +116,7 @@ void show_help() {
   printf("  -d, --decode      Decode the input file\n");
   printf("  -e, --encode      Encode the input file\n");
   printf("  -r, --roundtrip   Roundtrip the input file\n");
-  printf("  --roundtripurl    Roundtrip the input file (URL)\n");
+  printf("  --roundtrip-url    Roundtrip the input file (URL)\n");
   printf("  -b, --bun         Bun benchmark\n");
 
   printf(" See https://github.com/lemire/base64data for test data.\n");

From 7ec70f26f6d71daa171e67d96bda87249177dbe2 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sat, 30 Mar 2024 15:15:12 -0400
Subject: [PATCH 41/49] Update benchmarks/base64/benchmark_base64.cpp

Co-authored-by: Yagiz Nizipli <yagiz@nizipli.com>
---
 benchmarks/base64/benchmark_base64.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index bc8538bd7..37662ebf5 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -117,7 +117,7 @@ void show_help() {
   printf("  -e, --encode      Encode the input file\n");
   printf("  -r, --roundtrip   Roundtrip the input file\n");
   printf("  --roundtrip-url    Roundtrip the input file (URL)\n");
-  printf("  -b, --bun         Bun benchmark\n");
+  printf("  -b, --bench-bun         Bun benchmark\n");
 
   printf(" See https://github.com/lemire/base64data for test data.\n");
 }

From 18dc6164058f9e26cf8318fe9552abbc0622819d Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sat, 30 Mar 2024 15:15:20 -0400
Subject: [PATCH 42/49] Update benchmarks/base64/libbase64_spaces.h

Co-authored-by: Yagiz Nizipli <yagiz@nizipli.com>
---
 benchmarks/base64/libbase64_spaces.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/benchmarks/base64/libbase64_spaces.h b/benchmarks/base64/libbase64_spaces.h
index 6e68c2caf..e368221e8 100644
--- a/benchmarks/base64/libbase64_spaces.h
+++ b/benchmarks/base64/libbase64_spaces.h
@@ -12,7 +12,6 @@ static inline size_t libbase64_find_space(const char *p, const size_t avail) {
 
 // Inspired by
 // https://github.com/aklomp/base64/blob/b20a31a997e0b48274fa09e58b65ee9202531e4f/bin/base64.c#L405
-
 static bool libbase64_space_decode(const char *start, size_t avail, char *outbuf,
                    size_t *outlen) {
   struct base64_state state;

From aeb2f5fa1e2ee29a7b20fe4c84be0724398e9111 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sat, 30 Mar 2024 15:15:31 -0400
Subject: [PATCH 43/49] Update include/simdutf/implementation.h

Co-authored-by: Yagiz Nizipli <yagiz@nizipli.com>
---
 include/simdutf/implementation.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index 68009eb6d..25342d81b 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1394,7 +1394,7 @@ enum : base64_options {
  *
  * @param input         the base64 input to process
  * @param length        the length of the base64 input in bytes
- * @return maximal number of binary bytes
+ * @return maximum number of binary bytes
  */
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
 

From e0ce663673c1dcde52943bab6eb080f9ec01733a Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sat, 30 Mar 2024 15:16:06 -0400
Subject: [PATCH 44/49] Update src/haswell/avx2_base64.cpp

Co-authored-by: Yagiz Nizipli <yagiz@nizipli.com>
---
 src/haswell/avx2_base64.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 4205f8b86..d3d700145 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -417,7 +417,6 @@ result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        printf("ERROOOROOROROR\n");
         while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }

From bb9d1fc3fa367f97baa10cf6c7dc35a11df93887 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Sat, 30 Mar 2024 18:58:24 -0400
Subject: [PATCH 45/49] various minor fixes (linting + comments)

---
 CMakeLists.txt                     |  2 +-
 benchmarks/base64/CMakeLists.txt   |  2 +-
 scripts/Makefile                   |  2 ++
 scripts/base64/Makefile            |  2 ++
 scripts/base64/README.md           |  2 ++
 scripts/base64/avx512.py           | 33 ++++++++++++++----------------
 scripts/base64/neon_decode.py      | 26 ++---------------------
 scripts/base64/sse.py              |  4 ++--
 scripts/base64/table.py            |  4 ++--
 scripts/create_latex_table.py      | 21 +++++++++++--------
 scripts/release.py                 | 22 ++++++++++++--------
 src/arm64/arm_base64.cpp           |  6 ++++++
 src/haswell/avx2_base64.cpp        |  4 ++++
 src/icelake/icelake_base64.inl.cpp |  4 ++++
 src/scalar/base64.h                |  3 +++
 src/westmere/sse_base64.cpp        |  4 ++++
 tools/CMakeLists.txt               |  1 +
 17 files changed, 76 insertions(+), 66 deletions(-)
 create mode 100644 scripts/Makefile
 create mode 100644 scripts/base64/Makefile
 create mode 100644 scripts/base64/README.md

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9af473466..9a0f2891c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ set(SIMDUTF_LIB_VERSION "6.0.0" CACHE STRING "simdutf library version")
 set(SIMDUTF_LIB_SOVERSION "6" CACHE STRING "simdutf library soversion")
 option(SIMDUTF_TESTS "Whether the tests are included as part of the CMake Build." ON)
 option(SIMDUTF_BENCHMARKS "Whether the benchmarks are included as part of the CMake Build." OFF)
-option(SIMDUTF_TOOLS "Whether the tools are included as part of the CMake build." ON)
+option(SIMDUTF_TOOLS "Whether the tools are included as part of the CMake build. Requires C++17 or better." ON)
 option(SIMDUTF_ICONV "Whether to use iconv as part of the CMake build if available." ON)
 
 set(SIMDUTF_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/benchmarks/base64/CMakeLists.txt b/benchmarks/base64/CMakeLists.txt
index d00fc855e..ce6624b5d 100644
--- a/benchmarks/base64/CMakeLists.txt
+++ b/benchmarks/base64/CMakeLists.txt
@@ -8,7 +8,7 @@ CPMAddPackage(
 
 
 add_executable(benchmark_base64 benchmark_base64.cpp)
-
+message(STATUS "The tools benchmark_base64 require C++17. If your system does not support C++17, please set SIMDUTF_BENCHMARK_BASE64 to OFF.")
 set_property(TARGET benchmark_base64 PROPERTY CXX_STANDARD 17)
 set_property(TARGET benchmark_base64 PROPERTY CXX_STANDARD_REQUIRED ON)
 
diff --git a/scripts/Makefile b/scripts/Makefile
new file mode 100644
index 000000000..ac88a9b37
--- /dev/null
+++ b/scripts/Makefile
@@ -0,0 +1,2 @@
+lint:
+	python -m mypy *.py
diff --git a/scripts/base64/Makefile b/scripts/base64/Makefile
new file mode 100644
index 000000000..ac88a9b37
--- /dev/null
+++ b/scripts/base64/Makefile
@@ -0,0 +1,2 @@
+lint:
+	python -m mypy *.py
diff --git a/scripts/base64/README.md b/scripts/base64/README.md
new file mode 100644
index 000000000..574806304
--- /dev/null
+++ b/scripts/base64/README.md
@@ -0,0 +1,2 @@
+The scripts in this directory are for reference only. They were use to check
+the algorithms we are using.
\ No newline at end of file
diff --git a/scripts/base64/avx512.py b/scripts/base64/avx512.py
index b09265cc8..21788b528 100644
--- a/scripts/base64/avx512.py
+++ b/scripts/base64/avx512.py
@@ -25,17 +25,17 @@ def sign8(x):
 print(", ".join([str(sign8(i)) for i in lookup_0]))
 print("lookup1:")
 print(", ".join([str(sign8(i)) for i in lookup_1]))
-lookup = [0 for i in range(64)]
+lookupn = [0 for i in range(64)]
 output = 0
 for ifrom in range(16):
-    lookup[ifrom*4 + 0] = output + 3
-    lookup[ifrom*4 + 1] = output + 2
-    lookup[ifrom*4 + 2] = output + 1
-    lookup[ifrom*4 + 3] = output + 0
+    lookupn[ifrom*4 + 0] = output + 3
+    lookupn[ifrom*4 + 1] = output + 2
+    lookupn[ifrom*4 + 2] = output + 1
+    lookupn[ifrom*4 + 3] = output + 0
     output += 4
-lookup.reverse()
+lookupn.reverse()
 print("reverse:")
-print(", ".join([str(i) for i in lookup]))
+print(", ".join([str(i) for i in lookupn]))
 
 print("====")
 
@@ -56,24 +56,21 @@ def sign8(x):
 allowed = "\0\t\r\n "
 for z in allowed:
     lookup_0[ord(z)] = 0xff
-def sign8(x):
-    if x >= 128:
-        return x - 256
-    return x
+
 lookup_0.reverse()
 lookup_1.reverse()
 print("lookup0:")
 print(", ".join([str(sign8(i)) for i in lookup_0]))
 print("lookup1:")
 print(", ".join([str(sign8(i)) for i in lookup_1]))
-lookup = [0 for i in range(64)]
+lookupn = [0 for i in range(64)]
 output = 0
 for ifrom in range(16):
-    lookup[ifrom*4 + 0] = output + 3
-    lookup[ifrom*4 + 1] = output + 2
-    lookup[ifrom*4 + 2] = output + 1
-    lookup[ifrom*4 + 3] = output + 0
+    lookupn[ifrom*4 + 0] = output + 3
+    lookupn[ifrom*4 + 1] = output + 2
+    lookupn[ifrom*4 + 2] = output + 1
+    lookupn[ifrom*4 + 3] = output + 0
     output += 4
-lookup.reverse()
+lookupn.reverse()
 print("reverse:")
-print(", ".join([str(i) for i in lookup]))
\ No newline at end of file
+print(", ".join([str(i) for i in lookupn]))
\ No newline at end of file
diff --git a/scripts/base64/neon_decode.py b/scripts/base64/neon_decode.py
index 88e5cab95..24fe1c03e 100644
--- a/scripts/base64/neon_decode.py
+++ b/scripts/base64/neon_decode.py
@@ -25,7 +25,6 @@ def decode(s):
         assert d >= 0
         # we must have a base64 element
         v = t.find(chr(i))
-        #print(i, chr(i), v, d)
         assert v == d
     else:
         # we must have a space
@@ -34,20 +33,9 @@ def decode(s):
 
 
 
-
-## 0x2d is '-' in base64
-## 0x5f is '_' in base64
-
 t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
 spaces=' \t\n\r'
 
-#3 numbers
-#4-6 letters
-#5-7 letters
-
-#0x2d
-#0x5f
-
 lut_lo = [0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb0]
 lut_hi = [0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20]
 roll = [0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0]
@@ -61,8 +49,6 @@ def decode(s):
 
 #0x00 are forbidden except for \t \n \r which go to one
 lut_hi[0] = 0x11
-#for c in '\t\n\r':
-#    lut_lo[ord(c) & 0xf] = 0x1
 for z in range(16):
     if '\t\n\r'.find(chr(z)) != -1:
         lut_lo[z & 0xf] = 0x1 # allowed
@@ -105,9 +91,7 @@ def decode(s):
 
 
 
-
-
-def decode(s):
+def decodes(s):
     low = s & 0xf
     high = s >> 4
     m = lut_lo[low] & lut_hi[high]
@@ -126,15 +110,9 @@ def decode(s):
 print(",".join([hex(c) for c in lut_hi]))
 print(",".join([hex(c) for c in roll]))
 
-#for c in spaces:
-#    print(hex(ord(c)),decode(ord(c)))
-
-#import sys
-#sys.exit(0)
 
 for i in range(256):
-    m,d = decode(i)
-    #print(hex(i), m, d, chr(i))
+    m,d = decodes(i)
     if d is None:
         assert t.find(chr(i)) == -1
         assert spaces.find(chr(i)) == -1
diff --git a/scripts/base64/sse.py b/scripts/base64/sse.py
index 25369c754..aa7b4f75a 100644
--- a/scripts/base64/sse.py
+++ b/scripts/base64/sse.py
@@ -162,7 +162,7 @@ def adjust(array, start, end, check_hash):
     chk = sat(quietlookup(check_values,check_hash), src)
     mask = chk & 0x80
 
-def process():
+def process_explicit():
     t={}
     for i in range(256):
         src = i
@@ -187,7 +187,7 @@ def process():
             continue
     return True
 print("process")
-process()
+process_explicit()
 print("string")
 print(computestring()+ " "+str(len(computestring())))
 
diff --git a/scripts/base64/table.py b/scripts/base64/table.py
index ff83aa316..d99ed4e76 100644
--- a/scripts/base64/table.py
+++ b/scripts/base64/table.py
@@ -1,7 +1,7 @@
 import base64
-#default:
+#default base64 table, uncomment and comment the next line to use the default table
 #t=[255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]
-#baseur::
+
 t=[255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]
 def formula(a, b, c, d):
     if(a >= 64 or b >= 64 or c >= 64 or d >= 64):
diff --git a/scripts/create_latex_table.py b/scripts/create_latex_table.py
index d396bb9c4..708de9aef 100755
--- a/scripts/create_latex_table.py
+++ b/scripts/create_latex_table.py
@@ -23,13 +23,16 @@
 for line in content:
 
     if line.startswith("convert"):
-        codec = re.search(r"\+(\w+)",line).group(1)
-        rfile = re.search(r"/(\w+)[\.-]",line).group(1)
-        currentrow["codec"] = codec
-        currentrow["dataset"] = rfile
-        datasets.add(rfile)
-        codecs.add(codec)
-
+        m = re.search(r"\+(\w+)",line)
+        if m is not None:
+            codec = m.group(1)
+            currentrow["codec"] = codec
+            codecs.add(codec)
+        m = re.search(r"/(\w+)[\.-]",line)
+        if m is not None:
+            rfile = m.group(1)
+            currentrow["dataset"] = rfile
+            datasets.add(rfile)
     m = re.search(r"\s([\.0-9]+) Gc/s",line)
     if m:
         v = float(m.group(1))
@@ -50,8 +53,8 @@ def get(d, k):
     for x in table:
         if(x['codec'] == k) and (x['dataset'] == d):
             return x["result"]
-datasets=sorted(datasets)
-for dataset in datasets:
+datasetsorted=sorted(datasets)
+for dataset in datasetsorted:
     s = dataset
     for k in kernels:
       s +=  " & " + get(dataset, k)
diff --git a/scripts/release.py b/scripts/release.py
index 944798413..e3062ae43 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -124,16 +124,19 @@ def topaddedversionstring(major, minor, rev):
 with open (cmakefile, 'rt') as myfile:
     for line in myfile:
         m = pattern.search(line)
-        if m != None:
+        if m is not None:
             sonumber = int(m.group(1))
             break
 print("so library number "+str(sonumber))
 
 if(atleastminor):
     print("Given that we have a minor revision, it seems necessary to bump the so library number")
+    if sonumber is None:
+        print("I cannot find the so library number in the CMakeLists.txt file")
+        sys.exit(-1)
     sonumber += 1
 
-for line in fileinput.input(cmakefile, inplace=1, backup='.bak'):
+for line in fileinput.input(cmakefile, inplace=True, backup='.bak'):
     line = re.sub(r'  VERSION \d+\.\d+\.\d+','  VERSION '+newmajorversionstring+'.'+mewminorversionstring+'.'+newrevversionstring, line.rstrip())
     line = re.sub(r'SIMDUTF_LIB_VERSION "\d+\.\d+\.\d+','SIMDUTF_LIB_VERSION "'+str(sonumber)+".0.0", line)
     line = re.sub(r'set\(SIMDUTF_LIB_SOVERSION "\d+"','set(SIMDUTF_LIB_SOVERSION \"'+str(sonumber)+'\"', line)
@@ -143,7 +146,7 @@ def topaddedversionstring(major, minor, rev):
 
 
 doxyfile = maindir + os.sep + "Doxyfile"
-for line in fileinput.input(doxyfile, inplace=1, backup='.bak'):
+for line in fileinput.input(doxyfile, inplace=True, backup='.bak'):
     line = re.sub(r'PROJECT_NUMBER         = "\d+\.\d+\.\d+','PROJECT_NUMBER         = "'+newversionstring, line.rstrip())
     print(line)
 print("modified "+doxyfile+", a backup was made")
@@ -165,7 +168,7 @@ def topaddedversionstring(major, minor, rev):
 readmefile = maindir + os.sep + "README.md"
 
 
-for line in fileinput.input(readmefile, inplace=1, backup='.bak'):
+for line in fileinput.input(readmefile, inplace=True, backup='.bak'):
     line = re.sub(r'   wget https://github.com/simdutf/simdutf/releases/download/v\d+\.\d+\.\d+/singleheader.zip','   wget https://github.com/simdutf/simdutf/releases/download/v'+newmajorversionstring+'.'+mewminorversionstring+'.'+newrevversionstring+'/singleheader.zip', line.rstrip())
     line = re.sub(r'https://github.com/simdutf/simdutf/releases/download/v\d+\.\d+\.\d+/singleheader.zip','https://github.com/simdutf/simdutf/releases/download/v'+newmajorversionstring+'.'+mewminorversionstring+'.'+newrevversionstring+'/singleheader.zip', line.rstrip())
     print(line)
@@ -178,11 +181,12 @@ def topaddedversionstring(major, minor, rev):
 if m == None:
     print('I cannot find a link to the API documentation in your README')
 else:
-    detectedreadme = m.group(1)
-    print("found a link to your API documentation in the README file: "+detectedreadme+" ("+toversionstring(*newversion)+")")
-    if(atleastminor):
-       if(detectedreadme != toversionstring(*newversion)):
-           print(colored(255, 0, 0, "Consider updating the readme link to "+toversionstring(*newversion)))
+    if m is not None:
+        detectedreadme = m.group(1)
+        print("found a link to your API documentation in the README file: "+detectedreadme+" ("+toversionstring(*newversion)+")")
+        if(atleastminor):
+            if(detectedreadme != toversionstring(*newversion)):
+                print(colored(255, 0, 0, "Consider updating the readme link to "+toversionstring(*newversion)))
 
 
 
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 034da84df..35a0e3d78 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -275,6 +275,8 @@ uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   return offsets >> 56;
 }
 
+// The caller of this function is responsible to ensure that there are 64 bytes available
+// from reading at src. The data is read into a block64 structure.
 void load_block(block64 *b, const char *src) {
   b->chunks[0] = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
   b->chunks[1] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 16);
@@ -282,12 +284,16 @@ void load_block(block64 *b, const char *src) {
   b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
 }
 
+// The caller of this function is responsible to ensure that there are 32 bytes available
+// from reading at data. It returns a 16-byte value, narrowing with saturation the 16-bit words.
 inline uint8x16_t load_satured(const uint16_t *data) {
   uint16x8_t in1 = vld1q_u16(data);
   uint16x8_t in2 = vld1q_u16(data + 8);
   return vqmovn_high_u16(vqmovn_u16(in1), in2);
 }
 
+// The caller of this function is responsible to ensure that there are 128 bytes available
+// from reading at src. The data is read into a block64 structure.
 void load_block(block64 *b, const char16_t *src) {
   b->chunks[0] = load_satured(reinterpret_cast<const uint16_t *>(src));
   b->chunks[1] = load_satured(reinterpret_cast<const uint16_t *>(src) + 16);
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index d3d700145..187df475a 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -324,12 +324,16 @@ static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   return _mm_popcnt_u64(nmask);
 }
 
+// The caller of this function is responsible to ensure that there are 64 bytes available
+// from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char *src) {
   b->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
   b->chunks[1] =
       _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
 }
 
+// The caller of this function is responsible to ensure that there are 128 bytes available
+// from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char16_t *src) {
   __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
   __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index a6e3908fa..312a0813c 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -126,10 +126,14 @@ static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   return _mm_popcnt_u64(nmask);
 }
 
+// The caller of this function is responsible to ensure that there are 64 bytes available
+// from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char *src) {
   b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
 }
 
+// The caller of this function is responsible to ensure that there are 128 bytes available
+// from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char16_t *src) {
   __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
   __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index 3e3b617bb..427154f96 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -13,6 +13,7 @@ namespace base64 {
 // This functions assumes that the padding (=) has been removed.
 template <class char_type>
 result base64_tail_decode(char *dst, const char_type *src, size_t length, base64_options options) {
+  // This looks like 5 branches, but we expect the compiler to resolve this to a single branch:
   const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0;
   const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1;
@@ -105,6 +106,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length, base64
 // This functions assumes that the padding (=) has been removed.
 template <class char_type>
 result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length, base64_options options) {
+  // This looks like 5 branches, but we expect the compiler to resolve this to a single branch:
   const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
   const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0;
   const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1;
@@ -216,6 +218,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
 // Returns the number of bytes written. The destination buffer must be large
 // enough. It will add padding (=) if needed.
 size_t tail_encode_base64(char *dst, const char *src, size_t srclen, base64_options options) {
+  // This looks like 3 branches, but we expect the compiler to resolve this to a single branch:
   const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0 : tables::base64::base64_default::e0;
   const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1 : tables::base64::base64_default::e1;
   const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2 : tables::base64::base64_default::e2;
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index c6a8e4dbc..f8df6a830 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -308,6 +308,8 @@ static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   return _mm_popcnt_u64(nmask);
 }
 
+// The caller of this function is responsible to ensure that there are 64 bytes available
+// from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char *src) {
   b->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
   b->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
@@ -315,6 +317,8 @@ static inline void load_block(block64 *b, const char *src) {
   b->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
 }
 
+// The caller of this function is responsible to ensure that there are 128 bytes available
+// from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char16_t *src) {
   __m128i m1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
   __m128i m2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 8));
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 57a846f41..c0a62a934 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -40,6 +40,7 @@ if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Cl
 endif()
 endif()
 
+message(STATUS "The tools require C++17. If your system does not support C++17, please set SIMDUTF_TOOLS to OFF.")
 set_property(TARGET sutf PROPERTY CXX_STANDARD 17)
 set_property(TARGET sutf PROPERTY CXX_STANDARD_REQUIRED ON)
 set_property(TARGET fastbase64 PROPERTY CXX_STANDARD 17)

From f511d9a4ba9911a1e2184d9fa0bb3950b81bd503 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Sat, 30 Mar 2024 19:02:38 -0400
Subject: [PATCH 46/49] adding another comment.

---
 src/implementation.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/implementation.cpp b/src/implementation.cpp
index 6159898cb..e6e5ecc48 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -1323,6 +1323,11 @@ simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, si
     ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1);
   size_t output_index = r.count - (r.count % 3);
   size_t input_index = safe_input;
+  // offset is a value that is no larger than 3. We backtrack
+  // by up to offset characters + an undetermined number of
+  // white space characters. It is expected that the next loop
+  // runs at most 3 times + the number of white space characters
+  // in between them, so we are not worried about performance.
   while(offset > 0 && input_index > 0) {
     chartype c = input[--input_index];
     if(c == '=' || c == '\n' || c == '\r' || c == '\t' || c == ' ') {

From e2a224fb95d852a34c4183068d0f7c5451318c4c Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Sat, 30 Mar 2024 19:05:28 -0400
Subject: [PATCH 47/49] cleaning up the base64 benchmark flags

---
 benchmarks/base64/benchmark_base64.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index 37662ebf5..cfdf2e30d 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -112,12 +112,12 @@ std::vector<char> read_file(const char *filename,
 void show_help() {
   printf("Usage: benchmark_base64 [options] file1 [file2 ...]\n");
   printf("Options:\n");
-  printf("  -h, --help        Show this help message and exit\n");
-  printf("  -d, --decode      Decode the input file\n");
-  printf("  -e, --encode      Encode the input file\n");
-  printf("  -r, --roundtrip   Roundtrip the input file\n");
+  printf("  -h, --help         Show this help message and exit\n");
+  printf("  -d, --decode       Decode the input file\n");
+  printf("  -e, --encode       Encode the input file\n");
+  printf("  -r, --roundtrip    Roundtrip the input file\n");
   printf("  --roundtrip-url    Roundtrip the input file (URL)\n");
-  printf("  -b, --bench-bun         Bun benchmark\n");
+  printf("  -b, --bench-bun    Bun benchmark\n");
 
   printf(" See https://github.com/lemire/base64data for test data.\n");
 }
@@ -467,7 +467,7 @@ int main(int argc, char **argv) {
       mode = encode;
     } else if ((arg == "-r") || (arg == "--roundtrip")) {
       mode = roundtrip;
-    } else if (arg == "--roundtripurl") {
+    } else if (arg == "--roundtrip-url") {
       mode = roundtripurl;
     } else if ((arg == "-b") || (arg == "--bun")) {
       mode = bun;

From 5e6a3661b250fb77030e488391fd49d08a08c4f4 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Sat, 30 Mar 2024 19:06:56 -0400
Subject: [PATCH 48/49] disabling Ubuntu rvv VLEN=1024 (clang 17) CI due to
 system failures

---
 .github/workflows/rvv-1024-clang-17.yml | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/rvv-1024-clang-17.yml b/.github/workflows/rvv-1024-clang-17.yml
index 7a0355172..a01ec7c6b 100644
--- a/.github/workflows/rvv-1024-clang-17.yml
+++ b/.github/workflows/rvv-1024-clang-17.yml
@@ -1,12 +1,13 @@
 name: Ubuntu rvv VLEN=1024 (clang 17)
 
-on:
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
+# Fails due to the inability to install packages
+#on:
+#  push:
+#    branches:
+#      - master
+#  pull_request:
+#    branches:
+#      - master
 
 jobs:
   build:

From 9a92c544ba01c05e0c1335b86239470bec9443da Mon Sep 17 00:00:00 2001
From: Daniel Lemire <dlemire@lemire.me>
Date: Sun, 31 Mar 2024 13:46:48 -0400
Subject: [PATCH 49/49] adding the option

---
 benchmarks/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 40d641445..65d7e0eb9 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -53,6 +53,9 @@ if(Threads_FOUND)
   set_property(TARGET threaded PROPERTY CXX_STANDARD 17)
   set_property(TARGET threaded PROPERTY CXX_STANDARD_REQUIRED ON)
 endif(Threads_FOUND)
+
+option(SIMDUTF_BENCHMARK_BASE64 "Whether the base64 benchmarks are included as part of the CMake Build (requires C++17 or better)." ON)
+
 if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
   message(STATUS "Not building base64 benchmarks when using clang-cl due to build errors with the aklomp/base64 dependency.")
 else()