diff --git a/.github/workflows/ppc64le.yml b/.github/workflows/ppc64le.yml
new file mode 100644
index 000000000..72d6c517b
--- /dev/null
+++ b/.github/workflows/ppc64le.yml
@@ -0,0 +1,28 @@
+name: Ubuntu ppc64le (GCC 11)
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: uraimo/run-on-arch-action@v2 # runs the steps below for the requested arch
+        name: Test
+        id: runcmd
+        with:
+          arch: ppc64le
+          githubToken: ${{ github.token }}
+          distro: ubuntu_latest
+          install: |
+            apt-get update -q -y
+            apt-get install -y cmake make g++
+          run: |
+            cmake -DCMAKE_BUILD_TYPE=Release -B build
+            cmake --build build -j=2
diff --git a/.github/workflows/rvv-1024-clang-17.yml b/.github/workflows/rvv-1024-clang-17.yml
index 7a0355172..a01ec7c6b 100644
--- a/.github/workflows/rvv-1024-clang-17.yml
+++ b/.github/workflows/rvv-1024-clang-17.yml
@@ -1,12 +1,13 @@
 name: Ubuntu rvv VLEN=1024 (clang 17)
 
-on:
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
+# Fails due to the inability to install packages, so automatic runs on
+# push/pull_request to master are disabled for now.
+# GitHub Actions rejects a workflow file that has no `on:` key, so a
+# manual-only trigger is kept instead of commenting the key out entirely.
+# To re-enable automatic runs, restore under `on:`:
+#   push / pull_request, each with `branches: [master]`
+on:
+  workflow_dispatch:
 
 jobs:
   build:
diff --git a/.github/workflows/rvv-128-clang-17.yml b/.github/workflows/rvv-128-clang-17.yml
index 2289f6b58..44c34fbc6 100644
--- a/.github/workflows/rvv-128-clang-17.yml
+++ b/.github/workflows/rvv-128-clang-17.yml
@@ -1,12 +1,13 @@
 name: Ubuntu rvv VLEN=128 (clang 17)
 
-on:
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
+# Fails due to the inability to install packages, so automatic runs on
+# push/pull_request to master are disabled for now.
+# GitHub Actions rejects a workflow file that has no `on:` key, so a
+# manual-only trigger is kept instead of commenting the key out entirely.
+# To re-enable automatic runs, restore under `on:`:
+#   push / pull_request, each with `branches: [master]`
+on:
+  workflow_dispatch:
 
 jobs:
   build:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9af473466..9a0f2891c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ set(SIMDUTF_LIB_VERSION "6.0.0" CACHE STRING "simdutf library version")
 set(SIMDUTF_LIB_SOVERSION "6" CACHE STRING "simdutf library soversion")
 option(SIMDUTF_TESTS "Whether the tests are included as part of the CMake Build." ON)
 option(SIMDUTF_BENCHMARKS "Whether the benchmarks are included as part of the CMake Build." OFF)
-option(SIMDUTF_TOOLS "Whether the tools are included as part of the CMake build." ON)
+option(SIMDUTF_TOOLS "Whether the tools are included as part of the CMake build. Requires C++17 or better." ON)
 option(SIMDUTF_ICONV "Whether to use iconv as part of the CMake build if available." ON)
 
 set(SIMDUTF_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/README.md b/README.md
index d18baab52..a4173af36 100644
--- a/README.md
+++ b/README.md
@@ -56,8 +56,8 @@ This library provide fast Unicode functions such as
 - From an UTF-16LE/BE string, compute the size of the UTF-32 equivalent string (equivalent to UTF-16 character counting),
 - UTF-8 and UTF-16LE/BE character counting,
 - UTF-16 endianness change (UTF16-LE/BE to UTF-16-BE/LE),
-- [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary,
-- Binary to base64.
+- [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) (with or without URL encoding) to binary,
+- Binary to base64 (with or without URL encoding).
 
 The functions are accelerated using SIMD instructions (e.g., ARM NEON, SSE, AVX, AVX-512, RISC-V Vector Extension, etc.). When your strings contain hundreds of characters, we can often transcode them at speeds exceeding a billion characters per second. You should expect high speeds not only with English strings (ASCII) but also Chinese, Japanese, Arabic, and so forth. We handle the full character range (including, for example, emojis).
 
@@ -1568,7 +1568,7 @@ void change_endianness_utf16(const char16_t * input, size_t length, char16_t * o
 Base64
 -----
 
-We also support converting from [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary, and back. In particular, you can convert base64 inputs which contain ASCII spaces to binary.
+We also support converting from [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary, and back. In particular, you can convert base64 inputs which contain ASCII spaces to binary. We also support the base64 URL encoding alternative.
 
 Converting binary data to base64 always succeeds and is relatively simple:
 ```C++
@@ -1583,12 +1583,50 @@ we prune spaces, we may need to adjust the result size afterword.
 std::vector<char> buffer(simdutf::maximal_binary_length_from_base64(base64.data(), base64.size()));
 simdutf::result r = simdutf::base64_to_binary(base64.data(), base64.size(), buffer.data());
 if(r.error) {
-  // We have some error, r.count tells you where the error was encountered in the input
+  // We have some error. If the error is INVALID_BASE64_CHARACTER, r.count tells you
+  // where the error was encountered in the input. If the error is BASE64_INPUT_REMAINDER,
+  // a single valid base64 character remained, and r.count contains the number of bytes decoded.
 } else {
   buffer.resize(r.count); // resize the buffer according to actual number of bytes
 }
 ```
 
+In some instances, you may want to limit the size of the output further when decoding base64.
+For this purpose, you may use the `base64_to_binary_safe` functions. The functions may also
+be useful if you seek to decode the input into segments having a maximal capacity.
+
+
+```C++
+  size_t len = 72; // for simplicity we chose len divisible by 3
+  std::vector<char> base64(len, 'a'); // we want to decode 'aaaaa....'
+  std::vector<char> back((len + 3) / 4 * 3);
+  size_t limited_length = back.size() / 2; // Intentionally too small
+  // We proceed to decode half:
+  simdutf::result r = simdutf::base64_to_binary_safe(
+            base64.data(), base64.size(), back.data(), limited_length);
+  assert(r.error == simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+  // We decoded r.count base64 8-bit units to limited_length bytes
+  // Now let us decode the rest !!!
+  //
+  // We have read up to r.count in the input buffer and we have
+  // produced limited_length bytes.
+  //
+  size_t input_index = r.count;
+  size_t limited_length2 = back.size();
+  r = simdutf::base64_to_binary_safe(base64.data() + input_index,
+                                           base64.size() - input_index,
+                                           back.data(), limited_length2);
+  assert(r.error == simdutf::error_code::SUCCESS);
+  // We decoded r.count base64 8-bit units to limited_length2 bytes
+  // We are done
+  assert(limited_length2 + limited_length == (len + 3) / 4 * 3);
+```
+
+See our function specifications for more details.
+
+In other instances, you may receive your base64 inputs in 16-bit units (e.g., from UTF-16 strings):
+we have function overloads for these cases as well.
+
 Some users may want to decode the base64 inputs in chunks, especially when doing
 file or networking programming. These users should see `tools/fastbase64.cpp`, a command-line
 utility designed for as an example. It reads and writes base64 files using chunks of at most
@@ -1597,6 +1635,14 @@ a few tens of kilobytes.
 The specification of our base64 functions is as follows:
 
 ```C++
+
+// base64_options are used to specify the base64 encoding options.
+using base64_options = uint64_t;
+enum : base64_options {
+  base64_default = 0, /* standard base64 format */
+  base64_url = 1 /* base64url format*/
+};
+
 /**
  * Provide the maximal binary length in bytes given the base64 input.
  * In general, if the input contains ASCII spaces, the result will be less than
@@ -1604,10 +1650,21 @@ The specification of our base64 functions is as follows:
  *
  * @param input         the base64 input to process
  * @param length        the length of the base64 input in bytes
- * @return number of base64 bytes
+ * @return maximal number of binary bytes
  */
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
 
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input         the base64 input to process, in ASCII stored as 16-bit units
+ * @param length        the length of the base64 input in 16-bit units
+ * @return maximal number of binary bytes
+ */
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept;
+
 /**
  * Convert a base64 input to a binary ouput.
  *
@@ -1618,19 +1675,27 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * See https://infra.spec.whatwg.org/#forgiving-base64-decode
  *
  * This function will fail in case of invalid input. There are two possible reasons for
- * failure: the input is contains a number of base64 characters that when divided by 4, leaves
- * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
+ * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard
+ * the output.
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
  * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
  * If you fail to provide that much space, the function may cause a buffer overflow.
  *
  * @param input         the base64 string to process
  * @param length        the length of the string in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Provide the base64 length in bytes given the length of a binary input.
@@ -1649,9 +1714,74 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  * @param input         the binary to process
  * @param length        the length of the input in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return number of written bytes, will be equal to base64_length_from_binary(length)
  */
-size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
+size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
+
+/**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are two possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
+ * If you fail to provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 16-bit units
+ * @param length        the length of the string in 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
+ * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
+
+/**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are three possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a single remainder character (BASE64_INPUT_REMAINDER), the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer 
+ * is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
+ * and the number of units processed, see description of the parameters and returned value.
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard
+ * the output.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units.
+ * @param output        the pointer to buffer that can hold the conversion result.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
+ * @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of units processed if successful.
+ */
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
 
 ```
 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 40d641445..65d7e0eb9 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -53,6 +53,9 @@ if(Threads_FOUND)
   set_property(TARGET threaded PROPERTY CXX_STANDARD 17)
   set_property(TARGET threaded PROPERTY CXX_STANDARD_REQUIRED ON)
 endif(Threads_FOUND)
+
+option(SIMDUTF_BENCHMARK_BASE64 "Whether the base64 benchmarks are included as part of the CMake Build (requires C++17 or better)." ON)
+
 if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
   message(STATUS "Not building base64 benchmarks when using clang-cl due to build errors with the aklomp/base64 dependency.")
 else()
diff --git a/benchmarks/base64/CMakeLists.txt b/benchmarks/base64/CMakeLists.txt
index a866b9609..ce6624b5d 100644
--- a/benchmarks/base64/CMakeLists.txt
+++ b/benchmarks/base64/CMakeLists.txt
@@ -8,7 +8,9 @@ CPMAddPackage(
 
 
 add_executable(benchmark_base64 benchmark_base64.cpp)
+message(STATUS "The benchmark_base64 tool requires C++17. If your system does not support C++17, please set SIMDUTF_BENCHMARK_BASE64 to OFF.")
+# Build this benchmark as C++17 and treat the standard as a hard requirement.
+set_target_properties(benchmark_base64 PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON)
 
-target_link_libraries(benchmark_base64 PUBLIC simdutf)
 target_link_libraries(benchmark_base64 PUBLIC base64)
 target_link_libraries(benchmark_base64 PUBLIC simdutf::benchmarks::benchmark)
diff --git a/benchmarks/base64/benchmark_base64.cpp b/benchmarks/base64/benchmark_base64.cpp
index 579d1e198..cfdf2e30d 100644
--- a/benchmarks/base64/benchmark_base64.cpp
+++ b/benchmarks/base64/benchmark_base64.cpp
@@ -10,9 +10,11 @@
 #include <vector>
 
 #include "libbase64.h"
-#include "simdutf.h"
+#include "libbase64_spaces.h"
 #include "node_base64.h"
 
+#include "simdutf.h"
+
 #include "event_counter.h"
 #include <atomic>
 
@@ -34,7 +36,7 @@ bool is_space(char c) {
 // This is for reference only, do not use this function in production
 // system.
 int base64_decode_skip_spaces(const char *src, size_t srclen, char *out,
-                                     size_t *outlen) {
+                              size_t *outlen) {
   struct base64_state state;
   base64_stream_decode_init(&state, 0);
   const char *srcend = src + srclen;
@@ -65,7 +67,7 @@ int base64_decode_skip_spaces(const char *src, size_t srclen, char *out,
   return !state.bytes;
 }
 
-enum : uint8_t { roundtrip = 0, decode = 1, encode = 2 };
+enum : uint8_t { roundtrip = 0, decode = 1, encode = 2, bun = 3, roundtripurl = 4 };
 
 event_collector collector;
 
@@ -110,10 +112,13 @@ std::vector<char> read_file(const char *filename,
 void show_help() {
   printf("Usage: benchmark_base64 [options] file1 [file2 ...]\n");
   printf("Options:\n");
-  printf("  -h, --help     Show this help message and exit\n");
-  printf("  -d, --decode   Decode the input file\n");
-  printf("  -e, --encode   Encode the input file\n");
-  printf("  -r, --roundtrip   Roundtrip the input file\n");
+  printf("  -h, --help         Show this help message and exit\n");
+  printf("  -d, --decode       Decode the input file\n");
+  printf("  -e, --encode       Encode the input file\n");
+  printf("  -r, --roundtrip    Roundtrip the input file\n");
+  printf("  --roundtrip-url    Roundtrip the input file (URL)\n");
+  printf("  -b, --bench-bun    Bun benchmark\n");
+
   printf(" See https://github.com/lemire/base64data for test data.\n");
 }
 void pretty_print(size_t, size_t bytes, std::string name, event_aggregate agg) {
@@ -208,6 +213,33 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
   printf("# number of inputs: %zu\n", data.size());
 
   switch (mode) {
+
+  case roundtripurl: {
+    printf("# roundtrip (url)\n");
+    for (auto &e : simdutf::get_available_implementations()) {
+      if (!e->supported_by_runtime_system()) {
+        continue;
+      }
+      pretty_print(data.size(), volume, "simdutf::" + e->name(),
+                   bench([&data, &buffer1, &buffer2, &e]() {
+                     for (const std::vector<char> &source : data) {
+                       size_t base64_size = e->binary_to_base64(
+                           source.data(), source.size(), buffer1.data(), simdutf::base64_url);
+                       auto err = e->base64_to_binary(
+                           buffer1.data(), base64_size, buffer2.data(), simdutf::base64_url);
+                       if (err.error) {
+                         std::cerr << "Error:  at position " << err.count
+                                   << std::endl;
+                       } else if (err.count != source.size()) {
+                         std::cerr << "Error: " << err.count
+                                   << " bytes decoded, expected "
+                                   << source.size() << std::endl;
+                       }
+                     }
+                   }));
+    }
+    break;
+  }
   case roundtrip: {
     printf("# roundtrip\n");
     pretty_print(
@@ -259,7 +291,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
     bool spaces = contains_spaces(data);
     if (spaces) {
       printf("# the base64 data contains spaces, so we cannot use straigth "
-             "libbase64::base64_decode\n");
+             "libbase64::base64_decode directly\n");
     } else {
       pretty_print(data.size(), volume, "libbase64",
                    bench([&data, &buffer1, &buffer2]() {
@@ -277,14 +309,29 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
                      }
                    }));
     }
-    pretty_print(data.size(), volume, "node",
-                 bench([&data, &buffer1, &buffer2]() {
-                   for (const std::vector<char> &source : data) {
-                     int result = node::base64_decode(buffer1.data(), buffer1.size(),
-                                    source.data(), source.size());
-                     (void) result;
-                   }
-                 }));
+    pretty_print(
+        data.size(), volume, "libbase64_space_decode",
+        bench([&data, &buffer1, &buffer2]() {
+          for (const std::vector<char> &source : data) {
+
+            size_t outlen;
+            bool ok = libbase64_space_decode(source.data(), source.size(),
+                                             buffer1.data(), &outlen);
+            if (!ok) {
+              std::cerr << "Error: "
+                        << " failed to decode base64 " << std::endl;
+              throw std::runtime_error("Error: failed to decode base64 ");
+            }
+          }
+        }));
+    pretty_print(
+        data.size(), volume, "node", bench([&data, &buffer1, &buffer2]() {
+          for (const std::vector<char> &source : data) {
+            int result = node::base64_decode(buffer1.data(), buffer1.size(),
+                                             source.data(), source.size());
+            (void)result;
+          }
+        }));
     for (auto &e : simdutf::get_available_implementations()) {
       if (!e->supported_by_runtime_system()) {
         continue;
@@ -316,7 +363,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
     printf("# encode\n");
     volatile size_t base64_size;
     pretty_print(data.size(), volume, "libbase64",
-                 bench([&data, &buffer1, &buffer2, &base64_size]() {
+                 bench([&data, &buffer1, &base64_size]() {
                    for (const std::vector<char> &source : data) {
                      size_t outlen;
                      base64_encode(source.data(), source.size(), buffer1.data(),
@@ -329,7 +376,7 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
         continue;
       }
       pretty_print(data.size(), volume, "simdutf::" + e->name(),
-                   bench([&data, &buffer1, &buffer2, &e, &base64_size]() {
+                   bench([&data, &buffer1, &e, &base64_size]() {
                      for (const std::vector<char> &source : data) {
                        base64_size = e->binary_to_base64(
                            source.data(), source.size(), buffer1.data());
@@ -341,6 +388,63 @@ void bench(std::vector<std::vector<char>> &data, uint8_t mode) {
   }
 }
 
+int bench_bun() {
+  /**
+   * Encoding-only benchmark modeled after the Bun snippet found at
+   * https://github.com/oven-sh/bun/blob/main/bench/snippets/buffer-to-string.mjs
+   *
+   * const bigBuffer = Buffer.from("hello world".repeat(10000));
+   * const converted = bigBuffer.toString("base64");
+   * const uuid = crypto.randomBytes(16);
+   *
+   * bench(`Buffer(${bigBuffer.byteLength}).toString('base64')`, () => {
+   * return bigBuffer.toString("base64");
+   * });
+   *
+   * bench(`Buffer(${uuid.byteLength}).toString('base64')`, () => {
+   *  return uuid.toString("base64");
+   * });
+   */
+  printf("# benching bun (essentially an encoding bench)\n");
+  std::string bigBuffer = "hello world";
+  bigBuffer.reserve(10000 * bigBuffer.size());
+  for (size_t i = 1; i < 10000; i++) {
+    bigBuffer += "hello world";
+  }
+  std::string crypto;
+  for (size_t i = 0; i < 16; i++) {
+    crypto += static_cast<char>(rand()); // explicit narrowing to one byte
+  }
+  const std::vector<std::pair<std::string, std::string>> tests = {
+      {"big hello world", bigBuffer}, {"random 16 bytes", crypto}};
+  // Each test is a (label, binary input) pair; every input is encoded in turn.
+  for (const auto &test : tests) {
+    printf("# %s\n", test.first.c_str());
+    const std::string &source = test.second; // bind by reference, no copy
+    volatile size_t base64_size;
+    std::vector<char> buffer1(
+        simdutf::base64_length_from_binary(source.size()));
+    pretty_print(1, source.size(), "libbase64",
+                 bench([&source, &buffer1, &base64_size]() {
+                   size_t outlen;
+                   base64_encode(source.data(), source.size(), buffer1.data(),
+                                 &outlen, 0);
+                   base64_size = outlen;
+                 }));
+    for (auto &e : simdutf::get_available_implementations()) {
+      if (!e->supported_by_runtime_system()) {
+        continue;
+      }
+      pretty_print(1, source.size(), "simdutf::" + e->name(),
+                   bench([&source, &buffer1, &e, &base64_size]() {
+                     base64_size = e->binary_to_base64(
+                         source.data(), source.size(), buffer1.data());
+                   }));
+    }
+  }
+  return EXIT_SUCCESS;
+}
+
 int main(int argc, char **argv) {
   printf("# current system detected as %s.\n",
          simdutf::get_active_implementation()->name().c_str());
@@ -363,10 +467,17 @@ int main(int argc, char **argv) {
       mode = encode;
     } else if ((arg == "-r") || (arg == "--roundtrip")) {
       mode = roundtrip;
+    } else if (arg == "--roundtrip-url") {
+      mode = roundtripurl;
+    } else if ((arg == "-b") || (arg == "--bench-bun") || (arg == "--bun")) {
+      mode = bun;
     } else {
       arguments.push_back(std::move(arg));
     }
   }
+  if (mode == bun) {
+    return bench_bun();
+  }
   auto return_value = EXIT_SUCCESS;
   std::vector<std::vector<char>> input;
   printf("# loading files: ");
diff --git a/benchmarks/base64/libbase64_spaces.h b/benchmarks/base64/libbase64_spaces.h
new file mode 100644
index 000000000..e368221e8
--- /dev/null
+++ b/benchmarks/base64/libbase64_spaces.h
@@ -0,0 +1,71 @@
+#ifndef SIMDUTF_BENCHMARK_LIBBASE64_SPACES_H
+#define SIMDUTF_BENCHMARK_LIBBASE64_SPACES_H
+
+// Whitespace-tolerant decoding built on the aklomp/base64 streaming API, for
+// use by benchmark_base64.cpp.
+//
+// "libbase64.h" must be included before this header: it declares
+// struct base64_state, base64_stream_decode_init() and base64_stream_decode().
+#include <stddef.h>
+
+// Returns the offset of the first '\n', '\r', ' ' or '\t' within the first
+// `avail` bytes of `p`, or `avail` when there is none.
+// https://github.com/aklomp/base64/blob/b20a31a997e0b48274fa09e58b65ee9202531e4f/bin/base64.c#L392
+static inline size_t libbase64_find_space(const char *p, const size_t avail) {
+  for (size_t len = 0; len < avail; len++) {
+    if (p[len] == '\n' || p[len] == '\r' || p[len] == ' ' || p[len] == '\t') {
+      return len;
+    }
+  }
+
+  return avail;
+}
+
+// Decodes the base64 text in [start, start + avail) into outbuf, skipping the
+// whitespace characters recognized by libbase64_find_space. On return, *outlen
+// holds the total number of bytes written to outbuf. Returns false as soon as
+// base64_stream_decode reports an error on a chunk, true otherwise.
+// No output capacity is checked here: the caller must size outbuf adequately.
+//
+// Inspired by
+// https://github.com/aklomp/base64/blob/b20a31a997e0b48274fa09e58b65ee9202531e4f/bin/base64.c#L405
+static bool libbase64_space_decode(const char *start, size_t avail, char *outbuf,
+                   size_t *outlen) {
+  struct base64_state state;
+  *outlen = 0;
+
+  // Initialize the decoder's state structure.
+  base64_stream_decode_init(&state, 0);
+
+  while (avail > 0) {
+    size_t len = libbase64_find_space(start, avail);
+    if (len == 0) {
+      // The current character is whitespace: skip it.
+      start++;
+      avail--;
+      continue;
+    }
+
+    // Decode the whitespace-free chunk. The byte count is kept in a local with
+    // a distinct name so that it does not shadow the outlen parameter.
+    size_t chunk_len = 0;
+    if (base64_stream_decode(&state, start, len, outbuf, &chunk_len) == 0) {
+      // decoding error
+      return false;
+    }
+
+    // Advance the output pointer and accumulate the total size for the caller.
+    outbuf += chunk_len;
+    *outlen += chunk_len;
+    if (avail == len) {
+      break;
+    }
+
+    // Skip the chunk and the whitespace character that terminated it.
+    start += len + 1;
+    avail -= len + 1;
+  }
+  return true;
+}
+
+#endif // SIMDUTF_BENCHMARK_LIBBASE64_SPACES_H
diff --git a/include/simdutf/error.h b/include/simdutf/error.h
index 0090ff1d6..a65303ce3 100644
--- a/include/simdutf/error.h
+++ b/include/simdutf/error.h
@@ -16,6 +16,7 @@ enum error_code {
                 // there must be no surrogate at all (Latin1)
   INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid base64 string.
   BASE64_INPUT_REMAINDER, // The base64 input terminates with a single character, excluding padding (=).
+  OUTPUT_BUFFER_TOO_SMALL, // The provided buffer is too small.
   OTHER         // Not related to validation/transcoding.
 };
 
diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
index 27cb6027b..25342d81b 100644
--- a/include/simdutf/implementation.h
+++ b/include/simdutf/implementation.h
@@ -1380,6 +1380,12 @@ simdutf_warn_unused size_t trim_partial_utf16le(const char16_t* input, size_t le
  */
 simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t length);
 
+// base64_options specify the base64 encoding options; a value is passed as the `options` argument of the base64 functions (e.g., base64_to_binary).
+using base64_options = uint64_t;
+enum : base64_options {
+  base64_default = 0, /* standard base64 format (used when no option is given) */
+  base64_url = 1 /* base64url format */
+};
 
 /**
  * Provide the maximal binary length in bytes given the base64 input.
@@ -1388,10 +1394,21 @@ simdutf_warn_unused size_t trim_partial_utf16(const char16_t* input, size_t leng
  *
  * @param input         the base64 input to process
  * @param length        the length of the base64 input in bytes
- * @return number of base64 bytes
+ * @return maximum number of binary bytes
  */
 simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;
 
+/**
+ * Provide the maximal binary length in bytes given the base64 input.
+ * In general, if the input contains ASCII spaces, the result will be less than
+ * the maximum length.
+ *
+ * @param input         the base64 input to process, in ASCII stored as 16-bit units
+ * @param length        the length of the base64 input in 16-bit units
+ * @return maximal number of binary bytes
+ */
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept;
+
 /**
  * Convert a base64 input to a binary ouput.
  *
@@ -1402,19 +1419,24 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
  * See https://infra.spec.whatwg.org/#forgiving-base64-decode
  *
  * This function will fail in case of invalid input. There are two possible reasons for
- * failure: the input is contains a number of base64 characters that when divided by 4, leaves
- * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
  * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
  *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
  * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
  * If you fail to provide that much space, the function may cause a buffer overflow.
  *
  * @param input         the base64 string to process
  * @param length        the length of the string in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
  */
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
 
 /**
  * Provide the base64 length in bytes given the length of a binary input.
@@ -1433,9 +1455,74 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
  * @param input         the binary to process
  * @param length        the length of the input in bytes
  * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
  * @return number of written bytes, will be equal to base64_length_from_binary(length)
  */
-size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
+size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;
+
+/**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are two possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
+ * If you fail to provide that much space, the function may cause a buffer overflow.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 16-bit units
+ * @param length        the length of the string in 16-bit units
+ * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful.
+ */
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default)  noexcept;
+
+/**
+ * Convert a base64 input to a binary output.
+ *
+ * This function follows the WHATWG forgiving-base64 format, which means that it will
+ * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+ * equal signs at the end) or an unpadded input (without any equal signs at the end).
+ *
+ * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+ *
+ * This function will fail in case of invalid input. There are three possible reasons for
+ * failure: the input contains a number of base64 characters that when divided by 4, leaves
+ * a single remainder character (BASE64_INPUT_REMAINDER), the input contains a character
+ * that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
+ * is too small (OUTPUT_BUFFER_TOO_SMALL).
+ *
+ * When OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
+ * and the number of units processed, see description of the parameters and returned value.
+ *
+ * When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
+ * where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
+ * r.count contains the number of bytes decoded.
+ *
+ * The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard
+ * the output.
+ *
+ * @param input         the base64 string to process, in ASCII stored as 8-bit or 16-bit units
+ * @param length        the length of the string in 8-bit or 16-bit units.
+ * @param output        the pointer to buffer that can hold the conversion result.
+ * @param outlen        the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
+ * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
+ * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of units processed if successful.
+ */
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
 
 /**
  * An implementation of simdutf for a particular CPU architecture.
@@ -2504,10 +2591,21 @@ class implementation {
    *
    * @param input         the base64 input to process
    * @param length        the length of the base64 input in bytes
-   * @return number of base64 bytes
+   * @return maximal number of binary bytes
    */
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept = 0;
 
+  /**
+   * Provide the maximal binary length in bytes given the base64 input.
+   * In general, if the input contains ASCII spaces, the result will be less than
+   * the maximum length.
+   *
+   * @param input         the base64 input to process, in ASCII stored as 16-bit units
+   * @param length        the length of the base64 input in 16-bit units
+   * @return maximal number of binary bytes
+   */
+  simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept = 0;
+
   /**
    * Convert a base64 input to a binary ouput.
    *
@@ -2518,8 +2616,8 @@ class implementation {
    * See https://infra.spec.whatwg.org/#forgiving-base64-decode
    *
    * This function will fail in case of invalid input. There are two possible reasons for
-   * failure: the input is contains a number of base64 characters that when divided by 4, leaves
-   * a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+   * failure: the input contains a number of base64 characters that when divided by 4, leaves
+   * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
    * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
    *
    * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
@@ -2528,9 +2626,35 @@ class implementation {
    * @param input         the base64 string to process
    * @param length        the length of the string in bytes
    * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
    * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
    */
-  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept = 0;
+  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
+
+  /**
+   * Convert a base64 input to a binary output.
+   *
+   * This function follows the WHATWG forgiving-base64 format, which means that it will
+   * ignore any ASCII spaces in the input. You may provide a padded input (with one or two
+   * equal signs at the end) or an unpadded input (without any equal signs at the end).
+   *
+   * See https://infra.spec.whatwg.org/#forgiving-base64-decode
+   *
+   * This function will fail in case of invalid input. There are two possible reasons for
+   * failure: the input contains a number of base64 characters that when divided by 4, leaves
+   * a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
+   * that is not a valid base64 character (INVALID_BASE64_CHARACTER).
+   *
+   * You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
+   * If you fail to provide that much space, the function may cause a buffer overflow.
+   *
+   * @param input         the base64 string to process, in ASCII stored as 16-bit units
+   * @param length        the length of the string in 16-bit units
+   * @param output        the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
+   * @return a result pair struct (of type simdutf::error containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful.
+   */
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
   /**
    * Provide the base64 length in bytes given the length of a binary input.
@@ -2549,9 +2673,10 @@ class implementation {
    * @param input         the binary to process
    * @param length        the length of the input in bytes
    * @param output        the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
+   * @param options       the base64 options to use, can be base64_default or base64_url, is base64_default by default.
    * @return number of written bytes, will be equal to base64_length_from_binary(length)
    */
-  virtual size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept = 0;
+  virtual size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) const noexcept = 0;
 
 
 protected:
diff --git a/scripts/Makefile b/scripts/Makefile
new file mode 100644
index 000000000..ac88a9b37
--- /dev/null
+++ b/scripts/Makefile
@@ -0,0 +1,2 @@
+lint:
+	python -m mypy *.py
diff --git a/scripts/base64/Makefile b/scripts/base64/Makefile
new file mode 100644
index 000000000..ac88a9b37
--- /dev/null
+++ b/scripts/base64/Makefile
@@ -0,0 +1,2 @@
+lint:
+	python -m mypy *.py
diff --git a/scripts/base64/README.md b/scripts/base64/README.md
new file mode 100644
index 000000000..574806304
--- /dev/null
+++ b/scripts/base64/README.md
@@ -0,0 +1,2 @@
+The scripts in this directory are for reference only. They were used to check
+the algorithms we are using.
\ No newline at end of file
diff --git a/scripts/base64/avx512.py b/scripts/base64/avx512.py
new file mode 100644
index 000000000..21788b528
--- /dev/null
+++ b/scripts/base64/avx512.py
@@ -0,0 +1,76 @@
+lookup_0 = [0 for i in range(64)]
+lookup_1 = [0 for i in range(64)]
+for i in range(64):
+    lookup_0[i] = 0x80
+    lookup_1[i] = 0x80
+lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+for i in range(64):
+    val = ord(lookup[i])
+    bit6 = val & 0x40
+    bits05 = val & 0x3f
+    if bit6:
+        lookup_1[bits05] = i
+    else:
+        lookup_0[bits05] = i
+allowed = "\t\r\n "
+for z in allowed:
+    lookup_0[ord(z)] = 0xff
+def sign8(x):
+    if x >= 128:
+        return x - 256
+    return x
+lookup_0.reverse()
+lookup_1.reverse()
+print("lookup0:")
+print(", ".join([str(sign8(i)) for i in lookup_0]))
+print("lookup1:")
+print(", ".join([str(sign8(i)) for i in lookup_1]))
+lookupn = [0 for i in range(64)]
+output = 0
+for ifrom in range(16):
+    lookupn[ifrom*4 + 0] = output + 3
+    lookupn[ifrom*4 + 1] = output + 2
+    lookupn[ifrom*4 + 2] = output + 1
+    lookupn[ifrom*4 + 3] = output + 0
+    output += 4
+lookupn.reverse()
+print("reverse:")
+print(", ".join([str(i) for i in lookupn]))
+
+print("====")
+
+lookup_0 = [0 for i in range(64)]
+lookup_1 = [0 for i in range(64)]
+for i in range(64):
+    lookup_0[i] = 0x80
+    lookup_1[i] = 0x80
+lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+for i in range(64):
+    val = ord(lookup[i])
+    bit6 = val & 0x40
+    bits05 = val & 0x3f
+    if bit6:
+        lookup_1[bits05] = i
+    else:
+        lookup_0[bits05] = i
+allowed = "\0\t\r\n "
+for z in allowed:
+    lookup_0[ord(z)] = 0xff
+
+lookup_0.reverse()
+lookup_1.reverse()
+print("lookup0:")
+print(", ".join([str(sign8(i)) for i in lookup_0]))
+print("lookup1:")
+print(", ".join([str(sign8(i)) for i in lookup_1]))
+lookupn = [0 for i in range(64)]
+output = 0
+for ifrom in range(16):
+    lookupn[ifrom*4 + 0] = output + 3
+    lookupn[ifrom*4 + 1] = output + 2
+    lookupn[ifrom*4 + 2] = output + 1
+    lookupn[ifrom*4 + 3] = output + 0
+    output += 4
+lookupn.reverse()
+print("reverse:")
+print(", ".join([str(i) for i in lookupn]))
\ No newline at end of file
diff --git a/scripts/base64/neon_decode.py b/scripts/base64/neon_decode.py
new file mode 100644
index 000000000..24fe1c03e
--- /dev/null
+++ b/scripts/base64/neon_decode.py
@@ -0,0 +1,130 @@
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
+spaces=' \t\n\r'
+lut_lo = [0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4]
+lut_hi = [0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20]
+roll = [0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0]
+def decode(s):
+    low = s & 0xf
+    high = s >> 4
+    m = lut_lo[low] & lut_hi[high]
+    if(m > 0x3):
+        return (m, None)
+    if s == 0x2f:
+        off = roll[high - 1]
+    else:
+        off = roll[high]
+    return (m,(s + off)&0xff)
+
+for i in range(256):
+    m,d = decode(i)
+    if d is None:
+        assert t.find(chr(i)) == -1
+        assert spaces.find(chr(i)) == -1
+        continue
+    if m == 0:
+        assert d >= 0
+        # we must have a base64 element
+        v = t.find(chr(i))
+        assert v == d
+    else:
+        # we must have a space
+        v = spaces.find(chr(i))
+        assert v >= 0
+
+
+
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
+spaces=' \t\n\r'
+
+lut_lo = [0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb0]
+lut_hi = [0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20]
+roll = [0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0]
+t='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'
+spaces=' \t\n\r' ## ['0x20', '0x9', '0xa', '0xd']
+
+
+lut_lo = [0x0 for i in range(16)]
+lut_hi = [0x0 for i in range(16)]
+#roll = [0 for i in range(16)]
+
+#0x00 are forbidden except for \t \n \r which go to one
+lut_hi[0] = 0x11
+for z in range(16):
+    if '\t\n\r'.find(chr(z)) != -1:
+        lut_lo[z & 0xf] = 0x1 # allowed
+    else:
+        lut_lo[z] = 0x10 # forbidden
+#0x10 and 0x80 all forbidden
+lut_hi[0x1] = 0x20
+for z in range(0x8, 16):
+    lut_hi[z] = 0x20
+#lut_hi[0x8] = 0x20
+
+for z in range(16):
+    lut_lo[z] |= 0x20
+
+#0x20 selective
+lut_hi[0x2] = 0x42
+for z in range(16):
+    if z == 0:
+        lut_lo[z] |= 0x2
+    elif z != 0xd:
+        lut_lo[z] |= 0x40
+
+
+#0x30 numbers
+lut_hi[0x3] = 0x80
+for z in range(10,16):
+    lut_lo[z] |= 0x80
+
+#0x40, 0x60 letters
+lut_hi[0x4] = 0x8
+lut_hi[0x6] = 0x8
+lut_lo[0] |= 0x8
+
+#0x7 letters
+#0x5 letters
+lut_hi[0x5] |= 0x4
+lut_hi[0x7] |= 0x4
+for i in range(0xb,16):
+    lut_lo[i] |= 0x4
+
+
+
+def decodes(s):
+    low = s & 0xf
+    high = s >> 4
+    m = lut_lo[low] & lut_hi[high]
+    is_underscore = s == 0x5f
+    if(is_underscore):
+        m = 0
+        high = 0
+    if(m > 0x3):
+        return (m, None)
+    if s == 0x2d:
+        off = roll[high - 1]
+    else:
+        off = roll[high]
+    return (m,(s + off)&0xff)
+print(",".join([hex(c) for c in lut_lo]))
+print(",".join([hex(c) for c in lut_hi]))
+print(",".join([hex(c) for c in roll]))
+
+
+for i in range(256):
+    m,d = decodes(i)
+    if d is None:
+        assert t.find(chr(i)) == -1
+        assert spaces.find(chr(i)) == -1
+        continue
+    if m == 0:
+        assert d >= 0
+        # we must have a base64 element
+        v = t.find(chr(i))
+        if(v != d): 
+            print(hex(i), chr(i), v, d)
+        #assert v == d
+    else:
+        # we must have a space
+        v = spaces.find(chr(i))
+        assert v >= 0
diff --git a/scripts/base64/sse.py b/scripts/base64/sse.py
new file mode 100644
index 000000000..aa7b4f75a
--- /dev/null
+++ b/scripts/base64/sse.py
@@ -0,0 +1,252 @@
+import sys
+delta_asso = [0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F]
+check_asso = [0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F]
+
+delta_values =[(0x00), (0x00), (0x00), (0x13), (0x04), (0xBF), (0xBF), (0xB9), (0xB9), (0x00), (0x10), (0xC3), (0xBF), (0xBF), (0xB9), (0xB9)]
+check_values = [(0x80), (0x80), (0x80), (0x80), (0xCF), (0xBF), (0xD5), (0xA6), (0xB5), (0x86), (0xD1), (0x80), (0xB1), (0x80), (0x91), (0x80)]
+
+
+valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+
+def safechr(i):
+    if i < 32:
+        return '.'
+    if i > 127:
+        return '?'
+    return chr(i)
+
+def safehex(x):
+    return "0x{0:2x}".format(x)
+
+def to_signed(x):
+    if(x >= 128):
+        return x - 256
+    return x
+
+def to_unsigned(x):
+    if(x < 0):
+        return x + 256
+    return x
+
+def sat(x, y):
+    x = to_signed(x)
+    y = to_signed(y)
+    z = x + y
+    if(z > 127):
+        return 127
+    if(z < -128):
+        return to_unsigned(-128)
+    return to_unsigned(z)
+
+def lookup(table, index):
+    print("looking up ", hex(index))
+    if(index >= 128):
+        return 0
+    return table[index&0xf]
+
+
+
+def quietlookup(table, index):
+    if(index >= 128):
+        return 0
+    return table[index&0xf]
+
+def process(src):
+    shifted = (src >> 3)%256
+    delta_hash = (lookup(delta_asso,src) + shifted + 1) >> 1
+    check_hash = (lookup(check_asso,src) + shifted + 1) >> 1
+    out = sat(lookup(delta_values,delta_hash), src)
+    chk = sat(lookup(check_values,check_hash), src)
+    mask = chk & 0x80
+    return (out, mask)
+
+def processquiet(src):
+    shifted = (src >> 3)%256
+    delta_hash = (quietlookup(delta_asso,src) + shifted + 1) >> 1
+    check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+    out = sat(quietlookup(delta_values,delta_hash), src)
+    chk = sat(quietlookup(check_values,check_hash), src)
+    mask = chk & 0x80
+    return (out, mask)
+
+def is_ok(i):
+    out, mask = processquiet(i)
+    if mask == 0:
+        return 1
+    return 0
+
+def computestring():
+    s = ""
+    for i in range(256):
+        out, mask = processquiet(i)
+        if(mask == 0):
+            s +=  safechr(i)
+    return s
+print(computestring() + " " + str(len(computestring())))
+
+def print_layout():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in range(16):
+        if check_hash in t:
+            off = quietlookup(check_values,check_hash)
+            print(hex(check_hash), hex(off), end="")
+            print("\t", " ".join(["   "+safechr(c) for c in t[check_hash]]))
+        else:
+            continue
+
+
+def is_valid():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        if check_hash in t:
+            array = t[check_hash]
+            i = 0
+            while i < len(array) and valid.find(chr(array[i])) == -1:
+                i += 1
+            while i < len(array) and valid.find(chr(array[i])) != -1:
+                i += 1
+            while i < len(array) and array[i] >= 128:
+                i += 1
+            if i < len(array):
+                return False
+        else:
+            continue
+    return True
+
+print_layout()
+print(is_valid())
+
+valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+print("----")
+def fun_adjust():
+    for zz in range(256):
+        check_asso[ord('-')&0xf] = zz
+        for yy in range(256):
+            check_asso[ord('_')&0xf] = yy
+            if(is_valid()):
+                print("----")
+                print_layout()
+                print(is_valid())
+                print("found")
+                return
+fun_adjust()
+            #sys.exit(0)
+
+def adjust(array, start, end, check_hash):
+    """Store in check_values[check_hash & 0xf] the first j in 0..255 for which
+    sat(j, array[i]) has its high bit clear exactly when start <= i < end.
+    Raises RuntimeError when no such j exists."""
+    for j in range(256):
+        is_ok = True
+        for i in range(len(array)):
+            valid = (sat(j,array[i])&0x80 == 0) # stands in for sat(quietlookup(check_values,check_hash), src)
+            should_be_valid = (i>=start and i < end)
+            is_ok = is_ok and (valid == should_be_valid)
+        if(is_ok):
+            check_values[check_hash&0xf] = j
+            return
+    # Raising a plain str is itself a TypeError in Python 3, so raise a real exception.
+    raise RuntimeError("unexpected")
+
+def process_explicit():
+    t={}
+    for i in range(256):
+        src = i
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(check_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        if check_hash in t:
+            array = t[check_hash]
+            i = 0
+            while i < len(array) and valid.find(chr(array[i])) == -1:
+                i += 1
+            if i < len(array) and valid.find(chr(array[i])) != -1:
+                start = i
+                while i < len(array) and valid.find(chr(array[i])) != -1:
+                    i += 1
+                end = i
+                adjust(array, start, end, check_hash)
+        else:
+            continue
+    return True
+print("process")
+process_explicit()
+print("string")
+print(computestring()+ " "+str(len(computestring())))
+
+for c in valid:
+    print(c,processquiet(ord(c)))
+
+def examine():
+    t={}
+    for i in valid:
+        src = ord(i)
+        shifted = (src >> 3)%256
+        check_hash = (quietlookup(delta_asso,src) + shifted + 1) >> 1
+        if check_hash not in t:
+            t[check_hash] = []
+        t[check_hash].append(i)
+    for check_hash in t.keys():
+        print(check_hash, t[check_hash])
+    return True
+examine()
+
+delta_values[10] += 1 
+
+delta_values[13] += 33 
+
+for c in valid:
+    print(c,processquiet(ord(c)))
+
+
+
+def casthex(v):
+    if(v >= 0x80):
+        return "uint8_t("+"0x{:X}".format(v)+")"
+    return "0x{:X}".format(v)
+def printme(c):
+    print(",".join([casthex(i) for i in c]))
+print("delta_asso")
+printme(delta_asso)
+print("check_asso")
+printme(check_asso)
+print("delta_values")
+printme(delta_values)
+print("check_values")
+printme(check_values)
+
+def processverbose(src):
+    print("processing ", hex(src))
+    shifted = (src >> 3)%256
+    print("shifted ", hex(shifted))
+    delta_hash = (lookup(delta_asso,src) + shifted + 1) >> 1
+    print("delta_hash ", hex(delta_hash))
+    check_hash = (lookup(check_asso,src) + shifted + 1) >> 1
+    print("check_hash ", hex(check_hash))
+    out = sat(lookup(delta_values,delta_hash), src)
+    print("out ", hex(out))
+    chk = sat(lookup(check_values,check_hash), src)
+    print("chk ", hex(chk))
+
+    mask = chk & 0x80
+    return (out, mask)
+processverbose(ord('-'))
+
+print(computestring()+ " "+str(len(computestring())))
diff --git a/scripts/base64/table.py b/scripts/base64/table.py
new file mode 100644
index 000000000..d99ed4e76
--- /dev/null
+++ b/scripts/base64/table.py
@@ -0,0 +1,42 @@
+import base64
+# Default (standard) base64 table: to generate the standard tables, uncomment the next line and comment out the base64url table below it.
+#t=[255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]
+
+t=[255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 64, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 64, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63, 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255]
+def formula(a, b, c, d):
+    if(a >= 64 or b >= 64 or c >= 64 or d >= 64):
+        return 0x01ffffff
+    z =[ ((a * 4) + (b // 16))% 256, ((b * 16) % 256 + (c // 4))% 256 , ((c * 64) % 256 + d)% 256 ]
+    return z[0] + (z[1] << 8) + (z[2] << 16)
+
+acc = "const uint32_t d0[256] = {"
+for i in range(256):
+    a = formula(t[i], 0, 0, 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d1[256] = {"
+for i in range(256):
+    a = formula(0, t[i], 0, 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d2[256] = {"
+for i in range(256):
+    a = formula(0, 0, t[i], 0)
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
+
+acc = "const uint32_t d3[256] = {"
+for i in range(256):
+    a = formula(0, 0, 0, t[i])
+    acc += "0x"+format(a, '08x')+","
+acc=acc[:-1] 
+acc+= "};"
+print(acc)
\ No newline at end of file
diff --git a/scripts/create_latex_table.py b/scripts/create_latex_table.py
index d396bb9c4..708de9aef 100755
--- a/scripts/create_latex_table.py
+++ b/scripts/create_latex_table.py
@@ -23,13 +23,16 @@
 for line in content:
 
     if line.startswith("convert"):
-        codec = re.search(r"\+(\w+)",line).group(1)
-        rfile = re.search(r"/(\w+)[\.-]",line).group(1)
-        currentrow["codec"] = codec
-        currentrow["dataset"] = rfile
-        datasets.add(rfile)
-        codecs.add(codec)
-
+        m = re.search(r"\+(\w+)",line)
+        if m is not None:
+            codec = m.group(1)
+            currentrow["codec"] = codec
+            codecs.add(codec)
+        m = re.search(r"/(\w+)[\.-]",line)
+        if m is not None:
+            rfile = m.group(1)
+            currentrow["dataset"] = rfile
+            datasets.add(rfile)
     m = re.search(r"\s([\.0-9]+) Gc/s",line)
     if m:
         v = float(m.group(1))
@@ -50,8 +53,8 @@ def get(d, k):
     for x in table:
         if(x['codec'] == k) and (x['dataset'] == d):
             return x["result"]
-datasets=sorted(datasets)
-for dataset in datasets:
+datasetsorted=sorted(datasets)
+for dataset in datasetsorted:
     s = dataset
     for k in kernels:
       s +=  " & " + get(dataset, k)
diff --git a/scripts/release.py b/scripts/release.py
index 944798413..e3062ae43 100755
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -124,16 +124,19 @@ def topaddedversionstring(major, minor, rev):
 with open (cmakefile, 'rt') as myfile:
     for line in myfile:
         m = pattern.search(line)
-        if m != None:
+        if m is not None:
             sonumber = int(m.group(1))
             break
 print("so library number "+str(sonumber))
 
 if(atleastminor):
     print("Given that we have a minor revision, it seems necessary to bump the so library number")
+    if sonumber is None:
+        print("I cannot find the so library number in the CMakeLists.txt file")
+        sys.exit(-1)
     sonumber += 1
 
-for line in fileinput.input(cmakefile, inplace=1, backup='.bak'):
+for line in fileinput.input(cmakefile, inplace=True, backup='.bak'):
     line = re.sub(r'  VERSION \d+\.\d+\.\d+','  VERSION '+newmajorversionstring+'.'+mewminorversionstring+'.'+newrevversionstring, line.rstrip())
     line = re.sub(r'SIMDUTF_LIB_VERSION "\d+\.\d+\.\d+','SIMDUTF_LIB_VERSION "'+str(sonumber)+".0.0", line)
     line = re.sub(r'set\(SIMDUTF_LIB_SOVERSION "\d+"','set(SIMDUTF_LIB_SOVERSION \"'+str(sonumber)+'\"', line)
@@ -143,7 +146,7 @@ def topaddedversionstring(major, minor, rev):
 
 
 doxyfile = maindir + os.sep + "Doxyfile"
-for line in fileinput.input(doxyfile, inplace=1, backup='.bak'):
+for line in fileinput.input(doxyfile, inplace=True, backup='.bak'):
     line = re.sub(r'PROJECT_NUMBER         = "\d+\.\d+\.\d+','PROJECT_NUMBER         = "'+newversionstring, line.rstrip())
     print(line)
 print("modified "+doxyfile+", a backup was made")
@@ -165,7 +168,7 @@ def topaddedversionstring(major, minor, rev):
 readmefile = maindir + os.sep + "README.md"
 
 
-for line in fileinput.input(readmefile, inplace=1, backup='.bak'):
+for line in fileinput.input(readmefile, inplace=True, backup='.bak'):
     line = re.sub(r'   wget https://github.com/simdutf/simdutf/releases/download/v\d+\.\d+\.\d+/singleheader.zip','   wget https://github.com/simdutf/simdutf/releases/download/v'+newmajorversionstring+'.'+mewminorversionstring+'.'+newrevversionstring+'/singleheader.zip', line.rstrip())
     line = re.sub(r'https://github.com/simdutf/simdutf/releases/download/v\d+\.\d+\.\d+/singleheader.zip','https://github.com/simdutf/simdutf/releases/download/v'+newmajorversionstring+'.'+mewminorversionstring+'.'+newrevversionstring+'/singleheader.zip', line.rstrip())
     print(line)
@@ -178,11 +181,11 @@ def topaddedversionstring(major, minor, rev):
-if m == None:
+if m is None:  # identity test, consistent with the other None checks in this script
     print('I cannot find a link to the API documentation in your README')
 else:
     detectedreadme = m.group(1)
     print("found a link to your API documentation in the README file: "+detectedreadme+" ("+toversionstring(*newversion)+")")
     if(atleastminor):
-       if(detectedreadme != toversionstring(*newversion)):
-           print(colored(255, 0, 0, "Consider updating the readme link to "+toversionstring(*newversion)))
+        if(detectedreadme != toversionstring(*newversion)):
+            print(colored(255, 0, 0, "Consider updating the readme link to "+toversionstring(*newversion)))
 
 
 
diff --git a/src/arm64/arm_base64.cpp b/src/arm64/arm_base64.cpp
index 2113a2cec..35a0e3d78 100644
--- a/src/arm64/arm_base64.cpp
+++ b/src/arm64/arm_base64.cpp
@@ -26,7 +26,8 @@
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
 
-size_t encode_base64(char *dst, const char *src, size_t srclen) {
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
   // credit: Wojciech Muła
   uint8_t *out = (uint8_t *)dst;
   constexpr static uint8_t source_table[64] = {
@@ -36,8 +37,16 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
       '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
       'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
   };
+  constexpr static uint8_t source_table_url[64] = {
+      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
+      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
+      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
+      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
+      'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
+  }; // base64url alphabet ('-', '_'), stored 4-way interleaved for vld4q_u8
   const uint8x16_t v3f = vdupq_n_u8(0x3f);
-  const uint8x16x4_t table = vld4q_u8(source_table);
+  const uint8x16x4_t table =
+      vld4q_u8((options & base64_url) ? source_table_url : source_table);
   size_t i = 0;
   for (; i + 16 * 3 <= srclen; i += 16 * 3) {
     const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
@@ -55,7 +64,8 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     vst4q_u8(out, result);
     out += 64;
   }
-  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i,
+                                            options);
 
   return size_t((char *)out - dst);
 }
@@ -94,9 +104,22 @@ struct block64 {
   uint8x16_t chunks[4];
 };
 static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
-uint64_t to_base64_mask(block64 *b, bool *error) {
+template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t v0f = vdupq_n_u8(0xf);
 
+  uint8x16_t underscore0, underscore1, underscore2, underscore3;
+  if (base64_url) {
+    underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f));
+    underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f));
+    underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f));
+    underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f));
+  } else {
+    (void)underscore0;
+    (void)underscore1;
+    (void)underscore2;
+    (void)underscore3;
+  }
+
   uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
   uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
   uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
@@ -106,31 +129,62 @@ uint64_t to_base64_mask(block64 *b, bool *error) {
   uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
   uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4);
   uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4);
+  uint8x16_t lut_lo;
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t lut_lo =
-      simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                              0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4);
+  if (base64_url) {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xf4, 0xf4, 0xa5, 0xf4, 0xf4);
+  } else {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4);
+  }
 #else
-  const uint8x16_t lut_lo = {0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                             0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4};
+  if (base64_url) {
+    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+              0x70, 0x61, 0xe1, 0xf4, 0xf4, 0xa5, 0xf4, 0xf4};
+  } else {
+    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+              0x70, 0x61, 0xe1, 0xb4, 0xf4, 0xe5, 0xf4, 0xb4};
+  }
 #endif
   uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
   uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
   uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
   uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
+  uint8x16_t lut_hi; // NOTE: every branch below assigns the same 16 values
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t lut_hi =
-      simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, 0x20,
-                              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  if (base64_url) {
+    lut_hi =
+        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  } else { // same table as the base64url branch
+    lut_hi =
+        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  }
 #else
-  const uint8x16_t lut_hi = {0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
-                             0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  if (base64_url) {
+    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  } else { // same table as the base64url branch
+    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+              0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  }
 #endif
   uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
   uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
   uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2);
   uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3);
 
+  if (base64_url) {
+    hi0 = vbicq_u8(hi0, underscore0);
+    hi1 = vbicq_u8(hi1, underscore1);
+    hi2 = vbicq_u8(hi2, underscore2);
+    hi3 = vbicq_u8(hi3, underscore3);
+  }
+
   uint8_t checks =
       vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)),
                          vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3))));
@@ -161,23 +215,41 @@ uint64_t to_base64_mask(block64 *b, bool *error) {
   }
   // This is the transformation step that can be done while we are waiting for
   // sum0
+  uint8x16_t roll_lut;
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t roll_lut =
-      simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, 0x0,
-                              0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  if (base64_url) {
+    roll_lut =
+        simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  } else {
+    roll_lut =
+        simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  }
 #else
-  const uint8x16_t roll_lut = {0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
-                               0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  if (base64_url) {
+    roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                0x0,  0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  } else {
+    roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  }
 #endif
-  uint8x16_t v2f = vdupq_n_u8(0x2f);
-  uint8x16_t roll0 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], v2f), hi_nibbles0));
-  uint8x16_t roll1 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], v2f), hi_nibbles1));
-  uint8x16_t roll2 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], v2f), hi_nibbles2));
-  uint8x16_t roll3 =
-      vqtbl1q_u8(roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], v2f), hi_nibbles3));
+  uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f);
+  if (base64_url) {
+    hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0);
+    hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1);
+    hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2);
+    hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3);
+  }
+  uint8x16_t roll0 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0));
+  uint8x16_t roll1 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1));
+  uint8x16_t roll2 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2));
+  uint8x16_t roll3 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3));
   b->chunks[0] = vaddq_u8(b->chunks[0], roll0);
   b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
   b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
@@ -203,6 +275,8 @@ uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   return offsets >> 56;
 }
 
+// The caller must ensure that at least 64 bytes are readable starting at src.
+// The data is loaded into a block64 structure.
 void load_block(block64 *b, const char *src) {
   b->chunks[0] = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
   b->chunks[1] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 16);
@@ -210,6 +284,23 @@ void load_block(block64 *b, const char *src) {
   b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
 }
 
+// Reads sixteen 16-bit words (32 bytes, which the caller must guarantee are
+// readable at data) and narrows them to 16 bytes with unsigned saturation.
+inline uint8x16_t load_satured(const uint16_t *data) {
+  const uint16x8_t low_words = vld1q_u16(data);
+  const uint16x8_t high_words = vld1q_u16(data + 8);
+  return vqmovn_high_u16(vqmovn_u16(low_words), high_words);
+}
+
+// Fills b with 64 UTF-16 code units read from src, each narrowed to a single
+// byte by load_satured. The caller must guarantee 128 readable bytes at src.
+void load_block(block64 *b, const char16_t *src) {
+  const uint16_t *words = reinterpret_cast<const uint16_t *>(src);
+  for (size_t k = 0; k < 4; k++) {
+    b->chunks[k] = load_satured(words + 16 * k);
+  }
+}
+
 // decode 64 bytes and output 48 bytes
 void base64_decode_block(char *out, const char *src) {
   uint8x16x4_t str = vld4q_u8((uint8_t *)src);
@@ -222,7 +313,11 @@ void base64_decode_block(char *out, const char *src) {
   vst3q_u8((uint8_t *)out, outvec);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <bool base64_url, typename char_type>
+result compress_decode_base64(char *dst, const char_type *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -232,26 +327,26 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
       equalsigns = 2;
     }
   }
-  const char *const srcinit = src;
+  const char_type *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const char_type *const srcend = src + srclen;
 
   constexpr size_t block_size = 10;
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const char_type *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
-      if (error) {
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+      // The error flag is only examined when some character was flagged.
+      if (badcharmask && error) {
         src -= 64;
 
-        while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -288,7 +383,7 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
   int last_block = (int)((bufferptr - buffer_start) % 64);
   if (last_block != 0 && srcend - src + last_block >= 64) {
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -332,7 +427,7 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -373,7 +468,8 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/arm64/implementation.cpp b/src/arm64/implementation.cpp
index ff02797f8..e0a35f071 100644
--- a/src/arm64/implementation.cpp
+++ b/src/arm64/implementation.cpp
@@ -839,16 +839,24 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return encode_base64(output, input, length, options);
 }
 
 
diff --git a/src/fallback/implementation.cpp b/src/fallback/implementation.cpp
index 8bf24a1fc..f7c7d9321 100644
--- a/src/fallback/implementation.cpp
+++ b/src/fallback/implementation.cpp
@@ -349,7 +349,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -359,15 +359,33 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
+}
+
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::tail_encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::tail_encode_base64(output, input, length, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/haswell/avx2_base64.cpp b/src/haswell/avx2_base64.cpp
index 870d36f6f..187df475a 100644
--- a/src/haswell/avx2_base64.cpp
+++ b/src/haswell/avx2_base64.cpp
@@ -26,23 +26,35 @@
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
 
-__m256i lookup_pshufb_improved(const __m256i input) {
+template <bool base64_url>
+simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
   // credit: Wojciech Muła
   __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
   const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
   result =
       _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
-  const __m256i shift_LUT = _mm256_setr_epi8(
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
-
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  __m256i shift_LUT;
+  if (base64_url) {
+    shift_LUT = _mm256_setr_epi8(
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
+
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+  } else {
+    shift_LUT = _mm256_setr_epi8(
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
+
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  }
 
   result = _mm256_shuffle_epi8(shift_LUT, result);
   return _mm256_add_epi8(result, input);
 }
 
+template <base64_options options>
 size_t encode_base64(char *dst, const char *src, size_t srclen) {
   // credit: Wojciech Muła
   const uint8_t *input = (const uint8_t *)src;
@@ -110,18 +122,18 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input0));
+                        lookup_pshufb_improved<options == base64_url>(input0));
     out += 32;
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input1));
+                        lookup_pshufb_improved<options == base64_url>(input1));
     out += 32;
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input2));
+                        lookup_pshufb_improved<options == base64_url>(input2));
     out += 32;
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(input3));
+                        lookup_pshufb_improved<options == base64_url>(input3));
     out += 32;
   }
   for (; i + 28 <= srclen; i += 24) {
@@ -145,11 +157,11 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m256i indices = _mm256_or_si256(t1, t3);
 
     _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved(indices));
+                        lookup_pshufb_improved<options == base64_url>(indices));
     out += 32;
   }
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 
 static inline void compress(__m128i data, uint16_t mask, char *output) {
@@ -200,43 +212,83 @@ struct block64 {
   __m256i chunks[2];
 };
 
+template <bool base64_url>
 static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   const __m256i ascii_space_tbl =
       _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
                        0x0, 0x0, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
                        0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0x0, 0xd, 0x0, 0x0);
   // credit: aqrit
-  const __m256i delta_asso = _mm256_setr_epi8(
-      0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
-      0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-      0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
-  const __m256i delta_values = _mm256_setr_epi8(
-      int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
-      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
-      int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
-      int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
-      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
-      int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
-      int8_t(0xB9), int8_t(0xB9));
-  const __m256i check_asso = _mm256_setr_epi8(
-      0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
-      0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-      0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
-  const __m256i check_values = _mm256_setr_epi8(
-      int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
-      int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
-      int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
-      int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
-      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
-      int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
-      int8_t(0x91), int8_t(0x80));
-  const __m256i shifted = _mm256_srli_epi32(*src, 3);
+  __m256i delta_asso;
+  if (base64_url) {
+    delta_asso =
+        _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
+                         0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+  } else {
+    delta_asso = _mm256_setr_epi8(
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  }
 
+  __m256i delta_values;
+  if (base64_url) {
+    delta_values = _mm256_setr_epi8(
+        0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
+        uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
+        uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
+        uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
+  } else {
+    delta_values = _mm256_setr_epi8(
+        int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
+        int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
+        int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+        int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+        int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
+        int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
+        int8_t(0xB9), int8_t(0xB9));
+  }
+  __m256i check_asso;
+
+  if (base64_url) {
+    check_asso =
+        _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
+                         0x7, 0xB, 0x6, 0xB, 0x12, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0x6, 0xB, 0x12);
+  } else {
+
+    check_asso = _mm256_setr_epi8(
+        0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+        0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  }
+  __m256i check_values;
+  if (base64_url) {
+    check_values = _mm256_setr_epi8(
+        0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0xCF),
+        uint8_t(0xBF), uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5),
+        uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80), uint8_t(0xB0),
+        uint8_t(0x80), 0x0, 0x0, 0x0, uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xD3),
+        uint8_t(0xA6), uint8_t(0xB5), uint8_t(0x86), uint8_t(0xD0),
+        uint8_t(0x80), uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0);
+  } else {
+    check_values = _mm256_setr_epi8(
+        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
+        int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
+        int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
+        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+        int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
+        int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
+        int8_t(0x91), int8_t(0x80));
+  }
+  const __m256i shifted = _mm256_srli_epi32(*src, 3);
   const __m256i delta_hash =
       _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
   const __m256i check_hash =
       _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
-
   const __m256i out =
       _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
   const __m256i chk =
@@ -250,10 +302,12 @@ static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
   *src = out;
   return (uint32_t)mask;
 }
+
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   *error = 0;
-  uint64_t m0 = to_base64_mask(&b->chunks[0], error);
-  uint64_t m1 = to_base64_mask(&b->chunks[1], error);
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
   return m0 | (m1 << 32);
 }
 
@@ -270,12 +324,29 @@ static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   return _mm_popcnt_u64(nmask);
 }
 
+// The caller of this function is responsible for ensuring that there are 64 bytes available
+// for reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char *src) {
   b->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
   b->chunks[1] =
       _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
 }
 
+// Loads 64 char16_t code units (128 bytes) from src and narrows them, in order, to the 64 bytes
+// of a block64. The caller must ensure that 128 bytes are available for reading at src.
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)); // units 0..15
+  __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16)); // units 16..31
+  __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)); // units 32..47
+  __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48)); // units 48..63
+  __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20); // low 128-bit halves of m1 and m2
+  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31); // high 128-bit halves of m1 and m2
+  __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20); // low 128-bit halves of m3 and m4
+  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31); // high 128-bit halves of m3 and m4
+  b->chunks[0] = _mm256_packus_epi16(m1p, m2p); // per-lane pack with unsigned saturation: units 0..31 in order
+  b->chunks[1] = _mm256_packus_epi16(m3p, m4p); // same for units 32..63
+}
+
 static inline void base64_decode(char *out, __m256i str) {
   // credit: aqrit
   const __m256i pack_shuffle =
@@ -315,7 +386,11 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 24, buffer, 24);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <bool base64_url, typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -328,26 +403,25 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
   char *end_of_safe_64byte_zone =
       (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
 
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   constexpr size_t block_size = 6;
   static_assert(block_size >= 2, "block_size must be at least two");
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -393,7 +467,7 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
   if (last_block != 0 && srcend - src + last_block >= 64) {
 
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -441,7 +515,7 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -481,7 +555,8 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/haswell/implementation.cpp b/src/haswell/implementation.cpp
index 78d00a6ab..f11325864 100644
--- a/src/haswell/implementation.cpp
+++ b/src/haswell/implementation.cpp
@@ -782,16 +782,28 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  if(options & base64_url) {
+    return encode_base64<base64_url>(output, input, length);
+  } else {
+    return encode_base64<base64_default>(output, input, length);
+  }
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/icelake/icelake_base64.inl.cpp b/src/icelake/icelake_base64.inl.cpp
index 74ea110a4..312a0813c 100644
--- a/src/icelake/icelake_base64.inl.cpp
+++ b/src/icelake/icelake_base64.inl.cpp
@@ -31,14 +31,17 @@ struct block64 {
   __m512i chunks[1];
 };
 
-size_t encode_base64(char *dst, const char *src, size_t srclen) {
+template <bool base64_url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
   // credit: Wojciech Muła
-
   const uint8_t *input = (const uint8_t *)src;
 
   uint8_t *out = (uint8_t *)dst;
   static const char *lookup_tbl =
-      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+      base64_url
+          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
   const __m512i shuffle_input = _mm512_setr_epi32(
       0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
@@ -57,27 +60,48 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
     out += 64;
   }
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   __m512i input = b->chunks[0];
   const __m512i ascii_space_tbl = _mm512_set_epi8(
       0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 0, 0, 10, 9,
       0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0,
       32, 0, 0, 13, 0, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
-  __m512i lookup0 = _mm512_set_epi8(
-      -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
-      52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
-      -128, -128, -128, -64, -128, -128, -128, -128, -128, -128, -128, -128,
-      -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -64, -128,
-      -128, -64, -64, -128, -128, -128, -128, -128, -128, -128, -128, -64);
-  __m512i lookup1 = _mm512_set_epi8(
-      -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41,
-      40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, -128,
-      -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14,
-      13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  __m512i lookup0;
+  if (base64_url) {
+    lookup0 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
+        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
+  } else {
+    lookup0 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
+        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
+  }
+  __m512i lookup1;
+  if (base64_url) {
+    lookup1 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  } else {
+    lookup1 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  }
+
   const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
   const __m512i combined = _mm512_or_si512(translated, input);
   const __mmask64 mask = _mm512_movepi8_mask(combined);
@@ -102,10 +126,22 @@ static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   return _mm_popcnt_u64(nmask);
 }
 
+// The caller of this function is responsible for ensuring that there are 64 bytes available
+// for reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char *src) {
   b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
 }
 
+// Loads 64 char16_t code units (128 bytes) from src and narrows them, in order, to the 64 bytes
+// of a block64. The caller must ensure that 128 bytes are available for reading at src.
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)); // units 0..31
+  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32)); // units 32..63
+  __m512i p = _mm512_packus_epi16(m1, m2); // packs per 128-bit lane, so 64-bit groups from m1 and m2 alternate
+  b->chunks[0] = // reorder: even 64-bit groups (from m1) first, then odd ones (from m2)
+      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
+}
+
 static inline void base64_decode(char *out, __m512i str) {
   const __m512i merge_ab_and_bc =
       _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
@@ -130,7 +166,11 @@ static inline void base64_decode_block(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <bool base64_url, typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -140,26 +180,25 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
       equalsigns = 2;
     }
   }
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   // figure out why block_size == 2 is sometimes best???
   constexpr size_t block_size = 6;
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+        while (src < srcend && to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -195,7 +234,7 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
   if (last_block != 0 && srcend - src + last_block >= 64) {
 
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -237,7 +276,7 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        uint8_t val = to_base64[uint8_t(*src)];
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -278,7 +317,8 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/src/icelake/implementation.cpp b/src/icelake/implementation.cpp
index 035c77a50..356159808 100644
--- a/src/icelake/implementation.cpp
+++ b/src/icelake/implementation.cpp
@@ -1368,16 +1368,29 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
+}
+
+
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  if(options & base64_url) {
+    return encode_base64<true>(output, input, length, options);
+  } else {
+    return encode_base64<false>(output, input, length, options);
+  }
 }
 
 } // namespace SIMDUTF_IMPLEMENTATION
diff --git a/src/implementation.cpp b/src/implementation.cpp
index bd76c4075..e6e5ecc48 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -1,6 +1,7 @@
 #include "simdutf.h"
 #include <initializer_list>
 #include <climits>
+#include <type_traits>
 
 // Useful for debugging purposes
 namespace simdutf {
@@ -31,6 +32,8 @@ std::string toBinaryString(T b) {
 
 #include "scalar/utf8.h"
 #include "scalar/utf16.h"
+#include "scalar/utf32.h"
+#include "scalar/base64.h"
 
 namespace simdutf {
 bool implementation::supported_by_runtime_system() const {
@@ -456,16 +459,24 @@ class detect_best_supported_implementation_on_first_use final : public implement
     return set_best()->maximal_binary_length_from_base64(input, length);
   }
 
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept override {
-    return set_best()->base64_to_binary(input, length, output);
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output, options);
+  }
+
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept override {
+    return set_best()->maximal_binary_length_from_base64(input, length);
+  }
+
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output, options);
   }
 
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept override {
     return set_best()->base64_length_from_binary(length);
   }
 
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept override {
-    return set_best()->binary_to_base64(input, length, output);
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept override {
+    return set_best()->binary_to_base64(input, length, output, options);
   }
 
   simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
@@ -812,15 +823,24 @@ class unsupported_implementation final : public implementation {
     return 0;
   }
 
-  simdutf_warn_unused result base64_to_binary(const char *, size_t, char*) const noexcept override {
+  simdutf_warn_unused result base64_to_binary(const char *, size_t, char*, base64_options) const noexcept override {
     return result(error_code::OTHER, 0);
   }
 
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused result base64_to_binary(const char16_t *, size_t, char*, base64_options) const noexcept override {
+    return result(error_code::OTHER, 0);
+  }
+
+
   simdutf_warn_unused size_t base64_length_from_binary(size_t) const noexcept override {
     return 0;
   }
 
-  size_t binary_to_base64(const char *, size_t, char*) const noexcept override {
+  size_t binary_to_base64(const char *, size_t, char*, base64_options) const noexcept override {
     return 0;
   }
 
@@ -1270,16 +1290,81 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
   return get_default_implementation()->maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept {
-  return get_default_implementation()->base64_to_binary(input, length, output);
+simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output, options);
+}
+
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept {
+  return get_default_implementation()->maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) noexcept {
+  return get_default_implementation()->base64_to_binary(input, length, output, options);
+}
+
+template <typename chartype>
+simdutf_warn_unused result base64_to_binary_safe_impl(const chartype * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
+  static_assert(std::is_same<chartype, char>::value || std::is_same<chartype, char16_t>::value, "Only char and char16_t are supported.");
+  // Decodes base64 `input` into `output`, treating `outlen` as the capacity of `output` on entry.
+  // Most callers are expected to pass a large enough buffer, so the slow path below favors simplicity.
+  size_t max_length = maximal_binary_length_from_base64(input, length);
+  if(outlen >= max_length) {
+    // Fast path: the buffer can hold max_length bytes, so the whole input is decoded in one call.
+    result r = base64_to_binary(input, length, output, options);
+    if(r.error != error_code::INVALID_BASE64_CHARACTER) { outlen = r.count; r.count = length; } // outlen takes r.count; r.count is overwritten with length
+    return r;
+  }
+  // The output buffer may be too small: first decode only a prefix of the input, sized from outlen.
+  size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
+  size_t safe_input = base64_length_from_binary(outlen3); // prefix length, in input characters
+  result r = base64_to_binary(input, safe_input, output, options);
+  if(r.error == error_code::INVALID_BASE64_CHARACTER) { return r; } // returned as is; outlen is left untouched
+  size_t offset = (r.error == error_code::BASE64_INPUT_REMAINDER) ? 1 : // number of characters to step back over
+    ((r.count % 3) == 0 ? 0 : (r.count % 3) + 1);
+  size_t output_index = r.count - (r.count % 3); // keep only the whole 3-byte groups already written
+  size_t input_index = safe_input;
+  // offset is a value that is no larger than 3. We backtrack
+  // by up to offset characters + an undetermined number of
+  // white space characters. It is expected that the next loop
+  // runs at most 3 times + the number of white space characters
+  // in between them, so we are not worried about performance.
+  while(offset > 0 && input_index > 0) {
+    chartype c = input[--input_index];
+    if(c == '=' || c == '\n' || c == '\r' || c == '\t' || c == ' ') {
+      // padding or white space: stepped over without consuming offset
+    } else {
+      offset--;
+    }
+  }
+  size_t remaining_out = outlen - output_index; // capacity left after the kept prefix
+  const chartype * tail_input = input + input_index;
+  size_t tail_length = length - input_index;
+  if(tail_length > 0 && tail_input[tail_length - 1] == '=') { // strip up to two trailing '=' from the tail
+    tail_length--;
+    if(tail_length > 0 && tail_input[tail_length - 1] == '=') {
+      tail_length--;
+    }
+  }
+  r = scalar::base64::base64_tail_decode_safe(output + output_index, remaining_out, tail_input, tail_length, options); // remaining_out is passed by reference
+  outlen = output_index + remaining_out; // kept prefix plus what the tail decoder left in remaining_out
+  r.count += input_index; // add back the input characters that precede tail_input
+  return r;
+}
+
+
+simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
+  return base64_to_binary_safe_impl<char>(input, length, output, outlen, options);
+}
+simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options) noexcept {
+  return base64_to_binary_safe_impl<char16_t>(input, length, output, outlen, options);
 }
 
 simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept {
   return get_default_implementation()->base64_length_from_binary(length);
 }
 
-size_t binary_to_base64(const char * input, size_t length, char* output) noexcept {
-  return get_default_implementation()->binary_to_base64(input, length, output);
+size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) noexcept {
+  return get_default_implementation()->binary_to_base64(input, length, output, options);
 }
 
 simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {
diff --git a/src/ppc64/implementation.cpp b/src/ppc64/implementation.cpp
index 8390e01a3..f33444d41 100644
--- a/src/ppc64/implementation.cpp
+++ b/src/ppc64/implementation.cpp
@@ -298,16 +298,46 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::base64_to_binary(input, length, output);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  // Drop up to two trailing '=' padding characters before decoding.
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length, options);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  // Same steps as the char overload above: drop up to two trailing '=' padding
+  // characters, then hand the rest to the scalar tail decoder.
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    // Nothing is left to decode once the padding is removed.
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::binary_to_base64(input, length, output);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::binary_to_base64(input, length, output, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/rvv/implementation.cpp b/src/rvv/implementation.cpp
index 63f1283c1..7b4ecf96b 100644
--- a/src/rvv/implementation.cpp
+++ b/src/rvv/implementation.cpp
@@ -82,7 +82,7 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
   if(length > 0 && input[length - 1] == '=') {
     length -= 1;
     if(length > 0 && input[length - 1] == '=') {
@@ -92,15 +92,33 @@ simdutf_warn_unused result implementation::base64_to_binary(const char * input,
   if(length == 0) {
     return {SUCCESS, 0};
   }
-  return scalar::base64::base64_tail_decode(output, input, length);
+  return scalar::base64::base64_tail_decode(output, input, length, options);
+}
+
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  if(length > 0 && input[length - 1] == '=') {
+    length -= 1;
+    if(length > 0 && input[length - 1] == '=') {
+      length -= 1;
+    }
+  }
+  if(length == 0) {
+    return {SUCCESS, 0};
+  }
+  return scalar::base64::base64_tail_decode(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return scalar::base64::tail_encode_base64(output, input, length);
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return scalar::base64::tail_encode_base64(output, input, length, options);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/scalar/base64.h b/src/scalar/base64.h
index ff7368314..427154f96 100644
--- a/src/scalar/base64.h
+++ b/src/scalar/base64.h
@@ -9,12 +9,19 @@ namespace scalar {
 namespace {
 namespace base64 {
 
-// Returns true upon success. The destination buffer must be large enough and is
-// incremented by the number of bytes written and src is incremented by the number of bytes read.
+// Returns a result (error code and count), not a bool. The destination buffer must be large enough.
 // This functions assumes that the padding (=) has been removed.
-result base64_tail_decode(char *dst, const char *src, size_t length) {
-  const char *srcend = src + length;
-  const char *srcinit = src;
+template <class char_type>
+result base64_tail_decode(char *dst, const char_type *src, size_t length, base64_options options) {
+  // This looks like 5 branches, but we expect the compiler to resolve this to a single branch:
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url) ? tables::base64::base64_url::d2 : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3;
+
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
   const char *dstinit = dst;
 
   uint32_t x;
@@ -22,8 +29,8 @@ result base64_tail_decode(char *dst, const char *src, size_t length) {
   uint8_t buffer[4];
   while (true) {
     while (src + 4 <= srcend &&
-           (x = tables::base64::d0[uint8_t(src[0])] | tables::base64::d1[uint8_t(src[1])] |
-                tables::base64::d2[uint8_t(src[2])] | tables::base64::d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
       if(match_system(endianness::BIG)) {
         x = scalar::utf32::swap_bytes(x);
       }
@@ -34,13 +41,15 @@ result base64_tail_decode(char *dst, const char *src, size_t length) {
     idx = 0;
     // we need at least four characters.
     while (idx < 4 && src < srcend) {
-      char c = *src;
-      uint8_t code = tables::base64::to_base64_value[uint8_t(c)];
+      char_type c = *src;
+      uint8_t code = to_base64[uint8_t(c)];
       buffer[idx] = uint8_t(code);
       if (code <= 63) {
         idx++;
       } else if (code > 64) {
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
       }
       src++;
     }
@@ -92,43 +101,162 @@ result base64_tail_decode(char *dst, const char *src, size_t length) {
   }
 }
 
+// Like base64_tail_decode, but it never writes past the end of the output buffer.
+// On entry outlen is the capacity of dst; on return it is the number of bytes written.
+// This function assumes that the padding (=) has been removed.
+template <class char_type>
+result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src, size_t length, base64_options options) {
+  // This looks like 5 branches, but we expect the compiler to resolve this to a single branch:
+  const uint8_t *to_base64 = (options & base64_url) ? tables::base64::to_base64_url_value : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url) ? tables::base64::base64_url::d0 : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url) ? tables::base64::base64_url::d1 : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url) ? tables::base64::base64_url::d2 : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url) ? tables::base64::base64_url::d3 : tables::base64::base64_default::d3;
+
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
+  const char *dstinit = dst;
+  const char *dstend = dst + outlen;
+
+  uint32_t x;
+  size_t idx;
+  uint8_t buffer[4];
+  while (true) {
+    while (src + 4 <= srcend &&
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+      if(match_system(endianness::BIG)) {
+        x = scalar::utf32::swap_bytes(x);
+      }
+      if(dst + 3 > dstend) {
+        outlen = size_t(dst - dstinit);
+        return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+      }
+      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
+      dst += 3;
+      src += 4;
+    }
+    idx = 0;
+    const char_type *srccur = src;
+
+    // we need at least four characters.
+    while (idx < 4 && src < srcend) {
+      char_type c = *src;
+      uint8_t code = to_base64[uint8_t(c)];
+      buffer[idx] = uint8_t(code);
+      if (code <= 63) {
+        idx++;
+      } else if (code > 64) {
+        outlen = size_t(dst - dstinit);
+        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
+      }
+      src++;
+    }
+    if (idx != 4) {
+      if (idx == 2) {
+        if(dst == dstend) {
+          outlen = size_t(dst - dstinit);
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
+        }
+        uint32_t triple =
+            (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
+        if(match_system(endianness::BIG)) {
+          triple <<= 8;
+          std::memcpy(dst, &triple, 1);
+        } else {
+          triple = scalar::utf32::swap_bytes(triple);
+          triple >>= 8;
+          std::memcpy(dst, &triple, 1);
+        }
+        dst += 1;
+
+      } else if (idx == 3) {
+        if(dst + 2 > dstend) { // exactly two bytes are written below
+          outlen = size_t(dst - dstinit);
+          return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
+        }
+        uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
+                          (uint32_t(buffer[1]) << 2 * 6) +
+                          (uint32_t(buffer[2]) << 1 * 6);
+        if(match_system(endianness::BIG)) {
+          triple <<= 8;
+          std::memcpy(dst, &triple, 2);
+        } else {
+          triple = scalar::utf32::swap_bytes(triple);
+          triple >>= 8;
+          std::memcpy(dst, &triple, 2);
+        }
+        dst += 2;
+      } else if (idx == 1) {
+        outlen = size_t(dst - dstinit);
+        return {BASE64_INPUT_REMAINDER, size_t(dst - dstinit)};
+      }
+      outlen = size_t(dst - dstinit);
+      return {SUCCESS, size_t(dst - dstinit)};
+    }
+    if(dst + 3 > dstend) { // exactly three bytes are written below
+      outlen = size_t(dst - dstinit);
+      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
+    }
+    uint32_t triple =
+        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
+        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
+    if(match_system(endianness::BIG)) {
+      triple <<= 8;
+      std::memcpy(dst, &triple, 3);
+    } else {
+      triple = scalar::utf32::swap_bytes(triple);
+      triple >>= 8;
+      std::memcpy(dst, &triple, 3);
+    }
+    dst += 3;
+  }
+}
+
 // Returns the number of bytes written. The destination buffer must be large
 // enough. It will add padding (=) if needed.
-size_t tail_encode_base64(char *dst, const char *src, size_t srclen) {
+size_t tail_encode_base64(char *dst, const char *src, size_t srclen, base64_options options) {
+  // This looks like 3 branches, but we expect the compiler to resolve this to a single branch:
+  const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0 : tables::base64::base64_default::e0;
+  const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1 : tables::base64::base64_default::e1;
+  const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2 : tables::base64::base64_default::e2;
   char *out = dst;
   size_t i = 0;
   uint8_t t1, t2, t3;
   for (; i + 2 < srclen; i += 3) {
-    t1 = (uint8_t)src[i];
-    t2 = (uint8_t)src[i + 1];
-    t3 = (uint8_t)src[i + 2];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
-    *out++ = tables::base64::e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
-    *out++ = tables::base64::e2[t3];
+    t1 = uint8_t(src[i]);
+    t2 = uint8_t(src[i + 1]);
+    t3 = uint8_t(src[i + 2]);
+    *out++ = e0[t1];
+    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
+    *out++ = e2[t3];
   }
   switch (srclen - i) {
   case 0:
     break;
   case 1:
-    t1 = (uint8_t)src[i];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[(t1 & 0x03) << 4];
+    t1 = uint8_t(src[i]);
+    *out++ = e0[t1];
+    *out++ = e1[(t1 & 0x03) << 4];
     *out++ = '=';
     *out++ = '=';
     break;
   default: /* case 2 */
-    t1 = (uint8_t)src[i];
-    t2 = (uint8_t)src[i + 1];
-    *out++ = tables::base64::e0[t1];
-    *out++ = tables::base64::e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
-    *out++ = tables::base64::e2[(t2 & 0x0F) << 2];
+    t1 = uint8_t(src[i]);
+    t2 = uint8_t(src[i + 1]);
+    *out++ = e0[t1];
+    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = e2[(t2 & 0x0F) << 2];
     *out++ = '=';
   }
   return (size_t)(out - dst);
 }
 
-simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept {
+template <class char_type>
+simdutf_warn_unused size_t maximal_binary_length_from_base64(const char_type * input, size_t length) noexcept {
   // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode
   size_t padding = 0;
   if(length > 0) {
@@ -140,7 +268,7 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
     }
   }
   size_t actual_length = length - padding;
-  if(actual_length % 4 == 0) {
+  if(actual_length % 4 <= 1) {
     return actual_length / 4 * 3;
   }
   // if we have a valid input, then the remainder must be 2 or 3 adding one or two extra bytes.
diff --git a/src/simdutf.cpp b/src/simdutf.cpp
index 26ca712dd..fa889290c 100644
--- a/src/simdutf.cpp
+++ b/src/simdutf.cpp
@@ -1,10 +1,11 @@
 #include "simdutf.h"
+// We include base64_tables once.
+#include "tables/base64_tables.h"
 #include "implementation.cpp"
 #include "encoding_types.cpp"
 #include "error.cpp"
 // The large tables should be included once and they
 // should not depend on a kernel.
-#include "tables/base64_tables.h"
 #include "tables/utf8_to_utf16_tables.h"
 #include "tables/utf16_to_utf8_tables.h"
 // End of tables.
diff --git a/src/simdutf/arm64/implementation.h b/src/simdutf/arm64/implementation.h
index b686be9fe..1d6fbd423 100644
--- a/src/simdutf/arm64/implementation.h
+++ b/src/simdutf/arm64/implementation.h
@@ -90,9 +90,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace arm64
diff --git a/src/simdutf/arm64/simd16-inl.h b/src/simdutf/arm64/simd16-inl.h
index 66d1168b7..32734c0ab 100644
--- a/src/simdutf/arm64/simd16-inl.h
+++ b/src/simdutf/arm64/simd16-inl.h
@@ -156,7 +156,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
   simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
   simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
 
-  // Pack with the unsigned saturation  two uint16_t code units into single uint8_t vector
+  // Pack two vectors of uint16_t code units into a single uint8_t vector, using unsigned saturation
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     return vqmovn_high_u16(vqmovn_u16(v0), v1);
   }
diff --git a/src/simdutf/fallback/implementation.h b/src/simdutf/fallback/implementation.h
index 14d14cb42..40fdcc246 100644
--- a/src/simdutf/fallback/implementation.h
+++ b/src/simdutf/fallback/implementation.h
@@ -93,9 +93,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 } // namespace fallback
 } // namespace simdutf
diff --git a/src/simdutf/haswell/implementation.h b/src/simdutf/haswell/implementation.h
index c75e4a5e7..f3eb7e4db 100644
--- a/src/simdutf/haswell/implementation.h
+++ b/src/simdutf/haswell/implementation.h
@@ -92,9 +92,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused virtual result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
+  simdutf_warn_unused virtual size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused virtual result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused virtual size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace haswell
diff --git a/src/simdutf/haswell/simd16-inl.h b/src/simdutf/haswell/simd16-inl.h
index 04c1b7fe0..964ff4ebd 100644
--- a/src/simdutf/haswell/simd16-inl.h
+++ b/src/simdutf/haswell/simd16-inl.h
@@ -140,7 +140,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
     return _mm256_shuffle_epi8(*this, swap);
   }
 
-  // Pack with the unsigned saturation two uint16_t code units into single uint8_t vector
+  // Pack two vectors of uint16_t code units into a single uint8_t vector, using unsigned saturation
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
     //       we have to shuffle lanes in order to produce bytes in the
diff --git a/src/simdutf/icelake/implementation.h b/src/simdutf/icelake/implementation.h
index 175b34040..495a05a59 100644
--- a/src/simdutf/icelake/implementation.h
+++ b/src/simdutf/icelake/implementation.h
@@ -92,9 +92,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace icelake
diff --git a/src/simdutf/ppc64/implementation.h b/src/simdutf/ppc64/implementation.h
index f1df43a4c..ee0c7dcd4 100644
--- a/src/simdutf/ppc64/implementation.h
+++ b/src/simdutf/ppc64/implementation.h
@@ -70,9 +70,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
   simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace ppc64
diff --git a/src/simdutf/rvv/implementation.h b/src/simdutf/rvv/implementation.h
index f95dcf2ab..d4e668581 100644
--- a/src/simdutf/rvv/implementation.h
+++ b/src/simdutf/rvv/implementation.h
@@ -94,9 +94,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, size_t len) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 private:
   const bool _supports_zvbb;
 
diff --git a/src/simdutf/westmere/implementation.h b/src/simdutf/westmere/implementation.h
index 4d992a49b..d10dfb433 100644
--- a/src/simdutf/westmere/implementation.h
+++ b/src/simdutf/westmere/implementation.h
@@ -90,9 +90,11 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
   simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(size_t length) const noexcept;
-  size_t binary_to_base64(const char * input, size_t length, char* output) const noexcept;
+  size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept;
 };
 
 } // namespace westmere
diff --git a/src/simdutf/westmere/simd16-inl.h b/src/simdutf/westmere/simd16-inl.h
index bbcca0776..694d93d22 100644
--- a/src/simdutf/westmere/simd16-inl.h
+++ b/src/simdutf/westmere/simd16-inl.h
@@ -146,7 +146,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
     return _mm_shuffle_epi8(*this, swap);
   }
 
-  // Pack with the unsigned saturation  two uint16_t code units into single uint8_t vector
+  // Pack two vectors of uint16_t code units into a single uint8_t vector, using unsigned saturation
   static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
     return _mm_packus_epi16(v0, v1);
   }
diff --git a/src/tables/base64_tables.h b/src/tables/base64_tables.h
index a0f997733..f835f141b 100644
--- a/src/tables/base64_tables.h
+++ b/src/tables/base64_tables.h
@@ -7,6 +7,7 @@ namespace simdutf {
 namespace {
 namespace tables {
 namespace base64 {
+namespace base64_default {
 
 const char e0[256] = {
     'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
@@ -68,8 +69,6 @@ const char e2[256] = {
     'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+',
     '/'};
 
-/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN CPUS */
-
 const uint32_t d0[256] = {
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
@@ -249,6 +248,247 @@ const uint32_t d3[256] = {
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
     0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+} // namespace base64_default
+
+namespace base64_url {
+
+const char e0[256] = {
+    'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'D', 'D', 'D',
+    'D', 'E', 'E', 'E', 'E', 'F', 'F', 'F', 'F', 'G', 'G', 'G', 'G', 'H', 'H',
+    'H', 'H', 'I', 'I', 'I', 'I', 'J', 'J', 'J', 'J', 'K', 'K', 'K', 'K', 'L',
+    'L', 'L', 'L', 'M', 'M', 'M', 'M', 'N', 'N', 'N', 'N', 'O', 'O', 'O', 'O',
+    'P', 'P', 'P', 'P', 'Q', 'Q', 'Q', 'Q', 'R', 'R', 'R', 'R', 'S', 'S', 'S',
+    'S', 'T', 'T', 'T', 'T', 'U', 'U', 'U', 'U', 'V', 'V', 'V', 'V', 'W', 'W',
+    'W', 'W', 'X', 'X', 'X', 'X', 'Y', 'Y', 'Y', 'Y', 'Z', 'Z', 'Z', 'Z', 'a',
+    'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd',
+    'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'h', 'h', 'h',
+    'h', 'i', 'i', 'i', 'i', 'j', 'j', 'j', 'j', 'k', 'k', 'k', 'k', 'l', 'l',
+    'l', 'l', 'm', 'm', 'm', 'm', 'n', 'n', 'n', 'n', 'o', 'o', 'o', 'o', 'p',
+    'p', 'p', 'p', 'q', 'q', 'q', 'q', 'r', 'r', 'r', 'r', 's', 's', 's', 's',
+    't', 't', 't', 't', 'u', 'u', 'u', 'u', 'v', 'v', 'v', 'v', 'w', 'w', 'w',
+    'w', 'x', 'x', 'x', 'x', 'y', 'y', 'y', 'y', 'z', 'z', 'z', 'z', '0', '0',
+    '0', '0', '1', '1', '1', '1', '2', '2', '2', '2', '3', '3', '3', '3', '4',
+    '4', '4', '4', '5', '5', '5', '5', '6', '6', '6', '6', '7', '7', '7', '7',
+    '8', '8', '8', '8', '9', '9', '9', '9', '-', '-', '-', '-', '_', '_', '_',
+    '_'};
+
+const char e1[256] = {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
+    '_'};
+
+const char e2[256] = {
+    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd',
+    'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7',
+    '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3',
+    '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+    'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
+    'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
+    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_', 'A', 'B', 'C',
+    'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
+    'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+    'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-',
+    '_'};
+
+const uint32_t d0[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000f8, 0x01ffffff, 0x01ffffff,
+    0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
+    0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
+    0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
+    0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
+    0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
+    0x00000064, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x000000fc,
+    0x01ffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
+    0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
+    0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
+    0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
+    0x000000c4, 0x000000c8, 0x000000cc, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d1[256] = {
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000e003, 0x01ffffff, 0x01ffffff,
+    0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
+    0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
+    0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
+    0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
+    0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
+    0x00009001, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x0000f003,
+    0x01ffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
+    0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
+    0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
+    0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
+    0x00001003, 0x00002003, 0x00003003, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d2[256] = { // 3rd base64url char, pre-shifted; 0x01ffffff = not in alphabet
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00800f00, 0x01ffffff, 0x01ffffff,
+    0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
+    0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
+    0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
+    0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
+    0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
+    0x00400600, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00c00f00,
+    0x01ffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
+    0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
+    0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
+    0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
+    0x00400c00, 0x00800c00, 0x00c00c00, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+const uint32_t d3[256] = { // 4th base64url char, pre-shifted; 0x01ffffff = not in alphabet
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003e0000, 0x01ffffff, 0x01ffffff,
+    0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
+    0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x00000000,
+    0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
+    0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
+    0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
+    0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
+    0x00190000, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x003f0000,
+    0x01ffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
+    0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
+    0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
+    0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
+    0x00310000, 0x00320000, 0x00330000, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff,
+    0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff};
+} // namespace base64_url
 const uint64_t thintable_epi8[256] = {
     0x0706050403020100, 0x0007060504030201, 0x0007060504030200,
     0x0000070605040302, 0x0007060504030100, 0x0000070605040301,
@@ -388,6 +628,27 @@ const uint8_t to_base64_value[] = {
     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
     255};
+
+const uint8_t to_base64_url_value[] = { // base64url: 0-63 = value, 64 = whitespace, 255 = invalid
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    62,  255, 255, 52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 63,  255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+
 } // namespace base64
 } // namespace tables
 } // unnamed namespace
diff --git a/src/westmere/implementation.cpp b/src/westmere/implementation.cpp
index d428e0084..e95e5f331 100644
--- a/src/westmere/implementation.cpp
+++ b/src/westmere/implementation.cpp
@@ -783,16 +783,28 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(con
   return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output) const noexcept {
-  return compress_decode_base64(output, input, length);
+simdutf_warn_unused result implementation::base64_to_binary(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(const char16_t * input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options) const noexcept {
+  return (options & base64_url) ? compress_decode_base64<true>(output, input, length, options) : compress_decode_base64<false>(output, input, length, options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(size_t length) const noexcept {
   return scalar::base64::base64_length_from_binary(length);
 }
 
-size_t implementation::binary_to_base64(const char * input, size_t length, char* output) const noexcept {
-  return encode_base64(output, input, length);
+// Encodes with the base64url alphabet when the base64_url bit of options is set (same test as base64_to_binary).
+size_t implementation::binary_to_base64(const char * input, size_t length, char* output, base64_options options) const noexcept {
+  if (options & base64_url) {
+    return encode_base64<base64_url>(output, input, length);
+  }
+  return encode_base64<base64_default>(output, input, length);
 }
 } // namespace SIMDUTF_IMPLEMENTATION
 } // namespace simdutf
diff --git a/src/westmere/sse_base64.cpp b/src/westmere/sse_base64.cpp
index f2f4d7211..f8df6a830 100644
--- a/src/westmere/sse_base64.cpp
+++ b/src/westmere/sse_base64.cpp
@@ -25,8 +25,7 @@
  * Nick Kopp. 2013. Base64 Encoding on a GPU.
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
-
-__m128i lookup_pshufb_improved(const __m128i input) {
+template <bool base64_url> __m128i lookup_pshufb_improved(const __m128i input) {
   // credit: Wojciech Muła
   // reduce  0..51 -> 0
   //        52..61 -> 1 .. 10
@@ -40,9 +39,16 @@ __m128i lookup_pshufb_improved(const __m128i input) {
   const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input);
   result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));
 
-  const __m128i shift_LUT = _mm_setr_epi8(
-      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-      '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  // shift_LUT holds, for each reduced class, the offset that is added to the
+  // 6-bit input to produce its ASCII character. The two alphabets differ only
+  // in the characters standing for 62 and 63; base64_url is a template
+  // parameter, so both selections below are compile-time constants.
+  const char char62 = base64_url ? '-' : '+';
+  const char char63 = base64_url ? '_' : '/';
+  const __m128i shift_LUT = _mm_setr_epi8(
+      'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+      '0' - 52, '0' - 52, '0' - 52, '0' - 52, char62 - 62, char63 - 63, 'A', 0,
+      0);
 
   // read shift
   result = _mm_shuffle_epi8(shift_LUT, result);
@@ -50,6 +56,7 @@ __m128i lookup_pshufb_improved(const __m128i input) {
   return _mm_add_epi8(result, input);
 }
 
+template <base64_options options>
 size_t encode_base64(char *dst, const char *src, size_t srclen) {
   // credit: Wojciech Muła
   // SSE (lookup: pshufb improved unrolled)
@@ -101,19 +108,19 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m128i input3 = _mm_or_si128(t1_3, t3_3);
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input0));
+                     lookup_pshufb_improved<options & base64_url>(input0));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input1));
+                     lookup_pshufb_improved<options & base64_url>(input1));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input2));
+                     lookup_pshufb_improved<options & base64_url>(input2));
     out += 16;
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(input3));
+                     lookup_pshufb_improved<options & base64_url>(input3));
     out += 16;
   }
   for (; i + 16 <= srclen; i += 12) {
@@ -153,12 +160,12 @@ size_t encode_base64(char *dst, const char *src, size_t srclen) {
     const __m128i indices = _mm_or_si128(t1, t3);
 
     _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved(indices));
+                     lookup_pshufb_improved<options & base64_url>(indices));
     out += 16;
   }
 
-  return i / 3 * 4 +
-         scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 static inline void compress(__m128i data, uint16_t mask, char *output) {
   if (mask == 0) {
@@ -198,27 +205,59 @@ struct block64 {
   __m128i chunks[4];
 };
 
+template <bool base64_url>
 static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   const __m128i ascii_space_tbl =
       _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
                     0x0, 0xd, 0x0, 0x0);
   // credit: aqrit
-  const __m128i delta_asso =
-      _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
-                    0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
-  const __m128i delta_values =
-      _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
-                    int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
-                    int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
-                    int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
-  const __m128i check_asso =
-      _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                    0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
-  const __m128i check_values =
-      _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
-                    int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
-                    int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
-                    int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
+  __m128i delta_asso;
+  if (base64_url) {
+    delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
+                               0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+  } else {
+    // Same sixteen values as the base64url branch above.
+    delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                               0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  }
+  __m128i delta_values;
+  if (base64_url) {
+    delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
+                                 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF),
+                                 uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
+  } else {
+    // Differs from the base64url table only at indices 10 and 13.
+    delta_values =
+        _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+                      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+                      int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+                      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
+  }
+  __m128i check_asso;
+  if (base64_url) {
+    check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                               0x3, 0x7, 0xB, 0x6, 0xB, 0x12);
+  } else {
+    // Differs from the base64url table only at indices 13 and 15.
+    check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                               0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  }
+  __m128i check_values;
+  if (base64_url) {
+    check_values = _mm_setr_epi8(0x0, uint8_t(0x80), uint8_t(0x80),
+                                 uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
+                                 uint8_t(0xD3), uint8_t(0xA6), uint8_t(0xB5),
+                                 uint8_t(0x86), uint8_t(0xD0), uint8_t(0x80),
+                                 uint8_t(0xB0), uint8_t(0x80), 0x0, 0x0);
+  } else {
+    // Differs from the base64url table at indices 0, 6, 10, 12, 14 and 15.
+    check_values =
+        _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+                      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+                      int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+                      int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
+  }
   const __m128i shifted = _mm_srli_epi32(*src, 3);
 
   const __m128i delta_hash =
@@ -239,12 +278,14 @@ static inline uint16_t to_base64_mask(__m128i *src, bool *error) {
   *src = out;
   return (uint16_t)mask;
 }
+
+template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, bool *error) {
   *error = 0;
-  uint64_t m0 = to_base64_mask(&b->chunks[0], error);
-  uint64_t m1 = to_base64_mask(&b->chunks[1], error);
-  uint64_t m2 = to_base64_mask(&b->chunks[2], error);
-  uint64_t m3 = to_base64_mask(&b->chunks[3], error);
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
+  uint64_t m2 = to_base64_mask<base64_url>(&b->chunks[2], error);
+  uint64_t m3 = to_base64_mask<base64_url>(&b->chunks[3], error);
   return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
 }
 
@@ -267,6 +308,8 @@ static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   return _mm_popcnt_u64(nmask);
 }
 
+// The caller of this function is responsible for ensuring that at least 64 bytes
+// are available for reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char *src) {
   b->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
   b->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
@@ -274,6 +317,23 @@ static inline void load_block(block64 *b, const char *src) {
   b->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
 }
 
+// The caller must guarantee that 128 bytes (64 char16_t code units) can be read
+// from src. Each code unit is narrowed to one byte of the block64 structure.
+static inline void load_block(block64 *b, const char16_t *src) {
+  // Every chunk holds 16 bytes, so it consumes 16 code units, read here as
+  // two unaligned 128-bit loads of eight code units each.
+  for (size_t chunk = 0; chunk < 4; chunk++) {
+    const char16_t *in = src + 16 * chunk;
+    const __m128i lo =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(in));
+    const __m128i hi =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + 8));
+    // Saturating pack: lo fills the low eight bytes, hi the high eight bytes.
+    b->chunks[chunk] = _mm_packus_epi16(lo, hi);
+  }
+}
+
+
 static inline void base64_decode(char *out, __m128i str) {
   // credit: aqrit
 
@@ -323,7 +383,11 @@ static inline void base64_decode_block_safe(char *out, block64 *b) {
   std::memcpy(out + 36, buffer, 12);
 }
 
-result compress_decode_base64(char *dst, const char *src, size_t srclen) {
+template <bool base64_url, typename chartype>
+result compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                              base64_options options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
   size_t equalsigns = 0;
   if (srclen > 0 && src[srclen - 1] == '=') {
     srclen--;
@@ -336,26 +400,28 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
   char *end_of_safe_64byte_zone =
       (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
 
-  const char *const srcinit = src;
+  const chartype *const srcinit = src;
   const char *const dstinit = dst;
-  const char *const srcend = src + srclen;
+  const chartype *const srcend = src + srclen;
 
   constexpr size_t block_size = 6;
   static_assert(block_size >= 2, "block should of size 2 or more");
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
-    const char *const srcend64 = src + srclen - 64;
+    const chartype *const srcend64 = src + srclen - 64;
     while (src <= srcend64) {
       block64 b;
       load_block(&b, src);
       src += 64;
       bool error = false;
-      uint64_t badcharmask = to_base64_mask(&b, &error);
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        while (src < srcend &&
-               tables::base64::to_base64_value[uint8_t(*src)] <= 64) {
+        // Also stop at a code unit above 0xFF: truncated to 8 bits it could
+        // alias a valid character and make the reported position wrong.
+        while (src < srcend && uint8_t(*src) == *src &&
+               to_base64[uint8_t(*src)] <= 64) {
           src++;
         }
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -400,7 +466,9 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
   int last_block = (int)((bufferptr - buffer_start) % 64);
   if (last_block != 0 && srcend - src + last_block >= 64) {
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+      // A code unit above 0xFF is never valid base64: map it to 255 (invalid)
+      uint8_t val =
+          uint8_t(*src) == *src ? to_base64[uint8_t(*src)] : uint8_t(255);
       *bufferptr = char(val);
       if (val > 64) {
         return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
@@ -448,7 +516,9 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
     int leftover = int(bufferptr - buffer_start);
     if (leftover > 0) {
       while (leftover < 4 && src < srcend) {
-        uint8_t val = tables::base64::to_base64_value[uint8_t(*src)];
+        // A code unit above 0xFF is never valid base64: map it to 255.
+        uint8_t val =
+            uint8_t(*src) == *src ? to_base64[uint8_t(*src)] : uint8_t(255);
         if (val > 64) {
           return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
         }
@@ -490,7 +560,8 @@ result compress_decode_base64(char *dst, const char *src, size_t srclen) {
     }
   }
   if (src < srcend + equalsigns) {
-    result r = scalar::base64::base64_tail_decode(dst, src, srcend - src);
+    result r =
+        scalar::base64::base64_tail_decode(dst, src, srcend - src, options);
     if (r.error == error_code::INVALID_BASE64_CHARACTER) {
       r.count += size_t(src - srcinit);
       return r;
diff --git a/tests/base64_tests.cpp b/tests/base64_tests.cpp
index 6263a4c8a..19c0ef947 100644
--- a/tests/base64_tests.cpp
+++ b/tests/base64_tests.cpp
@@ -8,40 +8,296 @@
 #include <tests/helpers/test.h>
 #include <tests/helpers/transcode_test_base.h>
 
+// We may disable base64url tests by commenting out this next line.
+#define SIMDUTF_BASE64URL_TESTS 1
+
 using random_generator = std::mt19937;
 static random_generator::result_type seed = 42;
 
+const uint8_t to_base64_value[] = { // standard alphabet: 0-63 = value, 64 = whitespace, 255 = invalid
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62,  255,
+    255, 255, 63,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+
+// base64url counterpart of to_base64_value: '-' maps to 62 and '_' maps to 63.
+const uint8_t to_base64url_value[] = {
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  255,
+    62, 255, 255,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
+    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
+    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
+    25,  255, 255, 255, 255, 63, 255, 26,  27,  28,  29,  30,  31,  32,  33,
+    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
+    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+    255};
+// Inserts one random ASCII whitespace character (' ', '\t', '\n' or '\r') into
+// v at a random index that lies before any trailing '=' padding (one or two
+// characters) and returns that index.
+template <typename char_type>
+size_t add_space(std::vector<char_type> &v, std::mt19937 &gen) {
+  const static std::array<char_type, 4> space = {' ', '\t', '\n', '\r'};
+  int padding = 0;
+  if (v.size() > 0 && v[v.size() - 1] == '=') {
+    padding = (v.size() > 1 && v[v.size() - 2] == '=') ? 2 : 1;
+  }
+  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
+  size_t i = index_dist(gen);
+  std::uniform_int_distribution<int> char_dist(0, 3);
+  v.insert(v.begin() + i, space[char_dist(gen)]);
+  return i;
+}
+
+// Inserts one random invalid character (to_base64_value entry of 255) at a
+// random index before any trailing '=' padding and returns that index. The
+// candidate is stored in a uint8_t, so it is always below 256.
+template <typename char_type>
+size_t add_garbage(std::vector<char_type> &v, std::mt19937 &gen) {
+  int padding = 0;
+  if (v.size() > 0 && v[v.size() - 1] == '=') {
+    padding = (v.size() > 1 && v[v.size() - 2] == '=') ? 2 : 1;
+  }
+  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
+  size_t i = index_dist(gen);
+  std::uniform_int_distribution<int> char_dist(
+      0, (1 << (sizeof(char_type) * 8)) - 1);
+  uint8_t c = char_dist(gen);
+  while (uint8_t(c) == c && to_base64_value[uint8_t(c)] != 255) {
+    c = char_dist(gen);
+  }
+  v.insert(v.begin() + i, c);
+  return i;
+}
+
 TEST(decode_base64_cases) {
   std::vector<std::vector<char>> cases = {{0x53, 0x53}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
 
-  for(size_t i = 0; i < cases.size(); i++) {
-    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(cases[i].data(), cases[i].size()));
-    simdutf::result r = implementation.base64_to_binary(cases[i].data(), cases[i].size(), buffer.data());
-    ASSERT_EQUAL(r.error,codes[i]);
+  for (size_t i = 0; i < cases.size(); i++) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        cases[i].data(), cases[i].size()));
+    simdutf::result r = implementation.base64_to_binary(
+        cases[i].data(), cases[i].size(), buffer.data());
+    ASSERT_EQUAL(r.error, codes[i]);
     ASSERT_EQUAL(r.count, counts[i]);
   }
 }
 
 TEST(encode_base64_cases) {
-  std::vector<std::pair<std::string,std::string>> cases = {
-    {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
-    {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
-    {"123456", "MTIzNDU2"},
-    {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"}};
+  std::vector<std::pair<std::string, std::string>> cases = {
+      {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", "MTIzNDU2"},
+      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", "IVJ+SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c+fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
   std::vector<simdutf::error_code> codes = {simdutf::error_code::SUCCESS};
   std::vector<size_t> counts = {1};
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(
+        implementation.base64_length_from_binary(p.first.size()));
+    ASSERT_EQUAL(buffer.size(), p.second.size());
+    size_t s = implementation.binary_to_base64(p.first.data(), p.first.size(),
+                                               buffer.data());
+    ASSERT_EQUAL(s, p.second.size());
+    ASSERT_TRUE(std::string(buffer.data(), buffer.size()) == p.second);
+  }
+  printf(" -- ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data());
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" --  ");
+  for (std::pair<std::string, std::string> p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size();
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size());
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
+#if SIMDUTF_BASE64URL_TESTS
 
-  for(std::pair<std::string,std::string> p : cases) {
-    std::vector<char> buffer(implementation.base64_length_from_binary(p.first.size()));
+TEST(encode_base64url_cases) {
+  std::vector<std::pair<std::string, std::string>> cases = {
+      {"Hello, World!", "SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", "R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", "MTIzNDU2"},
+      {"Base64 Encoding", "QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", "IVJ-SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c-fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
+  // Each case pairs raw bytes (first) with their base64url text (second);
+  // the three passes below check encode, decode, and bounded ("safe") decode.
+  printf(" -- ");
+  for (const std::pair<std::string, std::string> &p : cases) {
+    std::vector<char> buffer(
+        implementation.base64_length_from_binary(p.first.size()));
     ASSERT_EQUAL(buffer.size(), p.second.size());
-    size_t s = implementation.binary_to_base64(p.first.data(),p.first.size(), buffer.data());
+    size_t s = implementation.binary_to_base64(p.first.data(), p.first.size(),
+                                               buffer.data(), simdutf::base64_url);
     ASSERT_EQUAL(s, p.second.size());
+    if (std::string(buffer.data(), buffer.size()) != p.second) { // show diff
+      printf("difference:\n");
+      printf(" %.*s\n", (int)s, buffer.data());
+      printf(" %.*s\n", (int)s, p.second.data());
+    }
     ASSERT_TRUE(std::string(buffer.data(), buffer.size()) == p.second);
   }
+  printf(" -- ");
+  for (const std::pair<std::string, std::string> &p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data(), simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size()); // compared to the decoded length
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" --  ");
+  for (const std::pair<std::string, std::string> &p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size(); // passed by name; checked again below
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length, simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size()); // here compared to input length
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
 }
 
+#endif
+
+TEST(encode_base64_cases_16) {
+  std::vector<std::pair<std::string, std::u16string>> cases = {
+      {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", u"R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", u"MTIzNDU2"},
+      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", u"IVJ+SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c+fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
+  // Each case pairs raw bytes (first) with their base64 text as char16_t
+  // (second); both passes decode and must reproduce the raw bytes exactly.
+  printf(" -- ");
+
+  for (const std::pair<std::string, std::u16string> &p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data());
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size()); // compared to the decoded length
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" -- ");
+  for (const std::pair<std::string, std::u16string> &p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size(); // passed by name; checked again below
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size()); // here compared to input length
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
+#if SIMDUTF_BASE64URL_TESTS
+
+TEST(encode_base64url_cases_16) {
+  std::vector<std::pair<std::string, std::u16string>> cases = {
+      {"Hello, World!", u"SGVsbG8sIFdvcmxkIQ=="},
+      {"GeeksforGeeks", u"R2Vla3Nmb3JHZWVrcw=="},
+      {"123456", u"MTIzNDU2"},
+      {"Base64 Encoding", u"QmFzZTY0IEVuY29kaW5n"},
+      {"!R~J2jL&mI]O)3=c:G3Mo)oqmJdxoprTZDyxEvU0MI.'Ww5H{G>}y;;+B8E_Ah,Ed[ PdBqY'^N>O$4:7LK1<:|7)btV@|{YWR$$Er59-XjVrFl4L}~yzTEd4'E[@k", u"IVJ-SjJqTCZtSV1PKTM9YzpHM01vKW9xbUpkeG9wclRaRHl4RXZVME1JLidXdzVIe0c-fXk7OytCOEVfQWgsRWRbIFBkQnFZJ15OPk8kNDo3TEsxPDp8NylidFZAfHtZV1IkJEVyNTktWGpWckZsNEx9fnl6VEVkNCdFW0Br"}};
+  // Each case pairs raw bytes (first) with their base64url text as char16_t
+  // (second); both passes decode and must reproduce the raw bytes exactly.
+  printf(" -- ");
+
+  for (const std::pair<std::string, std::u16string> &p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    simdutf::result r = implementation.base64_to_binary(
+        p.second.data(), p.second.size(), buffer.data(), simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.first.size()); // compared to the decoded length
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+  printf(" -- ");
+  for (const std::pair<std::string, std::u16string> &p : cases) {
+    std::vector<char> buffer(implementation.maximal_binary_length_from_base64(
+        p.second.data(), p.second.size()));
+    ASSERT_EQUAL(buffer.size(), p.first.size());
+    size_t length = buffer.size(); // passed by name; checked again below
+    simdutf::result r = simdutf::base64_to_binary_safe(
+        p.second.data(), p.second.size(), buffer.data(), length, simdutf::base64_url);
+    ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+    ASSERT_EQUAL(r.count, p.second.size()); // here compared to input length
+    ASSERT_EQUAL(length, p.first.size());
+    for (size_t i = 0; i < buffer.size(); i++) {
+      ASSERT_EQUAL(buffer[i], p.first[i]);
+    }
+  }
+}
+
+#endif
+
 TEST(roundtrip_base64) {
   for (size_t len = 0; len < 2048; len++) {
     std::vector<char> source(len, 0);
@@ -61,16 +317,17 @@ TEST(roundtrip_base64) {
           implementation.base64_to_binary(buffer.data(), size, back.data());
       ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
       ASSERT_EQUAL(r.count, len);
-      if(back != source) {
+      if (back != source) {
         printf("=====input size %zu\n", len);
-        for(size_t i = 0; i < len; i++) {
-          if(back[i] != source[i]) {
-            std::cerr << "Mismatch at position " << i << " trial " << trial << std::endl;
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
           }
           printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
         }
         printf("=====base64 size %zu\n", size);
-        for(size_t i = 0; i < size; i++) {
+        for (size_t i = 0; i < size; i++) {
           printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
         }
       }
@@ -79,60 +336,137 @@ TEST(roundtrip_base64) {
   }
 }
 
-const uint8_t to_base64_value[] = {
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 64,  64,  255, 255, 64,  255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 64,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62,  255,
-    255, 255, 63,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  255, 255,
-    255, 255, 255, 255, 255, 0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
-    10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
-    25,  255, 255, 255, 255, 255, 255, 26,  27,  28,  29,  30,  31,  32,  33,
-    34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
-    49,  50,  51,  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255};
+TEST(roundtrip_base64_16) { // encode bytes, copy to char16_t, decode back
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
 
-size_t add_space(std::vector<char> &v, std::mt19937 &gen) {
-  const static std::array<char, 4> space = {' ', '\t', '\n', '\r'};
-  int padding = 0;
-  if (v.size() > 0 && v[v.size() - 1] == '=') {
-    padding++;
-    if (v.size() > 0 && v[v.size() - 1] == '=') {
-      padding++;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed)); // same seed per len
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      buffer.resize(size);
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i]; // one char16_t per encoded char
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer16.data(), size, back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) { // dump both sides before the final assert fails
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
     }
   }
-  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
-  size_t i = index_dist(gen);
-  std::uniform_int_distribution<int> char_dist(0, 3);
-  v.insert(v.begin() + i, space[char_dist(gen)]);
-  return i;
 }
 
-size_t add_garbage(std::vector<char> &v, std::mt19937 &gen) {
-  int padding = 0;
-  if (v.size() > 0 && v[v.size() - 1] == '=') {
-    padding++;
-    if (v.size() > 0 && v[v.size() - 1] == '=') {
-      padding++;
+
+#if SIMDUTF_BASE64URL_TESTS
+
+TEST(roundtrip_base64url) { // encode with base64_url, decode, compare bytes
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed)); // same seed per len
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data(), simdutf::base64_url);
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer.data(), size, back.data(), simdutf::base64_url);
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) { // dump both sides before the final assert fails
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
     }
   }
-  std::uniform_int_distribution<int> index_dist(0, v.size() - padding);
-  size_t i = index_dist(gen);
-  std::uniform_int_distribution<int> char_dist(0, 255);
-  uint8_t c = char_dist(gen);
-  while(to_base64_value[uint8_t(c)] != 255) {
-    c = char_dist(gen);
+}
+
+TEST(roundtrip_base64url_16) { // base64_url encode, copy to char16_t, decode
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed)); // same seed per len
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data(), simdutf::base64_url);
+      buffer.resize(size);
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i]; // one char16_t per encoded char
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r =
+          implementation.base64_to_binary(buffer16.data(), size, back.data(), simdutf::base64_url);
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) { // dump both sides before the final assert fails
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", size);
+        for (size_t i = 0; i < size; i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
+    }
   }
-  v.insert(v.begin() + i, c);
-  return i;
 }
+#endif
 
 TEST(doomed_base64_roundtrip) {
   for (size_t len = 0; len < 2048; len++) {
@@ -151,8 +485,13 @@ TEST(doomed_base64_roundtrip) {
       size_t location = add_garbage(buffer, gen);
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::INVALID_BASE64_CHARACTER);
+      ASSERT_EQUAL(r.count, location);
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
       ASSERT_EQUAL(r.error, simdutf::error_code::INVALID_BASE64_CHARACTER);
       ASSERT_EQUAL(r.count, location);
     }
@@ -175,10 +514,49 @@ TEST(doomed_truncated_base64_roundtrip) {
       buffer.resize(size - 3);
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, (size - 4) / 4 * 3);
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, buffer.size());
+    }
+  }
+}
+
+TEST(doomed_truncated_base64_roundtrip_16) { // truncated input must be flagged
+  for (size_t len = 1; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::mt19937 gen((std::mt19937::result_type)(seed));
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      // Truncate only the 16-bit copy: `buffer` must stay full-size because
+      // the next trial encodes `size` chars into it again.
+      buffer16.resize(size - 3);
+      for (size_t i = 0; i < buffer16.size(); i++) {
+        buffer16[i] = buffer[i];
+      }
+      std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+          buffer16.data(), buffer16.size()));
       simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+          buffer16.data(), buffer16.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
+      ASSERT_EQUAL(r.count, (size - 4) / 4 * 3); // 3 bytes per whole 4-char group
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer16.data(), buffer16.size(),
+                                         back.data(), back_length);
       ASSERT_EQUAL(r.error, simdutf::error_code::BASE64_INPUT_REMAINDER);
-      ASSERT_EQUAL(r.count, (size-4)/4*3);
+      ASSERT_EQUAL(r.count, buffer16.size()); // safe variant: input length
     }
   }
 }
@@ -200,21 +578,277 @@ TEST(roundtrip_base64_with_spaces) {
       for (size_t i = 0; i < 5; i++) {
         add_space(buffer, gen);
       }
-
       std::vector<char> back(simdutf::maximal_binary_length_from_base64(
           buffer.data(), buffer.size()));
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data(), buffer.size(), back.data());
+      simdutf::result r =
+          simdutf::base64_to_binary(buffer.data(), buffer.size(), back.data());
       ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
 
       back.resize(
           r.count); // resize the buffer according to actual number of bytes
       ASSERT_EQUAL(r.count, len);
       ASSERT_TRUE(back == source);
+      back.resize(back.capacity());
+      size_t back_length = back.size();
+      r = simdutf::base64_to_binary_safe(buffer.data(), buffer.size(),
+                                         back.data(), back_length);
+
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+
+      back.resize(
+          back_length); // resize the buffer according to actual number of bytes
+      ASSERT_EQUAL(r.count, buffer.size());
+      ASSERT_TRUE(back == source);
+    }
+  }
+}
+
+TEST(roundtrip_base64_16_with_spaces) { // decode must skip inserted spaces
+  for (size_t len = 0; len < 2048; len++) {
+    std::vector<char> source(len, 0);
+    std::vector<char> buffer;
+    std::vector<char16_t> buffer16;
+
+    buffer.resize(implementation.base64_length_from_binary(len));
+    std::vector<char> back(len);
+    std::mt19937 gen((std::mt19937::result_type)(seed)); // same seed per len
+    std::uniform_int_distribution<int> byte_generator{0, 255};
+    for (size_t trial = 0; trial < 10; trial++) {
+      for (size_t i = 0; i < len; i++) {
+        source[i] = byte_generator(gen);
+      }
+      size_t size = implementation.binary_to_base64(
+          source.data(), source.size(), buffer.data());
+      buffer.resize(size);
+      for (size_t i = 0; i < 5; i++) { // buffer grows to size + 5 chars
+        add_space(buffer, gen);
+      }
+      buffer16.resize(buffer.size());
+      for (size_t i = 0; i < buffer.size(); i++) {
+        buffer16[i] = buffer[i]; // one char16_t per char, spaces included
+      }
+      ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+      simdutf::result r = implementation.base64_to_binary(
+          buffer16.data(), buffer16.size(), back.data());
+      ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+      ASSERT_EQUAL(r.count, len);
+      if (back != source) { // dump both sides before the final assert fails
+        printf("=====input size %zu\n", len);
+        for (size_t i = 0; i < len; i++) {
+          if (back[i] != source[i]) {
+            std::cerr << "Mismatch at position " << i << " trial " << trial
+                      << std::endl;
+          }
+          printf("%zu: %02x %02x\n", i, uint8_t(back[i]), uint8_t(source[i]));
+        }
+        printf("=====base64 size %zu\n", buffer.size()); // includes the spaces
+        for (size_t i = 0; i < buffer.size(); i++) {
+          printf("%zu: %02x %c\n", i, uint8_t(buffer[i]), buffer[i]);
+        }
+      }
+      ASSERT_TRUE(back == source);
     }
   }
 }
 
+TEST(aborted_safe_roundtrip_base64) { // undersized output, then resume
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+            buffer.data(), buffer.size()));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit(); // ask the vector to release spare capacity
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Resume decoding at the input offset reported by the aborted call.
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(second_length);
+        ASSERT_EQUAL(second_length + limited_length, len); // halves cover len
+
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64_16) { // char16_t input, undersized output
+  for (size_t offset = 1; offset <= 16; offset += 3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      std::vector<char16_t> buffer16;
+
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::vector<char> back(len);
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        buffer16.resize(buffer.size());
+        for (size_t i = 0; i < buffer.size(); i++) {
+          buffer16[i] = buffer[i]; // one char16_t per encoded char
+        }
+        ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit(); // ask the vector to release spare capacity
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer16.data(), buffer16.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Resume decoding at the input offset reported by the aborted call.
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer16.data() + input_index, buffer16.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer16.data() + input_index,
+                                           buffer16.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(second_length);
+        ASSERT_EQUAL(second_length + limited_length, len); // halves cover len
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64_with_spaces) { // as above, spaces added
+  for (size_t offset = 1; offset <= 16; offset+=3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        for (size_t i = 0; i < 5; i++) { // buffer grows to size + 5 chars
+          add_space(buffer, gen);
+        }
+        std::vector<char> back(simdutf::maximal_binary_length_from_base64(
+            buffer.data(), buffer.size()));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit(); // ask the vector to release spare capacity
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer.data(), buffer.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Resume decoding at the input offset reported by the aborted call.
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer.data() + input_index, buffer.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer.data() + input_index,
+                                           buffer.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(second_length);
+        ASSERT_EQUAL(second_length + limited_length, len); // halves cover len
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
+
+TEST(aborted_safe_roundtrip_base64_16_with_spaces) { // char16_t plus spaces
+  for (size_t offset = 1; offset <= 16; offset += 3) {
+    for (size_t len = offset; len < 1024; len++) {
+      std::vector<char> source(len, 0);
+      std::vector<char> buffer;
+      std::vector<char16_t> buffer16;
+
+      buffer.resize(implementation.base64_length_from_binary(len));
+      std::vector<char> back(len);
+      std::mt19937 gen((std::mt19937::result_type)(seed));
+      std::uniform_int_distribution<int> byte_generator{0, 255};
+      for (size_t trial = 0; trial < 10; trial++) {
+        for (size_t i = 0; i < len; i++) {
+          source[i] = byte_generator(gen);
+        }
+        size_t size = implementation.binary_to_base64(
+            source.data(), source.size(), buffer.data());
+        buffer.resize(size);
+        for (size_t i = 0; i < 5; i++) { // buffer grows to size + 5 chars
+          add_space(buffer, gen);
+        }
+        buffer16.resize(buffer.size());
+        for (size_t i = 0; i < buffer.size(); i++) {
+          buffer16[i] = buffer[i]; // one char16_t per char, spaces included
+        }
+        ASSERT_TRUE(size == implementation.base64_length_from_binary(len));
+        size_t limited_length = len - offset; // intentionally too little
+        back.resize(limited_length);
+        back.shrink_to_fit(); // ask the vector to release spare capacity
+        simdutf::result r = simdutf::base64_to_binary_safe(
+            buffer16.data(), buffer16.size(), back.data(), limited_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+        for (size_t i = 0; i < limited_length; i++) {
+          ASSERT_EQUAL(source[i], back[i]);
+        }
+        // Resume decoding at the input offset reported by the aborted call.
+        size_t input_index = r.count;
+        back.resize(simdutf::maximal_binary_length_from_base64(
+            buffer16.data() + input_index, buffer16.size() - input_index));
+        size_t second_length = back.size();
+        r = simdutf::base64_to_binary_safe(buffer16.data() + input_index,
+                                           buffer16.size() - input_index,
+                                           back.data(), second_length);
+        ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+        back.resize(second_length);
+        ASSERT_EQUAL(second_length + limited_length, len); // halves cover len
+        for (size_t i = 0; i < second_length; i++) {
+          ASSERT_EQUAL(source[i + limited_length], back[i]);
+        }
+      }
+    }
+  }
+}
 
 TEST(streaming_base64_roundtrip) {
   size_t len = 2048;
@@ -226,25 +860,25 @@ TEST(streaming_base64_roundtrip) {
   for (size_t i = 0; i < len; i++) {
     source[i] = byte_generator(gen);
   }
-  size_t size = implementation.binary_to_base64(
-          source.data(), source.size(), buffer.data());
+  size_t size = implementation.binary_to_base64(source.data(), source.size(),
+                                                buffer.data());
   buffer.resize(size);
   for (size_t window = 16; window <= 2048; window += 7) {
     // build a buffer with enough space to receive the decoded base64
     std::vector<char> back(len);
     size_t outpos = 0;
-    for(size_t pos = 0; pos < buffer.size(); pos += window) {
+    for (size_t pos = 0; pos < buffer.size(); pos += window) {
       size_t count = std::min(window, buffer.size() - pos);
-      simdutf::result r = simdutf::base64_to_binary(
-          buffer.data() + pos, count, back.data() + outpos);
+      simdutf::result r = simdutf::base64_to_binary(buffer.data() + pos, count,
+                                                    back.data() + outpos);
       ASSERT_TRUE(r.error != simdutf::error_code::INVALID_BASE64_CHARACTER);
-      if(count + pos == buffer.size()) {
+      if (count + pos == buffer.size()) {
         // We must check that the last call to base64_to_binary did not
         // end with an BASE64_INPUT_REMAINDER error.
         ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
       } else {
         size_t tail_bytes_to_reprocess = 0;
-        if(r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+        if (r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
           tail_bytes_to_reprocess = 1;
         } else {
           tail_bytes_to_reprocess = (r.count % 3) == 0 ? 0 : (r.count % 3) + 1;
@@ -259,31 +893,34 @@ TEST(streaming_base64_roundtrip) {
   }
 }
 
-
 TEST(readme_test) {
   size_t len = 2048;
   std::vector<char> base64(len, 'a');
-  std::vector<char> back((len+3)/4*3);
+  std::vector<char> back((len + 3) / 4 * 3);
   size_t outpos = 0;
   size_t window = 512;
-  for(size_t pos = 0; pos < base64.size(); pos += window) {
+  for (size_t pos = 0; pos < base64.size(); pos += window) {
     // how many base64 characters we can process in this iteration
     size_t count = std::min(window, base64.size() - pos);
-    simdutf::result r = simdutf::base64_to_binary(
-        base64.data() + pos, count, back.data() + outpos);
-    if(r.error == simdutf::error_code::INVALID_BASE64_CHARACTER) {
-      std::cerr << "Invalid base64 character at position " << pos + r.count << std::endl;
+    simdutf::result r = simdutf::base64_to_binary(base64.data() + pos, count,
+                                                  back.data() + outpos);
+    if (r.error == simdutf::error_code::INVALID_BASE64_CHARACTER) {
+      std::cerr << "Invalid base64 character at position " << pos + r.count
+                << std::endl;
       return;
     }
-    // If we arrived at the end of the base64 input, we must check that the number
-    // of characters processed is a multiple of 4, or that we have a remainder of 0, 2 or 3.
-    if(count + pos == base64.size() && r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
-      std::cerr << "The base64 input contained an invalid number of characters " << std::endl;
+    // If we arrived at the end of the base64 input, we must check that the
+    // number of characters processed is a multiple of 4, or that we have a
+    // remainder of 0, 2 or 3.
+    if (count + pos == base64.size() &&
+        r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+      std::cerr << "The base64 input contained an invalid number of characters "
+                << std::endl;
     }
-    // If we are not at then end, we may have to reprocess either 1, 2 or 3 bytes, and
-    // to drop the last 0, 2 or 3 bytes decoded.
+    // If we are not at the end, we may have to reprocess either 1, 2 or 3
+    // bytes, and to drop the last 0, 2 or 3 bytes decoded.
     size_t tail_bytes_to_reprocess = 0;
-    if(r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
+    if (r.error == simdutf::error_code::BASE64_INPUT_REMAINDER) {
       tail_bytes_to_reprocess = 1;
     } else {
       tail_bytes_to_reprocess = (r.count % 3) == 0 ? 0 : (r.count % 3) + 1;
@@ -296,13 +933,34 @@ TEST(readme_test) {
   back.resize(outpos);
 }
 
+TEST(readme_safe) { // two-step decode through base64_to_binary_safe
+  size_t len = 72; // number of base64 input characters
+  std::vector<char> base64(len, 'a'); // input is 'a' repeated len times
+  std::vector<char> back((len + 3) / 4 * 3); // output buffer
+  size_t limited_length = back.size() / 2; // Intentionally too small
+  simdutf::result r = simdutf::base64_to_binary_safe(
+            base64.data(), base64.size(), back.data(), limited_length);
+  ASSERT_EQUAL(r.error, simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
+
+  // We decoded 'limited_length' bytes to back.
+  // Now let us decode the rest, resuming at input offset r.count.
+  size_t input_index = r.count;
+  size_t limited_length2 = back.size(); // capacity offered to the second call
+  r = simdutf::base64_to_binary_safe(base64.data() + input_index,
+                                           base64.size() - input_index,
+                                           back.data(), limited_length2); // writes at the start of back again
+  ASSERT_EQUAL(r.error, simdutf::error_code::SUCCESS);
+  back.resize(limited_length2); // NOTE(review): presumably updated by the call to the bytes written - confirm
+  ASSERT_EQUAL(limited_length2 + limited_length, (len + 3) / 4 * 3); // both parts add up to the full output size
+}
+
 int main(int argc, char *argv[]) {
   if (argc == 2) {
     try {
       seed = std::stoi(argv[1]);
-    } catch (const std::exception& e) {
-        printf("%s\n", e.what());
-        return EXIT_FAILURE;
+    } catch (const std::exception &e) {
+      printf("%s\n", e.what());
+      return EXIT_FAILURE;
     }
   }
   return simdutf::test::main(argc, argv);
diff --git a/tests/helpers/test.h b/tests/helpers/test.h
index 1d7d20238..5c28a299c 100644
--- a/tests/helpers/test.h
+++ b/tests/helpers/test.h
@@ -41,11 +41,12 @@ void name(const simdutf::implementation& impl) {            \
 static simdutf::test::register_test test_register_##name(#name, name); \
 void test_impl_##name(const simdutf::implementation& implementation)
 
-#define ASSERT_EQUAL(a, b) {                                      \
-  const auto expr = (a);                                          \
-  if (expr != b) {                                                \
-    std::cout << "\nExpected " << expr << " to be " << b << ".\n";\
-    printf("%s \n",#a);                                           \
+#define ASSERT_EQUAL(a, b) { /* print a diagnostic and exit(1) if a != b */    \
+  const auto expr = (a); /* a is evaluated exactly once */                     \
+  if (expr != (b)) { /* parenthesized so operators in b cannot rebind */       \
+    std::cout << "\nExpected " << expr << " to be " << (b) << ".\n";           \
+    printf("%s \n",#a);                                                        \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__);       \
     exit(1);                                                      \
   }                                                               \
 }
@@ -54,6 +55,7 @@ void test_impl_##name(const simdutf::implementation& implementation)
   const bool expr = (cond);                                 \
   if (!expr) {                                              \
     printf("expected %s to be true, it's false\n", #cond);  \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__); \
     exit(1);                                                \
   }                                                         \
 }
@@ -62,6 +64,7 @@ void test_impl_##name(const simdutf::implementation& implementation)
   const bool expr = !(cond);                                \
   if (!expr) {                                              \
     printf("expected %s to be false, it's true\n", #cond);  \
+    printf("file %s:%d, function %s  \n", __FILE__, __LINE__, __func__); \
     exit(1);                                                \
   }                                                         \
 }
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 57a846f41..c0a62a934 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -40,6 +40,7 @@ if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Cl
 endif()
 endif()
 
+message(STATUS "The tools require C++17. If your system does not support C++17, please set SIMDUTF_TOOLS to OFF.")
 set_property(TARGET sutf PROPERTY CXX_STANDARD 17)
 set_property(TARGET sutf PROPERTY CXX_STANDARD_REQUIRED ON)
 set_property(TARGET fastbase64 PROPERTY CXX_STANDARD 17)