Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
3a64443
trimming some unnecessary code
lemire Mar 20, 2024
3f9cb0f
fixing missing rvv implementation
lemire Mar 20, 2024
a9ea1c6
completing the base64 implementation.
lemire Mar 22, 2024
0f49240
adding ppc64
lemire Mar 22, 2024
5daa520
saving
Mar 22, 2024
151aa09
saturated.
lemire Mar 22, 2024
b917aa8
finishing...
Mar 22, 2024
ca17560
various fixes
Mar 27, 2024
94b7dac
Implemented bun benchmark
lemire Mar 27, 2024
c35d8df
Obvious fix.
lemire Mar 27, 2024
1a90f2a
documentation
lemire Mar 27, 2024
bd454ea
adding libbase64 competitor
lemire Mar 27, 2024
bdab72f
more documentation.
lemire Mar 27, 2024
65f933b
base64url (first steps)
lemire Mar 28, 2024
4aa837d
working through
lemire Mar 28, 2024
8dc79aa
implemented base64url for ARM.
lemire Mar 29, 2024
fe1138f
documentation.
lemire Mar 29, 2024
5d1d0d5
prototype base64url
Mar 30, 2024
21717c4
solved based64url
Mar 30, 2024
c96ac90
completing the base64 implementation.
lemire Mar 22, 2024
106e18c
adding ppc64
lemire Mar 22, 2024
d1c9cbc
saving
Mar 22, 2024
8606798
saturated.
lemire Mar 22, 2024
e7eae70
finishing...
Mar 22, 2024
9262b4b
various fixes
Mar 27, 2024
3444f4e
Implemented bun benchmark
lemire Mar 27, 2024
6949b2c
Obvious fix.
lemire Mar 27, 2024
381945b
documentation
lemire Mar 27, 2024
7b304d3
adding libbase64 competitor
lemire Mar 27, 2024
f51ffdf
more documentation.
lemire Mar 27, 2024
3d87826
base64url (first steps)
lemire Mar 28, 2024
c72079c
working through
lemire Mar 28, 2024
200b6bc
implemented base64url for ARM.
lemire Mar 29, 2024
4971bc2
documentation.
lemire Mar 29, 2024
c729247
prototype base64url
Mar 30, 2024
e32acc9
solved based64url
Mar 30, 2024
038ce51
Merge branch 'base64_part2' of github.com:simdutf/simdutf into base64…
Mar 30, 2024
9154818
fixing a missing func definition (bad signature)
lemire Mar 30, 2024
fd037f5
no such thing as version 4 of uraimo/run-on-arch-action
lemire Mar 30, 2024
0de753a
fixes
lemire Mar 30, 2024
ccdf51d
Update benchmarks/base64/benchmark_base64.cpp
lemire Mar 30, 2024
7ec70f2
Update benchmarks/base64/benchmark_base64.cpp
lemire Mar 30, 2024
18dc616
Update benchmarks/base64/libbase64_spaces.h
lemire Mar 30, 2024
aeb2f5f
Update include/simdutf/implementation.h
lemire Mar 30, 2024
e0ce663
Update src/haswell/avx2_base64.cpp
lemire Mar 30, 2024
bb9d1fc
various minor fixes (linting + comments)
Mar 30, 2024
f511d9a
adding another comment.
Mar 30, 2024
e2a224f
cleaning up the base64 benchmark flags
Mar 30, 2024
5e6a366
disabling Ubuntu rvv VLEN=1024 (clang 17) CI due to system failures
Mar 30, 2024
9a92c54
adding the option
Mar 31, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/workflows/ppc64le.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
name: Ubuntu ppc64le (GCC 11)

on:
push:
branches:
- master
pull_request:
branches:
- master

jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: uraimo/run-on-arch-action@v2
name: Test
id: runcmd
with:
arch: ppc64le
githubToken: ${{ github.token }}
distro: ubuntu_latest
install: |
apt-get update -q -y
apt-get install -y cmake make g++
run: |
cmake -DCMAKE_BUILD_TYPE=Release -B build
cmake --build build -j=2
15 changes: 8 additions & 7 deletions .github/workflows/rvv-1024-clang-17.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
name: Ubuntu rvv VLEN=1024 (clang 17)

on:
push:
branches:
- master
pull_request:
branches:
- master
# Fails due to the inability to install packages
#on:
# push:
# branches:
# - master
# pull_request:
# branches:
# - master

jobs:
build:
Expand Down
15 changes: 8 additions & 7 deletions .github/workflows/rvv-128-clang-17.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
name: Ubuntu rvv VLEN=128 (clang 17)

on:
push:
branches:
- master
pull_request:
branches:
- master
# Fails due to the inability to install packages
#on:
# push:
# branches:
# - master
# pull_request:
# branches:
# - master

jobs:
build:
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ set(SIMDUTF_LIB_VERSION "6.0.0" CACHE STRING "simdutf library version")
set(SIMDUTF_LIB_SOVERSION "6" CACHE STRING "simdutf library soversion")
option(SIMDUTF_TESTS "Whether the tests are included as part of the CMake Build." ON)
option(SIMDUTF_BENCHMARKS "Whether the benchmarks are included as part of the CMake Build." OFF)
option(SIMDUTF_TOOLS "Whether the tools are included as part of the CMake build." ON)
option(SIMDUTF_TOOLS "Whether the tools are included as part of the CMake build. Requires C++17 or better." ON)
option(SIMDUTF_ICONV "Whether to use iconv as part of the CMake build if available." ON)

set(SIMDUTF_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
Expand Down
148 changes: 139 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ This library provide fast Unicode functions such as
- From an UTF-16LE/BE string, compute the size of the UTF-32 equivalent string (equivalent to UTF-16 character counting),
- UTF-8 and UTF-16LE/BE character counting,
- UTF-16 endianness change (UTF16-LE/BE to UTF-16-BE/LE),
- [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary,
- Binary to base64.
- [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) (with or without URL encoding) to binary,
- Binary to base64 (with or without URL encoding).

The functions are accelerated using SIMD instructions (e.g., ARM NEON, SSE, AVX, AVX-512, RISC-V Vector Extension, etc.). When your strings contain hundreds of characters, we can often transcode them at speeds exceeding a billion characters per second. You should expect high speeds not only with English strings (ASCII) but also Chinese, Japanese, Arabic, and so forth. We handle the full character range (including, for example, emojis).

Expand Down Expand Up @@ -1568,7 +1568,7 @@ void change_endianness_utf16(const char16_t * input, size_t length, char16_t * o
Base64
-----

We also support converting from [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary, and back. In particular, you can convert base64 inputs which contain ASCII spaces to binary.
We also support converting from [WHATWG forgiving-base64](https://infra.spec.whatwg.org/#forgiving-base64-decode) to binary, and back. In particular, you can convert base64 inputs which contain ASCII spaces to binary. We also support the base64 URL encoding alternative.

Converting binary data to base64 always succeeds and is relatively simple:
```C++
Expand All @@ -1583,12 +1583,50 @@ we prune spaces, we may need to adjust the result size afterward.
std::vector<char> buffer(simdutf::maximal_binary_length_from_base64(base64.data(), base64.size()));
simdutf::result r = simdutf::base64_to_binary(base64.data(), base64.size(), buffer.data());
if(r.error) {
// We have some error, r.count tells you where the error was encountered in the input
// We have some error, r.count tells you where the error was encountered in the input if
// the error is INVALID_BASE64_CHARACTER. If the error is BASE64_INPUT_REMAINDER, then
// a single valid base64 character remained, and r.count contains the number of bytes decoded.
} else {
buffer.resize(r.count); // resize the buffer according to actual number of bytes
}
```

In some instances, you may want to limit the size of the output further when decoding base64.
For this purpose, you may use the `base64_to_binary_safe` functions. The functions may also
be useful if you seek to decode the input into segments having a maximal capacity.


```C++
size_t len = 72; // for simplicity we chose len divisible by 3
std::vector<char> base64(len, 'a'); // we want to decode 'aaaaa....'
std::vector<char> back((len + 3) / 4 * 3);
size_t limited_length = back.size() / 2; // Intentionally too small
// We proceed to decode half:
simdutf::result r = simdutf::base64_to_binary_safe(
base64.data(), base64.size(), back.data(), limited_length);
assert(r.error == simdutf::error_code::OUTPUT_BUFFER_TOO_SMALL);
// We decoded r.count base64 8-bit units to limited_length bytes
// Now let us decode the rest !!!
//
// We have read up to r.count in the input buffer and we have
// produced limited_length bytes.
//
size_t input_index = r.count;
size_t limited_length2 = back.size();
r = simdutf::base64_to_binary_safe(base64.data() + input_index,
base64.size() - input_index,
back.data(), limited_length2);
assert(r.error == simdutf::error_code::SUCCESS);
// We decoded r.count base64 8-bit units to limited_length2 bytes
// We are done
assert(limited_length2 + limited_length == (len + 3) / 4 * 3);
```

See our function specifications for more details.

In other instances, you may receive your base64 inputs in 16-bit units (e.g., from UTF-16 strings):
we have function overloads for these cases as well.

Some users may want to decode the base64 inputs in chunks, especially when doing
file or networking programming. These users should see `tools/fastbase64.cpp`, a command-line
utility designed as an example. It reads and writes base64 files using chunks of at most
Expand All @@ -1597,17 +1635,36 @@ a few tens of kilobytes.
The specification of our base64 functions is as follows:

```C++

// base64_options are used to specify the base64 encoding options.
using base64_options = uint64_t;
enum : base64_options {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

enum class maybe?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I am open to using an enum class but I am a bit concerned but I really want this to be a bitset that we can extend later to support different options.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I verified that enum classes do not work well as bitsets:

enum class Joe : uint64_t {
    default_base64 = 0,
    url_base64 = 1,
    allow_spaces = 4,
    allow_padding = 8,
};
int main() {
    Joe t = Joe::default_base64 | Joe:: allow_padding; // Will not compile
}

The idea here is to be able to extend the API without too much of a mess by allowing 'options' if people have a great need for them. Like we could disable white spaces in a later version. Enum classes makes this more difficult than need be.

Granted, they are safer but we could validate the values if we are concerned.

base64_default = 0, /* standard base64 format */
base64_url = 1 /* base64url format*/
};

/**
* Provide the maximal binary length in bytes given the base64 input.
* In general, if the input contains ASCII spaces, the result will be less than
* the maximum length.
*
* @param input the base64 input to process
* @param length the length of the base64 input in bytes
* @return number of base64 bytes
* @return maximal number of binary bytes
*/
simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input, size_t length) noexcept;

/**
* Provide the maximal binary length in bytes given the base64 input.
* In general, if the input contains ASCII spaces, the result will be less than
* the maximum length.
*
* @param input the base64 input to process, in ASCII stored as 16-bit units
* @param length the length of the base64 input in 16-bit units
* @return maximal number of binary bytes
*/
simdutf_warn_unused size_t maximal_binary_length_from_base64(const char16_t * input, size_t length) noexcept;

/**
* Convert a base64 input to a binary output.
*
Expand All @@ -1618,19 +1675,27 @@ simdutf_warn_unused size_t maximal_binary_length_from_base64(const char * input,
* See https://infra.spec.whatwg.org/#forgiving-base64-decode
*
* This function will fail in case of invalid input. There are two possible reasons for
* failure: the input is contains a number of base64 characters that when divided by 4, leaves
* a singler remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
* failure: the input contains a number of base64 characters that when divided by 4, leaves
* a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
* that is not a valid base64 character (INVALID_BASE64_CHARACTER).
*
* The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard
* the output.
*
* When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
* where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
* r.count contains the number of bytes decoded.
*
* You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
* If you fail to provide that much space, the function may cause a buffer overflow.
*
* @param input the base64 string to process
* @param length the length of the string in bytes
* @param output the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
* @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default.
* @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the error (in the input in bytes) if any, or the number of bytes written if successful.
*/
simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output) noexcept;
simdutf_warn_unused result base64_to_binary(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;

/**
* Provide the base64 length in bytes given the length of a binary input.
Expand All @@ -1649,9 +1714,74 @@ simdutf_warn_unused size_t base64_length_from_binary(size_t length) noexcept;
* @param input the binary to process
* @param length the length of the input in bytes
* @param output the pointer to buffer that can hold the conversion result (should be at least base64_length_from_binary(length) bytes long)
* @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default.
* @return number of written bytes, will be equal to base64_length_from_binary(length)
*/
size_t binary_to_base64(const char * input, size_t length, char* output) noexcept;
size_t binary_to_base64(const char * input, size_t length, char* output, base64_options options = base64_default) noexcept;

/**
* Convert a base64 input to a binary output.
*
* This function follows the WHATWG forgiving-base64 format, which means that it will
* ignore any ASCII spaces in the input. You may provide a padded input (with one or two
* equal signs at the end) or an unpadded input (without any equal signs at the end).
*
* See https://infra.spec.whatwg.org/#forgiving-base64-decode
*
* This function will fail in case of invalid input. There are two possible reasons for
* failure: the input contains a number of base64 characters that when divided by 4, leaves
* a single remainder character (BASE64_INPUT_REMAINDER), or the input contains a character
* that is not a valid base64 character (INVALID_BASE64_CHARACTER).
*
* When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
* where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
* r.count contains the number of bytes decoded.
*
* You should call this function with a buffer that is at least maximal_binary_length_from_base64(input, length) bytes long.
* If you fail to provide that much space, the function may cause a buffer overflow.
*
* @param input the base64 string to process, in ASCII stored as 16-bit units
* @param length the length of the string in 16-bit units
* @param output the pointer to buffer that can hold the conversion result (should be at least maximal_binary_length_from_base64(input, length) bytes long).
* @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default.
* @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and either position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of bytes written if successful.
*/
simdutf_warn_unused result base64_to_binary(const char16_t * input, size_t length, char* output, base64_options options = base64_default) noexcept;

/**
* Convert a base64 input to a binary output.
*
* This function follows the WHATWG forgiving-base64 format, which means that it will
* ignore any ASCII spaces in the input. You may provide a padded input (with one or two
* equal signs at the end) or an unpadded input (without any equal signs at the end).
*
* See https://infra.spec.whatwg.org/#forgiving-base64-decode
*
* This function will fail in case of invalid input. There are three possible reasons for
* failure: the input contains a number of base64 characters that when divided by 4, leaves
* a single remainder character (BASE64_INPUT_REMAINDER), the input contains a character
* that is not a valid base64 character (INVALID_BASE64_CHARACTER), or the output buffer
* is too small (OUTPUT_BUFFER_TOO_SMALL).
*
* When the error is OUTPUT_BUFFER_TOO_SMALL, we return both the number of bytes written
* and the number of units processed, see description of the parameters and returned value.
*
* When the error is INVALID_BASE64_CHARACTER, r.count contains the index in the input
* where the invalid character was found. When the error is BASE64_INPUT_REMAINDER, then
* r.count contains the number of bytes decoded.
*
* The INVALID_BASE64_CHARACTER cases are considered fatal and you are expected to discard
* the output.
*
* @param input the base64 string to process, in ASCII stored as 8-bit or 16-bit units
* @param length the length of the string in 8-bit or 16-bit units.
* @param output the pointer to buffer that can hold the conversion result.
* @param outlen the number of bytes that can be written in the output buffer. Upon return, it is modified to reflect how many bytes were written.
* @param options the base64 options to use, can be base64_default or base64_url, is base64_default by default.
* @return a result pair struct (of type simdutf::result containing the two fields error and count) with an error code and position of the INVALID_BASE64_CHARACTER error (in the input in units) if any, or the number of units processed if successful.
*/
simdutf_warn_unused result base64_to_binary_safe(const char * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;
simdutf_warn_unused result base64_to_binary_safe(const char16_t * input, size_t length, char* output, size_t& outlen, base64_options options = base64_default) noexcept;

```

Expand Down
3 changes: 3 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ if(Threads_FOUND)
set_property(TARGET threaded PROPERTY CXX_STANDARD 17)
set_property(TARGET threaded PROPERTY CXX_STANDARD_REQUIRED ON)
endif(Threads_FOUND)

option(SIMDUTF_BENCHMARK_BASE64 "Whether the base64 benchmarks are included as part of the CMake Build (requires C++17 or better)." ON)

if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
message(STATUS "Not building base64 benchmarks when using clang-cl due to build errors with the aklomp/base64 dependency.")
else()
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/base64/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ CPMAddPackage(


add_executable(benchmark_base64 benchmark_base64.cpp)
message(STATUS "The tools benchmark_base64 require C++17. If your system does not support C++17, please set SIMDUTF_BENCHMARK_BASE64 to OFF.")
set_property(TARGET benchmark_base64 PROPERTY CXX_STANDARD 17)
Comment thread
lemire marked this conversation as resolved.
set_property(TARGET benchmark_base64 PROPERTY CXX_STANDARD_REQUIRED ON)

target_link_libraries(benchmark_base64 PUBLIC simdutf)
target_link_libraries(benchmark_base64 PUBLIC base64)
target_link_libraries(benchmark_base64 PUBLIC simdutf::benchmarks::benchmark)
Loading