Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.15)
project(simdutf
DESCRIPTION "Fast Unicode validation, transcoding and processing"
LANGUAGES CXX
VERSION 7.6.0
VERSION 7.7.0
)

include (TestBigEndian)
Expand All @@ -23,8 +23,8 @@ include(GNUInstallDirs)
include(CTest)
include(cmake/simdutf-flags.cmake)

set(SIMDUTF_LIB_VERSION "28.0.0" CACHE STRING "simdutf library version")
set(SIMDUTF_LIB_SOVERSION "28" CACHE STRING "simdutf library soversion")
set(SIMDUTF_LIB_VERSION "29.0.0" CACHE STRING "simdutf library version")
set(SIMDUTF_LIB_SOVERSION "29" CACHE STRING "simdutf library soversion")
option(SIMDUTF_TESTS "Whether the tests are included as part of the CMake Build." ON)
option(SIMDUTF_ATOMIC_BASE64_TESTS "Whether to test the atomic base64 functions. (OFF by default)" OFF)
option(SIMDUTF_BENCHMARKS "Whether the benchmarks are included as part of the CMake Build." OFF)
Expand Down
2 changes: 1 addition & 1 deletion Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ PROJECT_NAME = simdutf
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = "7.6.0"
PROJECT_NUMBER = "7.7.0"

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
Expand Down
64 changes: 46 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ Linux or macOS users might follow the following instructions if they have a rece

1. Pull the library in a directory
```
wget https://github.com/simdutf/simdutf/releases/download/v7.6.0/singleheader.zip
wget https://github.com/simdutf/simdutf/releases/download/v7.7.0/singleheader.zip
unzip singleheader.zip
```
You can replace `wget` by `curl -OL https://...` if you prefer.
Expand Down Expand Up @@ -223,7 +223,7 @@ Single-header version
You can create a single-header version of the library where
all of the code is put into two files (`simdutf.h` and `simdutf.cpp`).
We publish a zip archive containing these files, e.g., see
https://github.com/simdutf/simdutf/releases/download/v7.6.0/singleheader.zip
https://github.com/simdutf/simdutf/releases/download/v7.7.0/singleheader.zip

You may generate it on your own using a Python script.

Expand Down Expand Up @@ -450,9 +450,19 @@ enum error_code {
// U+10FFFF, less than or equal to U+7F for ASCII OR less than or
// equal to U+FF for Latin1
SURROGATE, // The decoded character must not be in U+D800...DFFF (UTF-8 or
// UTF-32) OR a high surrogate must be followed by a low surrogate
// UTF-32)
// OR
// a high surrogate must be followed by a low surrogate
// and a low surrogate must be preceded by a high surrogate
// (UTF-16) OR there must be no surrogate at all (Latin1)
// (UTF-16)
// OR
// there must be no surrogate at all and one is
// found (Latin1 functions)
// OR
// *specifically* for the function
// utf8_length_from_utf16_with_replacement, a surrogate (whether
// in error or not) has been found (i.e., the code tells the caller
// whether the input lies entirely within the Basic Multilingual Plane).
INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid
// base64 string. This may include a misplaced padding character ('=').
BASE64_INPUT_REMAINDER, // The base64 input terminates with a single
Expand Down Expand Up @@ -913,9 +923,16 @@ simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t
*
* @param input the UTF-16 string to convert
* @param length the length of the string in 2-byte code units (char16_t)
* @return the number of bytes required to encode the UTF-16LE string as UTF-8
*/
simdutf_warn_unused size_t utf8_length_from_utf16_with_replacement(const char16_t *input,
* @return the number of bytes required to encode the UTF-16 string as UTF-8
* @return a result pair struct (of type simdutf::result containing the two fields error and count)
* where the count is the number of bytes required to encode the UTF-16 string as UTF-8, and the
* error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
* When SURROGATE is returned, it does not indicate an error in the case of this function:
* it indicates that at least one surrogate has been encountered: the surrogates may be matched
* or not (thus this function does not validate). If the returned error code is SUCCESS,
* then the input contains no surrogate, is in the Basic Multilingual Plane, and is necessarily valid.
*/
simdutf_warn_unused result utf8_length_from_utf16_with_replacement(const char16_t *input,
size_t length) noexcept;
/**
* Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
Expand Down Expand Up @@ -949,9 +966,15 @@ simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size
* @param input the UTF-16LE string to convert
* @param length the length of the string in 2-byte code units (char16_t)
* @return the number of bytes required to encode the UTF-16LE string as UTF-8
*/

simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
* @return a result pair struct (of type simdutf::result containing the two fields error and count)
* where the count is the number of bytes required to encode the UTF-16LE string as UTF-8, and the
* error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
* When SURROGATE is returned, it does not indicate an error in the case of this function:
* it indicates that at least one surrogate has been encountered: the surrogates may be matched
* or not (thus this function does not validate). If the returned error code is SUCCESS,
* then the input contains no surrogate, is in the Basic Multilingual Plane, and is necessarily valid.
*/
simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
const char16_t *input, size_t length) noexcept;


Expand All @@ -962,10 +985,15 @@ simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
*
* @param input the UTF-16BE string to convert
* @param length the length of the string in 2-byte code units (char16_t)
* @return the number of bytes required to encode the UTF-16BE string as UTF-8
*/

simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
* @return a result pair struct (of type simdutf::result containing the two fields error and count)
* where the count is the number of bytes required to encode the UTF-16BE string as UTF-8, and
* the error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
* When SURROGATE is returned, it does not indicate an error in the case of this function:
* it indicates that at least one surrogate has been encountered: the surrogates may be matched
* or not (thus this function does not validate). If the returned error code is SUCCESS,
* then the input contains no surrogate, is in the Basic Multilingual Plane, and is necessarily valid.
*/
simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
const char16_t *input, size_t length) noexcept;

/**
Expand All @@ -975,9 +1003,9 @@ simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
*
* @param input the UTF-16LE string to convert
* @param length the length of the string in 2-byte code units (char16_t)
* @return the number of bytes required to encode the UTF-16LE string as UTF-8
* @return a result pair struct (of type simdutf::result containing the two fields error and count) where the count is the number of bytes required to encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
*/
simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
const char16_t *input, size_t length) noexcept;

/**
Expand All @@ -987,9 +1015,9 @@ simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
*
* @param input the UTF-16BE string to convert
* @param length the length of the string in 2-byte code units (char16_t)
* @return the number of bytes required to encode the UTF-16BE string as UTF-8
* @return a result pair struct (of type simdutf::result containing the two fields error and count) where the count is the number of bytes required to encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
*/
simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
const char16_t *input, size_t length) noexcept;


Expand Down
12 changes: 8 additions & 4 deletions benchmarks/src/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,15 +215,15 @@ Benchmark::Benchmark(std::vector<input::Testcase> &&testcases)
register_function("convert_valid_utf16le_to_latin1",
&Benchmark::run_convert_valid_utf16le_to_latin1,
simdutf::encoding_type::UTF16_LE);
#if SIMDUTF_BIG_ENDIAN
#if SIMDUTF_IS_BIG_ENDIAN
register_function("convert_utf16_to_utf8_safe",
&Benchmark::run_convert_utf16_to_utf8_safe,
simdutf::encoding_type::UTF16_BE);
#else
register_function("convert_utf16_to_utf8_safe",
&Benchmark::run_convert_utf16_to_utf8_safe,
simdutf::encoding_type::UTF16_LE);
#endif // SIMDUTF_BIG_ENDIAN
#endif // SIMDUTF_IS_BIG_ENDIAN
register_function("convert_utf16le_to_utf8",
&Benchmark::run_convert_utf16le_to_utf8,
simdutf::encoding_type::UTF16_LE);
Expand Down Expand Up @@ -864,7 +864,9 @@ void Benchmark::run_utf8_length_from_utf16le_with_replacement(
volatile size_t sink{0};

auto proc = [&implementation, data, size, &sink]() {
sink = implementation.utf8_length_from_utf16le_with_replacement(data, size);
auto r =
implementation.utf8_length_from_utf16le_with_replacement(data, size);
sink = r.count;
};
count_events(proc, iterations); // warming up!
const auto result = count_events(proc, iterations);
Expand All @@ -878,7 +880,9 @@ void Benchmark::run_utf8_length_from_utf16be_with_replacement(
volatile size_t sink{0};

auto proc = [&implementation, data, size, &sink]() {
sink = implementation.utf8_length_from_utf16be_with_replacement(data, size);
auto r =
implementation.utf8_length_from_utf16be_with_replacement(data, size);
sink = r.count;
};
count_events(proc, iterations); // warming up!
const auto result = count_events(proc, iterations);
Expand Down
14 changes: 12 additions & 2 deletions include/simdutf/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,19 @@ enum error_code {
// U+10FFFF, less than or equal to U+7F for ASCII OR less than or
// equal to U+FF for Latin1
SURROGATE, // The decoded character must not be in U+D800...DFFF (UTF-8 or
// UTF-32) OR a high surrogate must be followed by a low surrogate
// UTF-32)
// OR
// a high surrogate must be followed by a low surrogate
// and a low surrogate must be preceded by a high surrogate
// (UTF-16) OR there must be no surrogate at all (Latin1)
// (UTF-16)
// OR
// there must be no surrogate at all and one is
// found (Latin1 functions)
// OR
// *specifically* for the function
// utf8_length_from_utf16_with_replacement, a surrogate (whether
// in error or not) has been found (i.e., the code tells the caller
// whether the input lies entirely within the Basic Multilingual Plane).
INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid
// base64 string. This may include a misplaced
// padding character ('=').
Expand Down
76 changes: 58 additions & 18 deletions include/simdutf/implementation.h
Original file line number Diff line number Diff line change
Expand Up @@ -779,13 +779,20 @@ convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
*
* @param input the UTF-16LE string to convert
* @param length the length of the string in 2-byte code units (char16_t)
* @return the number of bytes required to encode the UTF-16LE string as UTF-8
*/

simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
* @return a result pair struct (of type simdutf::result containing the two
* fields error and count) where the count is the number of bytes required to
* encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or
* SURROGATE. The count is correct regardless of the error field.
* When SURROGATE is returned, it does not indicate an error in the case of this
* function: it indicates that at least one surrogate has been encountered: the
* surrogates may be matched or not (thus this function does not validate). If
* the returned error code is SUCCESS, then the input contains no surrogate, is
* in the Basic Multilingual Plane, and is necessarily valid.
*/
simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
const char16_t *input, size_t length) noexcept;
#if SIMDUTF_SPAN
simdutf_really_inline simdutf_warn_unused size_t
simdutf_really_inline simdutf_warn_unused result
utf8_length_from_utf16le_with_replacement(
std::span<const char16_t> valid_utf16_input) noexcept {
return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(),
Expand All @@ -800,13 +807,20 @@ utf8_length_from_utf16le_with_replacement(
*
* @param input the UTF-16BE string to convert
* @param length the length of the string in 2-byte code units (char16_t)
* @return the number of bytes required to encode the UTF-16BE string as UTF-8
*/

simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
* @return a result pair struct (of type simdutf::result containing the two
* fields error and count) where the count is the number of bytes required to
* encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or
* SURROGATE. The count is correct regardless of the error field.
* When SURROGATE is returned, it does not indicate an error in the case of this
* function: it indicates that at least one surrogate has been encountered: the
* surrogates may be matched or not (thus this function does not validate). If
* the returned error code is SUCCESS, then the input contains no surrogate, is
* in the Basic Multilingual Plane, and is necessarily valid.
*/
simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
const char16_t *input, size_t length) noexcept;
#if SIMDUTF_SPAN
simdutf_really_inline simdutf_warn_unused size_t
simdutf_really_inline simdutf_warn_unused result
utf8_length_from_utf16be_with_replacement(
std::span<const char16_t> valid_utf16_input) noexcept {
return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(),
Expand Down Expand Up @@ -2120,12 +2134,20 @@ utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
*
* @param input the UTF-16 string to convert
* @param length the length of the string in 2-byte code units (char16_t)
* @return the number of bytes required to encode the UTF-16LE string as UTF-8
*/
simdutf_warn_unused size_t utf8_length_from_utf16_with_replacement(
* @return a result pair struct (of type simdutf::result containing the two
* fields error and count) where the count is the number of bytes required to
* encode the UTF-16 string as UTF-8, and the error code is either SUCCESS or
* SURROGATE. The count is correct regardless of the error field.
* When SURROGATE is returned, it does not indicate an error in the case of this
* function: it indicates that at least one surrogate has been encountered: the
* surrogates may be matched or not (thus this function does not validate). If
* the returned error code is SUCCESS, then the input contains no surrogate, is
* in the Basic Multilingual Plane, and is necessarily valid.
*/
simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
const char16_t *input, size_t length) noexcept;
#if SIMDUTF_SPAN
simdutf_really_inline simdutf_warn_unused size_t
simdutf_really_inline simdutf_warn_unused result
utf8_length_from_utf16_with_replacement(
std::span<const char16_t> valid_utf16_input) noexcept {
return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
Expand Down Expand Up @@ -4152,9 +4174,18 @@ class implementation {
* @param input the UTF-16LE string to convert
* @param length the length of the string in 2-byte code units
* (char16_t)
* @return the number of bytes required to encode the UTF-16LE string as UTF-8
* @return a result pair struct (of type simdutf::result containing the two
* fields error and count) where the count is the number of bytes required to
* encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS
* or SURROGATE. The count is correct regardless of the error field.
* When SURROGATE is returned, it does not indicate an error in the case of
* this function: it indicates that at least one surrogate has been
* encountered: the surrogates may be matched or not (thus this function does
* not validate). If the returned error code is SUCCESS, then the input
* contains no surrogate, is in the Basic Multilingual Plane, and is
* necessarily valid.
*/
virtual simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
virtual simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
const char16_t *input, size_t length) const noexcept = 0;

/**
Expand All @@ -4165,9 +4196,18 @@ class implementation {
* @param input the UTF-16BE string to convert
* @param length the length of the string in 2-byte code units
* (char16_t)
* @return the number of bytes required to encode the UTF-16BE string as UTF-8
* @return a result pair struct (of type simdutf::result containing the two
* fields error and count) where the count is the number of bytes required to
* encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS
* or SURROGATE. The count is correct regardless of the error field.
* When SURROGATE is returned, it does not indicate an error in the case of
* this function: it indicates that at least one surrogate has been
* encountered: the surrogates may be matched or not (thus this function does
* not validate). If the returned error code is SUCCESS, then the input
* contains no surrogate, is in the Basic Multilingual Plane, and is
* necessarily valid.
*/
virtual simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
virtual simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
const char16_t *input, size_t length) const noexcept = 0;

#endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16
Expand Down
4 changes: 2 additions & 2 deletions include/simdutf/simdutf_version.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#define SIMDUTF_SIMDUTF_VERSION_H

/** The version of simdutf being used (major.minor.revision) */
#define SIMDUTF_VERSION "7.6.0"
#define SIMDUTF_VERSION "7.7.0"

namespace simdutf {
enum {
Expand All @@ -15,7 +15,7 @@ enum {
/**
* The minor version (major.MINOR.revision) of simdutf being used.
*/
SIMDUTF_VERSION_MINOR = 6,
SIMDUTF_VERSION_MINOR = 7,
/**
* The revision (major.minor.REVISION) of simdutf being used.
*/
Expand Down
Loading
Loading