simdutf · lemire · Nov 22, 2025 · Nov 21, 2025 · Nov 21, 2025 · Nov 21, 2025
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.15)
 project(simdutf
   DESCRIPTION "Fast Unicode validation, transcoding and processing"
   LANGUAGES CXX
-  VERSION 7.6.0
+  VERSION 7.7.0
 )
 
 include (TestBigEndian)
@@ -23,8 +23,8 @@ include(GNUInstallDirs)
 include(CTest)
 include(cmake/simdutf-flags.cmake)
 
-set(SIMDUTF_LIB_VERSION "28.0.0" CACHE STRING "simdutf library version")
-set(SIMDUTF_LIB_SOVERSION "28" CACHE STRING "simdutf library soversion")
+set(SIMDUTF_LIB_VERSION "29.0.0" CACHE STRING "simdutf library version")
+set(SIMDUTF_LIB_SOVERSION "29" CACHE STRING "simdutf library soversion")
 option(SIMDUTF_TESTS "Whether the tests are included as part of the CMake Build." ON)
 option(SIMDUTF_ATOMIC_BASE64_TESTS "Whether to test the atomic base64 functions. (OFF by default)" OFF)
 option(SIMDUTF_BENCHMARKS "Whether the benchmarks are included as part of the CMake Build." OFF)

diff --git a/Doxyfile b/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = simdutf
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = "7.6.0"
+PROJECT_NUMBER         = "7.7.0"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a

diff --git a/README.md b/README.md
@@ -152,7 +152,7 @@ Linux or macOS users might follow the following instructions if they have a rece
 
 1. Pull the library in a directory
    ```
-   wget https://github.com/simdutf/simdutf/releases/download/v7.6.0/singleheader.zip
+   wget https://github.com/simdutf/simdutf/releases/download/v7.7.0/singleheader.zip
    unzip singleheader.zip
    ```
    You can replace `wget` by `curl -OL https://...` if you prefer.
@@ -223,7 +223,7 @@ Single-header version
 You can create a single-header version of the library where
 all of the code is put into two files (`simdutf.h` and `simdutf.cpp`).
 We publish a zip archive containing these files, e.g., see
-https://github.com/simdutf/simdutf/releases/download/v7.6.0/singleheader.zip
+https://github.com/simdutf/simdutf/releases/download/v7.7.0/singleheader.zip
 
 You may generate it on your own using a Python script.
 
@@ -450,9 +450,19 @@ enum error_code {
              // U+10FFFF,less than or equal than U+7F for ASCII OR less than
              // equal than U+FF for Latin1
   SURROGATE, // The decoded character must be not be in U+D800...DFFF (UTF-8 or
-             // UTF-32) OR a high surrogate must be followed by a low surrogate
+             // UTF-32)
+             // OR
+             // a high surrogate must be followed by a low surrogate
              // and a low surrogate must be preceded by a high surrogate
-             // (UTF-16) OR there must be no surrogate at all (Latin1)
+             // (UTF-16)
+             // OR
+             // there must be no surrogate at all and one is
+             // found (Latin1 functions)
+             // OR
+             // *specifically* for the function
+             // utf8_length_from_utf16_with_replacement, a surrogate (whether
+             // in error or not) has been found (i.e., whether we are in the
+             // Basic Multilingual Plane or not).
   INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid
                             // base64 string. This may include a misplaced padding character ('=').
   BASE64_INPUT_REMAINDER,   // The base64 input terminates with a single
@@ -913,9 +923,16 @@ simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t
  *
  * @param input         the UTF-16 string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
- * @return the number of bytes required to encode the UTF-16LE string as UTF-8
- */
-simdutf_warn_unused size_t utf8_length_from_utf16_with_replacement(const char16_t *input,
+ * @return a result pair struct (of type simdutf::result containing the two fields
+ * error and count) where the count is the number of bytes required to encode the
+ * UTF-16 string as UTF-8, and the error code is either SUCCESS or SURROGATE.
+ * The count is correct regardless of the error field.
+ * When SURROGATE is returned, it does not indicate an error in the case of this function:
+ * it indicates that at least one surrogate has been encountered: the surrogates may be matched
+ * or not (thus this function does not validate). If the returned error code is SUCCESS,
+ * then the input contains no surrogate, is in the Basic Multilingual Plane, and is necessarily valid.
+ */
+simdutf_warn_unused result utf8_length_from_utf16_with_replacement(const char16_t *input,
                                                   size_t length) noexcept;
 /**
  * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
@@ -949,9 +966,15 @@ simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size
  * @param input         the UTF-16LE string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
  * @return the number of bytes required to encode the UTF-16LE string as UTF-8
- */
-
-simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
+ * @return a result pair struct (of type simdutf::result containing the two fields error and count)
+ * where the count is the number of bytes required to encode the UTF-16LE string as UTF-8, and the
+ * error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
+ * When SURROGATE is returned, it does not indicate an error in the case of this function:
+ * it indicates that at least one surrogate has been encountered: the surrogates may be matched
+ * or not (thus this function does not validate). If the returned error code is SUCCESS,
+ * then the input contains no surrogate, is in the Basic Multilingual Plane, and is necessarily valid.
+ */
+simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
     const char16_t *input, size_t length) noexcept;
 
 
@@ -962,10 +985,15 @@ simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
  *
  * @param input         the UTF-16BE string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
- * @return the number of bytes required to encode the UTF-16BE string as UTF-8
- */
-
-simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
+ * @return a result pair struct (of type simdutf::result containing the two fields error and count)
+ * where the count is the number of bytes required to encode the UTF-16BE string as UTF-8, and
+ * the error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
+ * When SURROGATE is returned, it does not indicate an error in the case of this function:
+ * it indicates that at least one surrogate has been encountered: the surrogates may be matched
+ * or not (thus this function does not validate). If the returned error code is SUCCESS,
+ * then the input contains no surrogate, is in the Basic Multilingual Plane, and is necessarily valid.
+ */
+simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
     const char16_t *input, size_t length) noexcept;
 
 /**
@@ -975,9 +1003,9 @@ simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
  *
  * @param input         the UTF-16LE string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
- * @return the number of bytes required to encode the UTF-16LE string as UTF-8
+ * @return a result pair struct (of type simdutf::result containing the two fields error and count) where the count is the number of bytes required to encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
  */
-simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
+simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
     const char16_t *input, size_t length) noexcept;
 
 /**
@@ -987,9 +1015,9 @@ simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
  *
  * @param input         the UTF-16BE string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
- * @return the number of bytes required to encode the UTF-16BE string as UTF-8
+ * @return a result pair struct (of type simdutf::result containing the two fields error and count) where the count is the number of bytes required to encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or SURROGATE. The count is correct regardless of the error field.
  */
-simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
+simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
     const char16_t *input, size_t length) noexcept;
 
 

diff --git a/benchmarks/src/benchmark.cpp b/benchmarks/src/benchmark.cpp
@@ -215,15 +215,15 @@ Benchmark::Benchmark(std::vector<input::Testcase> &&testcases)
   register_function("convert_valid_utf16le_to_latin1",
                     &Benchmark::run_convert_valid_utf16le_to_latin1,
                     simdutf::encoding_type::UTF16_LE);
-#if SIMDUTF_BIG_ENDIAN
+#if SIMDUTF_IS_BIG_ENDIAN
   register_function("convert_utf16_to_utf8_safe",
                     &Benchmark::run_convert_utf16_to_utf8_safe,
                     simdutf::encoding_type::UTF16_BE);
 #else
   register_function("convert_utf16_to_utf8_safe",
                     &Benchmark::run_convert_utf16_to_utf8_safe,
                     simdutf::encoding_type::UTF16_LE);
-#endif // SIMDUTF_BIG_ENDIAN
+#endif // SIMDUTF_IS_BIG_ENDIAN
   register_function("convert_utf16le_to_utf8",
                     &Benchmark::run_convert_utf16le_to_utf8,
                     simdutf::encoding_type::UTF16_LE);
@@ -864,7 +864,9 @@ void Benchmark::run_utf8_length_from_utf16le_with_replacement(
   volatile size_t sink{0};
 
   auto proc = [&implementation, data, size, &sink]() {
-    sink = implementation.utf8_length_from_utf16le_with_replacement(data, size);
+    auto r =
+        implementation.utf8_length_from_utf16le_with_replacement(data, size);
+    sink = r.count;
   };
   count_events(proc, iterations); // warming up!
   const auto result = count_events(proc, iterations);
@@ -878,7 +880,9 @@ void Benchmark::run_utf8_length_from_utf16be_with_replacement(
   volatile size_t sink{0};
 
   auto proc = [&implementation, data, size, &sink]() {
-    sink = implementation.utf8_length_from_utf16be_with_replacement(data, size);
+    auto r =
+        implementation.utf8_length_from_utf16be_with_replacement(data, size);
+    sink = r.count;
   };
   count_events(proc, iterations); // warming up!
   const auto result = count_events(proc, iterations);

diff --git a/include/simdutf/error.h b/include/simdutf/error.h
@@ -17,9 +17,19 @@ enum error_code {
              // U+10FFFF,less than or equal than U+7F for ASCII OR less than
              // equal than U+FF for Latin1
   SURROGATE, // The decoded character must be not be in U+D800...DFFF (UTF-8 or
-             // UTF-32) OR a high surrogate must be followed by a low surrogate
+             // UTF-32)
+             // OR
+             // a high surrogate must be followed by a low surrogate
              // and a low surrogate must be preceded by a high surrogate
-             // (UTF-16) OR there must be no surrogate at all (Latin1)
+             // (UTF-16)
+             // OR
+             // there must be no surrogate at all and one is
+             // found (Latin1 functions)
+             // OR
+             // *specifically* for the function
+             // utf8_length_from_utf16_with_replacement, a surrogate (whether
+             // in error or not) has been found (i.e., whether we are in the
+             // Basic Multilingual Plane or not).
   INVALID_BASE64_CHARACTER, // Found a character that cannot be part of a valid
                             // base64 string. This may include a misplaced
                             // padding character ('=').

diff --git a/include/simdutf/implementation.h b/include/simdutf/implementation.h
@@ -779,13 +779,20 @@ convert_utf8_to_utf16(const detail::input_span_of_byte_like auto &input,
  *
  * @param input         the UTF-16LE string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
- * @return the number of bytes required to encode the UTF-16LE string as UTF-8
- */
-
-simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) where the count is the number of bytes required to
+ * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS or
+ * SURROGATE. The count is correct regardless of the error field.
+ * When SURROGATE is returned, it does not indicate an error in the case of this
+ * function: it indicates that at least one surrogate has been encountered: the
+ * surrogates may be matched or not (thus this function does not validate). If
+ * the returned error code is SUCCESS, then the input contains no surrogate, is
+ * in the Basic Multilingual Plane, and is necessarily valid.
+ */
+simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
     const char16_t *input, size_t length) noexcept;
   #if SIMDUTF_SPAN
-simdutf_really_inline simdutf_warn_unused size_t
+simdutf_really_inline simdutf_warn_unused result
 utf8_length_from_utf16le_with_replacement(
     std::span<const char16_t> valid_utf16_input) noexcept {
   return utf8_length_from_utf16le_with_replacement(valid_utf16_input.data(),
@@ -800,13 +807,20 @@ utf8_length_from_utf16le_with_replacement(
  *
  * @param input         the UTF-16BE string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
- * @return the number of bytes required to encode the UTF-16BE string as UTF-8
- */
-
-simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) where the count is the number of bytes required to
+ * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS or
+ * SURROGATE. The count is correct regardless of the error field.
+ * When SURROGATE is returned, it does not indicate an error in the case of this
+ * function: it indicates that at least one surrogate has been encountered: the
+ * surrogates may be matched or not (thus this function does not validate). If
+ * the returned error code is SUCCESS, then the input contains no surrogate, is
+ * in the Basic Multilingual Plane, and is necessarily valid.
+ */
+simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
     const char16_t *input, size_t length) noexcept;
   #if SIMDUTF_SPAN
-simdutf_really_inline simdutf_warn_unused size_t
+simdutf_really_inline simdutf_warn_unused result
 utf8_length_from_utf16be_with_replacement(
     std::span<const char16_t> valid_utf16_input) noexcept {
   return utf8_length_from_utf16be_with_replacement(valid_utf16_input.data(),
@@ -2120,12 +2134,20 @@ utf8_length_from_utf16(std::span<const char16_t> valid_utf16_input) noexcept {
  *
  * @param input         the UTF-16 string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
- * @return the number of bytes required to encode the UTF-16LE string as UTF-8
- */
-simdutf_warn_unused size_t utf8_length_from_utf16_with_replacement(
+ * @return a result pair struct (of type simdutf::result containing the two
+ * fields error and count) where the count is the number of bytes required to
+ * encode the UTF-16 string as UTF-8, and the error code is either SUCCESS or
+ * SURROGATE. The count is correct regardless of the error field.
+ * When SURROGATE is returned, it does not indicate an error in the case of this
+ * function: it indicates that at least one surrogate has been encountered: the
+ * surrogates may be matched or not (thus this function does not validate). If
+ * the returned error code is SUCCESS, then the input contains no surrogate, is
+ * in the Basic Multilingual Plane, and is necessarily valid.
+ */
+simdutf_warn_unused result utf8_length_from_utf16_with_replacement(
     const char16_t *input, size_t length) noexcept;
   #if SIMDUTF_SPAN
-simdutf_really_inline simdutf_warn_unused size_t
+simdutf_really_inline simdutf_warn_unused result
 utf8_length_from_utf16_with_replacement(
     std::span<const char16_t> valid_utf16_input) noexcept {
   return utf8_length_from_utf16_with_replacement(valid_utf16_input.data(),
@@ -4152,9 +4174,18 @@ class implementation {
    * @param input         the UTF-16LE string to convert
    * @param length        the length of the string in 2-byte code units
    * (char16_t)
-   * @return the number of bytes required to encode the UTF-16LE string as UTF-8
+   * @return a result pair struct (of type simdutf::result containing the two
+   * fields error and count) where the count is the number of bytes required to
+   * encode the UTF-16LE string as UTF-8, and the error code is either SUCCESS
+   * or SURROGATE. The count is correct regardless of the error field.
+   * When SURROGATE is returned, it does not indicate an error in the case of
+   * this function: it indicates that at least one surrogate has been
+   * encountered: the surrogates may be matched or not (thus this function does
+   * not validate). If the returned error code is SUCCESS, then the input
+   * contains no surrogate, is in the Basic Multilingual Plane, and is
+   * necessarily valid.
    */
-  virtual simdutf_warn_unused size_t utf8_length_from_utf16le_with_replacement(
+  virtual simdutf_warn_unused result utf8_length_from_utf16le_with_replacement(
       const char16_t *input, size_t length) const noexcept = 0;
 
   /**
@@ -4165,9 +4196,18 @@ class implementation {
    * @param input         the UTF-16BE string to convert
    * @param length        the length of the string in 2-byte code units
    * (char16_t)
-   * @return the number of bytes required to encode the UTF-16BE string as UTF-8
+   * @return a result pair struct (of type simdutf::result containing the two
+   * fields error and count) where the count is the number of bytes required to
+   * encode the UTF-16BE string as UTF-8, and the error code is either SUCCESS
+   * or SURROGATE. The count is correct regardless of the error field.
+   * When SURROGATE is returned, it does not indicate an error in the case of
+   * this function: it indicates that at least one surrogate has been
+   * encountered: the surrogates may be matched or not (thus this function does
+   * not validate). If the returned error code is SUCCESS, then the input
+   * contains no surrogate, is in the Basic Multilingual Plane, and is
+   * necessarily valid.
    */
-  virtual simdutf_warn_unused size_t utf8_length_from_utf16be_with_replacement(
+  virtual simdutf_warn_unused result utf8_length_from_utf16be_with_replacement(
       const char16_t *input, size_t length) const noexcept = 0;
 
 #endif // SIMDUTF_FEATURE_UTF8 && SIMDUTF_FEATURE_UTF16

diff --git a/include/simdutf/simdutf_version.h b/include/simdutf/simdutf_version.h
@@ -4,7 +4,7 @@
 #define SIMDUTF_SIMDUTF_VERSION_H
 
 /** The version of simdutf being used (major.minor.revision) */
-#define SIMDUTF_VERSION "7.6.0"
+#define SIMDUTF_VERSION "7.7.0"
 
 namespace simdutf {
 enum {
@@ -15,7 +15,7 @@ enum {
   /**
    * The minor version (major.MINOR.revision) of simdutf being used.
    */
-  SIMDUTF_VERSION_MINOR = 6,
+  SIMDUTF_VERSION_MINOR = 7,
   /**
    * The revision (major.minor.REVISION) of simdutf being used.
    */