Implement utf8_to_utf16 using utf8_decode

2025-07-29 18:27:40 +02:00 · 2019-12-15 07:45:57 -08:00
parent 0012917f69
commit ec2463c905
3 changed files with 102 additions and 119 deletions
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@ -1214,6 +1214,61 @@ int snprintf_float(T value, int precision, float_specs specs,
    return exp - fraction_size;
  }
 }
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+/* Decode the next character, c, from buf, reporting errors in e.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three bytes of zero padding
+ * following the end of the data stream.
+ *
+ * Errors are reported in e, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ *
+ * The function returns a pointer to the next character. When an error
+ * occurs, this pointer will be a guess that depends on the particular
+ * error, but it will always advance at least one byte.
+ */
+FMT_FUNC const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
+  static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+                                 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
+  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
+  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
+  static const int shiftc[] = {0, 18, 12, 6, 0};
+  static const int shifte[] = {0, 6, 4, 2, 0};
+
+  auto s = reinterpret_cast<const unsigned char*>(buf);
+  int len = lengths[s[0] >> 3];
+
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  const char* next = buf + len + !len;
+
+  // Assume a four-byte character and load four bytes. Unused bits are
+  // shifted out.
+  *c = uint32_t(s[0] & masks[len]) << 18;
+  *c |= uint32_t(s[1] & 0x3f) << 12;
+  *c |= uint32_t(s[2] & 0x3f) << 6;
+  *c |= uint32_t(s[3] & 0x3f) << 0;
+  *c >>= shiftc[len];
+
+  // Accumulate the various error conditions.
+  *e = (*c < mins[len]) << 6;       // non-canonical encoding
+  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
+  *e |= (*c > 0x10FFFF) << 8;       // out of range?
+  *e |= (s[1] & 0xc0) >> 2;
+  *e |= (s[2] & 0xc0) >> 4;
+  *e |= (s[3]) >> 6;
+  *e ^= 0x2a;  // top two bits of each tail byte correct?
+  *e >>= shifte[len];
+
+  return next;
+}
 }  // namespace internal

 template <> struct formatter<internal::bigint> {
@ -1240,87 +1295,24 @@ template <> struct formatter<internal::bigint> {
  }
 };

-// A public domain branchless UTF-8 decoder:
-// https://github.com/skeeto/branchless-utf8
-/* Decode the next character, C, from BUF, reporting errors in E.
- *
- * Since this is a branchless decoder, four bytes will be read from the
- * buffer regardless of the actual length of the next character. This
- * means the buffer _must_ have at least three bytes of zero padding
- * following the end of the data stream.
- *
- * Errors are reported in E, which will be non-zero if the parsed
- * character was somehow invalid: invalid byte sequence, non-canonical
- * encoding, or a surrogate half.
- *
- * The function returns a pointer to the next character. When an error
- * occurs, this pointer will be a guess that depends on the particular
- * error, but it will always advance at least one byte.
- */
-static void* utf8_decode(void* buf, uint32_t* c, int* e) {
-  static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                                 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-                                 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
-  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
-  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
-  static const int shiftc[] = {0, 18, 12, 6, 0};
-  static const int shifte[] = {0, 6, 4, 2, 0};
-
-  auto s = reinterpret_cast<unsigned char*>(buf);
-  int len = lengths[s[0] >> 3];
-
-  /* Compute the pointer to the next character early so that the next
-   * iteration can start working on the next character. Neither Clang
-   * nor GCC figure out this reordering on their own.
-   */
-  unsigned char* next = s + len + !len;
-
-  /* Assume a four-byte character and load four bytes. Unused bits are
-   * shifted out.
-   */
-  *c = (uint32_t)(s[0] & masks[len]) << 18;
-  *c |= (uint32_t)(s[1] & 0x3f) << 12;
-  *c |= (uint32_t)(s[2] & 0x3f) << 6;
-  *c |= (uint32_t)(s[3] & 0x3f) << 0;
-  *c >>= shiftc[len];
-
-  /* Accumulate the various error conditions. */
-  *e = (*c < mins[len]) << 6;       // non-canonical encoding
-  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
-  *e |= (*c > 0x10FFFF) << 8;       // out of range?
-  *e |= (s[1] & 0xc0) >> 2;
-  *e |= (s[2] & 0xc0) >> 4;
-  *e |= (s[3]) >> 6;
-  *e ^= 0x2a;  // top two bits of each tail byte correct?
-  *e >>= shifte[len];
-
-  return next;
+FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
+  for (auto p = s.data(), end = p + s.size(); p != end;) {
+    auto cp = uint32_t();
+    auto error = 0;
+    p = utf8_decode(p, &cp, &error);
+    if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
+    if (cp <= 0xFFFF) {
+      buffer_.push_back(static_cast<wchar_t>(cp));
+    } else {
+      cp -= 0x10000;
+      buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
+      buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
+    }
+  }
+  buffer_.push_back(0);
 }

 #if FMT_USE_WINDOWS_H
-
-FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
-  static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
-  if (s.size() > INT_MAX)
-    FMT_THROW(windows_error(ERROR_INVALID_PARAMETER, ERROR_MSG));
-  int s_size = static_cast<int>(s.size());
-  if (s_size == 0) {
-    // MultiByteToWideChar does not support zero length, handle separately.
-    buffer_.resize(1);
-    buffer_[0] = 0;
-    return;
-  }
-
-  int length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(),
-                                   s_size, nullptr, 0);
-  if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
-  buffer_.resize(length + 1);
-  length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size,
-                               &buffer_[0], length);
-  if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
-  buffer_[length] = 0;
-}
-
 FMT_FUNC internal::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
  if (int error_code = convert(s)) {
    FMT_THROW(windows_error(error_code,
@ -1389,7 +1381,6 @@ FMT_FUNC void internal::format_windows_error(internal::buffer<char>& out,
  FMT_CATCH(...) {}
  format_error_code(out, error_code, message);
 }
-
 #endif  // FMT_USE_WINDOWS_H

 FMT_FUNC void format_system_error(internal::buffer<char>& out, int error_code,
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@ -940,29 +940,28 @@ inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
  return internal::copy_str<Char>(buffer, buffer + num_digits, out);
 }

-#ifndef _WIN32
-#  define FMT_USE_WINDOWS_H 0
-#elif !defined(FMT_USE_WINDOWS_H)
-#  define FMT_USE_WINDOWS_H 1
-#endif
-
-// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
-// All the functionality that relies on it will be disabled too.
-#if FMT_USE_WINDOWS_H
 // A converter from UTF-8 to UTF-16.
-// It is only provided for Windows since other systems support UTF-8 natively.
 class utf8_to_utf16 {
 private:
  wmemory_buffer buffer_;

 public:
  FMT_API explicit utf8_to_utf16(string_view s);
-  operator wstring_view() const { return wstring_view(&buffer_[0], size()); }
+  operator wstring_view() const { return {&buffer_[0], size()}; }
  size_t size() const { return buffer_.size() - 1; }
  const wchar_t* c_str() const { return &buffer_[0]; }
-  std::wstring str() const { return std::wstring(&buffer_[0], size()); }
+  std::wstring str() const { return {&buffer_[0], size()}; }
 };

+// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
+// All the functionality that relies on it will be disabled too.
+#ifndef _WIN32
+#  define FMT_USE_WINDOWS_H 0
+#elif !defined(FMT_USE_WINDOWS_H)
+#  define FMT_USE_WINDOWS_H 1
+#endif
+
+#if FMT_USE_WINDOWS_H
 // A converter from UTF-16 to UTF-8.
 // It is only provided for Windows since other systems support UTF-8 natively.
 class utf16_to_utf8 {
--- a/test/format-test.cc
+++ b/test/format-test.cc
@ -400,6 +400,23 @@ TEST(MemoryBufferTest, ExceptionInDeallocate) {
  EXPECT_CALL(alloc, deallocate(&mem2[0], 2 * size));
 }

+TEST(UtilTest, UTF8ToUTF16) {
+  fmt::internal::utf8_to_utf16 u("лошадка");
+  EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", u.str());
+  EXPECT_EQ(7, u.size());
+  // U+10437 { DESERET SMALL LETTER YEE }
+  EXPECT_EQ(L"\xD801\xDC37", fmt::internal::utf8_to_utf16("𐐷").str());
+  EXPECT_THROW_MSG(fmt::internal::utf8_to_utf16("\xc3\x28"), std::runtime_error,
+                   "invalid utf8");
+}
+
+TEST(UtilTest, UTF8ToUTF16EmptyString) {
+  std::string s = "";
+  fmt::internal::utf8_to_utf16 u(s.c_str());
+  EXPECT_EQ(L"", u.str());
+  EXPECT_EQ(s.size(), u.size());
+}
+
 #ifdef _WIN32
 TEST(UtilTest, UTF16ToUTF8) {
  std::string s = "ёжик";
@ -415,20 +432,6 @@ TEST(UtilTest, UTF16ToUTF8EmptyString) {
  EXPECT_EQ(s.size(), u.size());
 }

-TEST(UtilTest, UTF8ToUTF16) {
-  std::string s = "лошадка";
-  fmt::internal::utf8_to_utf16 u(s.c_str());
-  EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", u.str());
-  EXPECT_EQ(7, u.size());
-}
-
-TEST(UtilTest, UTF8ToUTF16EmptyString) {
-  std::string s = "";
-  fmt::internal::utf8_to_utf16 u(s.c_str());
-  EXPECT_EQ(L"", u.str());
-  EXPECT_EQ(s.size(), u.size());
-}
-
 template <typename Converter, typename Char>
 void check_utf_conversion_error(
    const char* message,
@ -450,13 +453,6 @@ TEST(UtilTest, UTF16ToUTF8Error) {
      "cannot convert string from UTF-16 to UTF-8");
 }

-TEST(UtilTest, UTF8ToUTF16Error) {
-  const char* message = "cannot convert string from UTF-8 to UTF-16";
-  check_utf_conversion_error<fmt::internal::utf8_to_utf16, char>(message);
-  check_utf_conversion_error<fmt::internal::utf8_to_utf16, char>(
-      message, fmt::string_view("foo", INT_MAX + 1u));
-}
-
 TEST(UtilTest, UTF16ToUTF8Convert) {
  fmt::internal::utf16_to_utf8 u;
  EXPECT_EQ(ERROR_INVALID_PARAMETER, u.convert(fmt::wstring_view(0, 1)));
@ -1237,8 +1233,7 @@ TEST(FormatterTest, Precision) {
                   format_error,
                   "precision not allowed for this argument type");
  EXPECT_THROW_MSG(format("{:.{}e}", 42.0, fmt::internal::max_value<int>()),
-                   format_error,
-                   "number is too big");
+                   format_error, "number is too big");

  EXPECT_EQ("st", format("{0:.2}", "str"));
 }
@ -1875,8 +1870,8 @@ TEST(FormatTest, Dynamic) {
  args.emplace_back(fmt::internal::make_arg<ctx>(1.5f));

  std::string result = fmt::vformat(
-      "{} and {} and {}", fmt::basic_format_args<ctx>(
-                              args.data(), static_cast<int>(args.size())));
+      "{} and {} and {}",
+      fmt::basic_format_args<ctx>(args.data(), static_cast<int>(args.size())));

  EXPECT_EQ("42 and abc1 and 1.5", result);
 }
@ -2266,9 +2261,7 @@ struct test_format_specs_handler {

  FMT_CONSTEXPR void on_precision(int p) { precision = p; }
  FMT_CONSTEXPR void on_dynamic_precision(fmt::internal::auto_id) {}
-  FMT_CONSTEXPR void on_dynamic_precision(int index) {
-    precision_ref = index;
-  }
+  FMT_CONSTEXPR void on_dynamic_precision(int index) { precision_ref = index; }
  FMT_CONSTEXPR void on_dynamic_precision(string_view) {}

  FMT_CONSTEXPR void end_precision() {}