Implement utf8_to_utf16 using utf8_decode

2025-07-31 11:17:35 +02:00 · 2019-12-15 07:45:57 -08:00
parent 0012917f69
commit ec2463c905
3 changed files with 102 additions and 119 deletions
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@ -1214,6 +1214,61 @@ int snprintf_float(T value, int precision, float_specs specs,
    return exp - fraction_size;
  }
 }
 // A public domain branchless UTF-8 decoder by Christopher Wellons:
 // https://github.com/skeeto/branchless-utf8
 /* Decode the next character, c, from buf, reporting errors in e.
 *
 * Since this is a branchless decoder, four bytes will be read from the
 * buffer regardless of the actual length of the next character. This
 * means the buffer _must_ have at least three bytes of zero padding
 * following the end of the data stream.
 *
 * Errors are reported in e, which will be non-zero if the parsed
 * character was somehow invalid: invalid byte sequence, non-canonical
 * encoding, or a surrogate half.
 *
 * The function returns a pointer to the next character. When an error
 * occurs, this pointer will be a guess that depends on the particular
 * error, but it will always advance at least one byte.
 */
 FMT_FUNC const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
  // Lookup tables. `lengths` is indexed by the top five bits of the lead
  // byte (0 marks an invalid lead byte); the remaining tables are indexed
  // by the resulting sequence length.
  static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                                 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
  static const int shiftc[] = {0, 18, 12, 6, 0};
  static const int shifte[] = {0, 6, 4, 2, 0};

  const unsigned char* const bytes =
      reinterpret_cast<const unsigned char*>(buf);
  const int len = lengths[bytes[0] >> 3];

  // The successor pointer is computed before the decoding work so that a
  // caller's next iteration can begin early; an invalid lead byte (len == 0)
  // still moves forward by one byte.
  const char* const next = buf + len + !len;

  // Always combine four bytes as if this were a four-byte sequence, then
  // shift away the bits that belong to bytes past the real length.
  uint32_t cp = (static_cast<uint32_t>(bytes[0] & masks[len]) << 18) |
                (static_cast<uint32_t>(bytes[1] & 0x3f) << 12) |
                (static_cast<uint32_t>(bytes[2] & 0x3f) << 6) |
                (static_cast<uint32_t>(bytes[3] & 0x3f) << 0);
  cp >>= shiftc[len];

  // Fold every error condition into a single integer.
  int err = (cp < mins[len]) << 6;    // non-canonical encoding
  err |= ((cp >> 11) == 0x1b) << 7;   // surrogate half?
  err |= (cp > 0x10FFFF) << 8;        // out of range?
  err |= (bytes[1] & 0xc0) >> 2;
  err |= (bytes[2] & 0xc0) >> 4;
  err |= bytes[3] >> 6;
  // 0x2a is the pattern produced when every tail byte starts with bits 10,
  // so the XOR leaves zero in those positions only for well-formed tails.
  err ^= 0x2a;
  // Discard the checks for tail bytes that are not part of this sequence.
  err >>= shifte[len];

  *c = cp;
  *e = err;
  return next;
 }
 }  // namespace internal
 template <> struct formatter<internal::bigint> {
@ -1240,87 +1295,24 @@ template <> struct formatter<internal::bigint> {
  }
 };
-// A public domain branchless UTF-8 decoder:
+FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
-// https://github.com/skeeto/branchless-utf8
+  for (auto p = s.data(), end = p + s.size(); p != end;) {
-/* Decode the next character, C, from BUF, reporting errors in E.
+    auto cp = uint32_t();
- *
+    auto error = 0;
- * Since this is a branchless decoder, four bytes will be read from the
+    p = utf8_decode(p, &cp, &error);
- * buffer regardless of the actual length of the next character. This
+    if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
- * means the buffer _must_ have at least three bytes of zero padding
+    if (cp <= 0xFFFF) {
- * following the end of the data stream.
+      buffer_.push_back(static_cast<wchar_t>(cp));
- *
+    } else {
- * Errors are reported in E, which will be non-zero if the parsed
+      cp -= 0x10000;
- * character was somehow invalid: invalid byte sequence, non-canonical
+      buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
- * encoding, or a surrogate half.
+      buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
- *
+    }
- * The function returns a pointer to the next character. When an error
+  }
- * occurs, this pointer will be a guess that depends on the particular
+  buffer_.push_back(0);
 * error, but it will always advance at least one byte.
 */
 static void* utf8_decode(void* buf, uint32_t* c, int* e) {
  // Lookup tables. `lengths` is indexed by the top five bits of the lead
  // byte (0 marks an invalid lead byte); the remaining tables are indexed
  // by the resulting sequence length.
  static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                                 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
  static const int shiftc[] = {0, 18, 12, 6, 0};
  static const int shifte[] = {0, 6, 4, 2, 0};

  unsigned char* const bytes = reinterpret_cast<unsigned char*>(buf);
  const int len = lengths[bytes[0] >> 3];

  // The successor pointer is computed before the decoding work so that a
  // caller's next iteration can begin early; an invalid lead byte (len == 0)
  // still moves forward by one byte.
  unsigned char* const next = bytes + len + !len;

  // Always combine four bytes as if this were a four-byte sequence, then
  // shift away the bits that belong to bytes past the real length.
  uint32_t cp = (static_cast<uint32_t>(bytes[0] & masks[len]) << 18) |
                (static_cast<uint32_t>(bytes[1] & 0x3f) << 12) |
                (static_cast<uint32_t>(bytes[2] & 0x3f) << 6) |
                (static_cast<uint32_t>(bytes[3] & 0x3f) << 0);
  cp >>= shiftc[len];

  // Fold every error condition into a single integer.
  int err = (cp < mins[len]) << 6;    // non-canonical encoding
  err |= ((cp >> 11) == 0x1b) << 7;   // surrogate half?
  err |= (cp > 0x10FFFF) << 8;        // out of range?
  err |= (bytes[1] & 0xc0) >> 2;
  err |= (bytes[2] & 0xc0) >> 4;
  err |= bytes[3] >> 6;
  // 0x2a is the pattern produced when every tail byte starts with bits 10,
  // so the XOR leaves zero in those positions only for well-formed tails.
  err ^= 0x2a;
  // Discard the checks for tail bytes that are not part of this sequence.
  err >>= shifte[len];

  *c = cp;
  *e = err;
  return next;
 }
 #if FMT_USE_WINDOWS_H
 FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
  // Converts the UTF-8 input to UTF-16 with MultiByteToWideChar, leaving a
  // zero-terminated result in buffer_. Failures are reported through
  // FMT_THROW with a windows_error.
  static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
  // The Windows API takes the input length as an int.
  if (s.size() > INT_MAX)
    FMT_THROW(windows_error(ERROR_INVALID_PARAMETER, ERROR_MSG));
  const int input_size = static_cast<int>(s.size());
  if (input_size != 0) {
    // First call: no output buffer, only query the required length.
    const int required = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
                                             s.data(), input_size, nullptr, 0);
    if (required == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
    // One extra element for the terminator.
    buffer_.resize(required + 1);
    // Second call: perform the conversion into the buffer.
    const int written = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
                                            s.data(), input_size, &buffer_[0],
                                            required);
    if (written == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
    buffer_[written] = 0;
    return;
  }
  // MultiByteToWideChar does not support zero length, handle separately.
  buffer_.resize(1);
  buffer_[0] = 0;
 }
 FMT_FUNC internal::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
  if (int error_code = convert(s)) {
    FMT_THROW(windows_error(error_code,
@ -1389,7 +1381,6 @@ FMT_FUNC void internal::format_windows_error(internal::buffer<char>& out,
  FMT_CATCH(...) {}
  format_error_code(out, error_code, message);
 }
 #endif  // FMT_USE_WINDOWS_H
 FMT_FUNC void format_system_error(internal::buffer<char>& out, int error_code,
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@ -940,29 +940,28 @@ inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
  return internal::copy_str<Char>(buffer, buffer + num_digits, out);
 }
 #ifndef _WIN32
 #  define FMT_USE_WINDOWS_H 0
 #elif !defined(FMT_USE_WINDOWS_H)
 #  define FMT_USE_WINDOWS_H 1
 #endif
 // Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
 // All the functionality that relies on it will be disabled too.
 #if FMT_USE_WINDOWS_H
 // A converter from UTF-8 to UTF-16.
 // It is only provided for Windows since other systems support UTF-8 natively.
 // The converted text is held in an internal wide-character buffer; size()
 // reports one element fewer than the buffer holds, i.e. it does not count
 // the final element of the buffer.
 class utf8_to_utf16 {
 private:
  // Storage for the converted wide characters plus one trailing element.
  wmemory_buffer buffer_;
 public:
  // Builds the converter from the UTF-8 string s (defined out of line).
  FMT_API explicit utf8_to_utf16(string_view s);
  // View over the first size() elements of the buffer.
-  operator wstring_view() const { return wstring_view(&buffer_[0], size()); }
+  operator wstring_view() const { return {&buffer_[0], size()}; }
  // Number of buffer elements excluding the last one.
  size_t size() const { return buffer_.size() - 1; }
  // Pointer to the first element of the buffer.
  const wchar_t* c_str() const { return &buffer_[0]; }
  // Copy of the first size() elements as a std::wstring.
-  std::wstring str() const { return std::wstring(&buffer_[0], size()); }
+  std::wstring str() const { return {&buffer_[0], size()}; }
 };
 // Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
 // All the functionality that relies on it will be disabled too.
 #ifndef _WIN32
 #  define FMT_USE_WINDOWS_H 0
 #elif !defined(FMT_USE_WINDOWS_H)
 #  define FMT_USE_WINDOWS_H 1
 #endif
 #if FMT_USE_WINDOWS_H
 // A converter from UTF-16 to UTF-8.
 // It is only provided for Windows since other systems support UTF-8 natively.
 class utf16_to_utf8 {
--- a/test/format-test.cc
+++ b/test/format-test.cc
@ -400,6 +400,23 @@ TEST(MemoryBufferTest, ExceptionInDeallocate) {
  EXPECT_CALL(alloc, deallocate(&mem2[0], 2 * size));
 }
 TEST(UtilTest, UTF8ToUTF16) {
  // Seven Cyrillic letters, each expected to become one wide character.
  fmt::internal::utf8_to_utf16 cyrillic("лошадка");
  const std::wstring cyrillic_wide = cyrillic.str();
  EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", cyrillic_wide);
  EXPECT_EQ(7, cyrillic.size());

  // U+10437 { DESERET SMALL LETTER YEE } is expected as a surrogate pair.
  fmt::internal::utf8_to_utf16 deseret("𐐷");
  EXPECT_EQ(L"\xD801\xDC37", deseret.str());

  // A lead byte followed by a non-continuation byte must be rejected.
  EXPECT_THROW_MSG(fmt::internal::utf8_to_utf16("\xc3\x28"), std::runtime_error,
                   "invalid utf8");
 }
 TEST(UtilTest, UTF8ToUTF16EmptyString) {
  // Converting an empty input must give an empty wide string of size zero.
  const std::string empty_input = "";
  fmt::internal::utf8_to_utf16 converted(empty_input.c_str());
  const std::wstring wide = converted.str();
  EXPECT_EQ(L"", wide);
  EXPECT_EQ(empty_input.size(), converted.size());
 }
 #ifdef _WIN32
 TEST(UtilTest, UTF16ToUTF8) {
  std::string s = "ёжик";
@ -415,20 +432,6 @@ TEST(UtilTest, UTF16ToUTF8EmptyString) {
  EXPECT_EQ(s.size(), u.size());
 }
 TEST(UtilTest, UTF8ToUTF16) {
  // Seven Cyrillic letters, each expected to become one wide character.
  const std::string utf8_text = "лошадка";
  fmt::internal::utf8_to_utf16 converted(utf8_text.c_str());
  const std::wstring wide = converted.str();
  EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", wide);
  EXPECT_EQ(7, converted.size());
 }
 TEST(UtilTest, UTF8ToUTF16EmptyString) {
  // Converting an empty input must give an empty wide string of size zero.
  const std::string empty_input = "";
  fmt::internal::utf8_to_utf16 converted(empty_input.c_str());
  const std::wstring wide = converted.str();
  EXPECT_EQ(L"", wide);
  EXPECT_EQ(empty_input.size(), converted.size());
 }
 template <typename Converter, typename Char>
 void check_utf_conversion_error(
    const char* message,
@ -450,13 +453,6 @@ TEST(UtilTest, UTF16ToUTF8Error) {
      "cannot convert string from UTF-16 to UTF-8");
 }
 TEST(UtilTest, UTF8ToUTF16Error) {
  using converter = fmt::internal::utf8_to_utf16;
  const char* const expected_message =
      "cannot convert string from UTF-8 to UTF-16";
  // First with the helper's default input.
  check_utf_conversion_error<converter, char>(expected_message);
  // Then with a view whose declared length is one past INT_MAX.
  const fmt::string_view oversized("foo", INT_MAX + 1u);
  check_utf_conversion_error<converter, char>(expected_message, oversized);
 }
 TEST(UtilTest, UTF16ToUTF8Convert) {
  fmt::internal::utf16_to_utf8 u;
  EXPECT_EQ(ERROR_INVALID_PARAMETER, u.convert(fmt::wstring_view(0, 1)));
@ -1237,8 +1233,7 @@ TEST(FormatterTest, Precision) {
                   format_error,
                   "precision not allowed for this argument type");
  EXPECT_THROW_MSG(format("{:.{}e}", 42.0, fmt::internal::max_value<int>()),
-                   format_error,
+                   format_error, "number is too big");
                   "number is too big");
  EXPECT_EQ("st", format("{0:.2}", "str"));
 }
@ -1875,8 +1870,8 @@ TEST(FormatTest, Dynamic) {
  args.emplace_back(fmt::internal::make_arg<ctx>(1.5f));
  std::string result = fmt::vformat(
-      "{} and {} and {}", fmt::basic_format_args<ctx>(
+      "{} and {} and {}",
-                              args.data(), static_cast<int>(args.size())));
+      fmt::basic_format_args<ctx>(args.data(), static_cast<int>(args.size())));
  EXPECT_EQ("42 and abc1 and 1.5", result);
 }
@ -2266,9 +2261,7 @@ struct test_format_specs_handler {
  FMT_CONSTEXPR void on_precision(int p) { precision = p; }
  FMT_CONSTEXPR void on_dynamic_precision(fmt::internal::auto_id) {}
-  FMT_CONSTEXPR void on_dynamic_precision(int index) {
+  FMT_CONSTEXPR void on_dynamic_precision(int index) { precision_ref = index; }
    precision_ref = index;
  }
  FMT_CONSTEXPR void on_dynamic_precision(string_view) {}
  FMT_CONSTEXPR void end_precision() {}