diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h
index 25c2f904..1146d8ce 100644
--- a/include/fmt/format-inl.h
+++ b/include/fmt/format-inl.h
@@ -1214,6 +1214,61 @@ int snprintf_float(T value, int precision, float_specs specs,
     return exp - fraction_size;
   }
 }
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+/* Decode the next character, c, from buf, reporting errors in e.
+ *
+ * Since this is a branchless decoder, four bytes will be read from the
+ * buffer regardless of the actual length of the next character. This
+ * means the buffer _must_ have at least three bytes of zero padding
+ * following the end of the data stream.
+ *
+ * Errors are reported in e, which will be non-zero if the parsed
+ * character was somehow invalid: invalid byte sequence, non-canonical
+ * encoding, or a surrogate half.
+ *
+ * The function returns a pointer to the next character. When an error
+ * occurs, this pointer will be a guess that depends on the particular
+ * error, but it will always advance at least one byte.
+ */
+FMT_FUNC const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
+  static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                                 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+                                 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
+  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
+  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
+  static const int shiftc[] = {0, 18, 12, 6, 0};
+  static const int shifte[] = {0, 6, 4, 2, 0};
+
+  auto s = reinterpret_cast<const unsigned char*>(buf);
+  int len = lengths[s[0] >> 3];
+
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  const char* next = buf + len + !len;
+
+  // Assume a four-byte character and load four bytes. Unused bits are
+  // shifted out.
+  *c = uint32_t(s[0] & masks[len]) << 18;
+  *c |= uint32_t(s[1] & 0x3f) << 12;
+  *c |= uint32_t(s[2] & 0x3f) << 6;
+  *c |= uint32_t(s[3] & 0x3f) << 0;
+  *c >>= shiftc[len];
+
+  // Accumulate the various error conditions.
+  *e = (*c < mins[len]) << 6;       // non-canonical encoding
+  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
+  *e |= (*c > 0x10FFFF) << 8;       // out of range?
+  *e |= (s[1] & 0xc0) >> 2;
+  *e |= (s[2] & 0xc0) >> 4;
+  *e |= (s[3]) >> 6;
+  *e ^= 0x2a;  // top two bits of each tail byte correct?
+  *e >>= shifte[len];
+
+  return next;
+}
 }  // namespace internal
 
 template <> struct formatter<internal::bigint> {
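Note (reviewer sketch, not part of the patch): one way a caller might drive utf8_decode over a whole string while honoring the three-byte zero-padding requirement described in the comment above. The helper name and the explicit padding copy are illustrative assumptions, not fmt API, and utf8_decode itself sits in the internal namespace of format-inl.h, so this only compiles where that definition is visible (for example in header-only builds).

    #include <fmt/format.h>  // assumes FMT_HEADER_ONLY so format-inl.h is pulled in
    #include <cstdint>
    #include <string>
    #include <vector>

    // Hypothetical helper: collect the code points of a UTF-8 string.
    std::vector<uint32_t> decode_all(const std::string& s) {
      std::string padded = s;
      padded.append(3, '\0');  // the decoder always reads four bytes
      std::vector<uint32_t> result;
      const char* p = padded.data();
      const char* end = p + s.size();
      while (p != end) {
        uint32_t cp = 0;
        int error = 0;
        p = fmt::internal::utf8_decode(p, &cp, &error);
        if (error != 0) break;  // invalid sequence, overlong encoding, or surrogate half
        result.push_back(cp);
      }
      return result;
    }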
@@ -1240,87 +1295,24 @@ template <> struct formatter<internal::bigint> {
   }
 };
 
-// A public domain branchless UTF-8 decoder:
-// https://github.com/skeeto/branchless-utf8
-/* Decode the next character, C, from BUF, reporting errors in E.
- *
- * Since this is a branchless decoder, four bytes will be read from the
- * buffer regardless of the actual length of the next character. This
- * means the buffer _must_ have at least three bytes of zero padding
- * following the end of the data stream.
- *
- * Errors are reported in E, which will be non-zero if the parsed
- * character was somehow invalid: invalid byte sequence, non-canonical
- * encoding, or a surrogate half.
- *
- * The function returns a pointer to the next character. When an error
- * occurs, this pointer will be a guess that depends on the particular
- * error, but it will always advance at least one byte.
- */
-static void* utf8_decode(void* buf, uint32_t* c, int* e) {
-  static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                                 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-                                 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
-  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
-  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
-  static const int shiftc[] = {0, 18, 12, 6, 0};
-  static const int shifte[] = {0, 6, 4, 2, 0};
-
-  auto s = reinterpret_cast<unsigned char*>(buf);
-  int len = lengths[s[0] >> 3];
-
-  /* Compute the pointer to the next character early so that the next
-   * iteration can start working on the next character. Neither Clang
-   * nor GCC figure out this reordering on their own.
-   */
-  unsigned char* next = s + len + !len;
-
-  /* Assume a four-byte character and load four bytes. Unused bits are
-   * shifted out.
-   */
-  *c = (uint32_t)(s[0] & masks[len]) << 18;
-  *c |= (uint32_t)(s[1] & 0x3f) << 12;
-  *c |= (uint32_t)(s[2] & 0x3f) << 6;
-  *c |= (uint32_t)(s[3] & 0x3f) << 0;
-  *c >>= shiftc[len];
-
-  /* Accumulate the various error conditions. */
-  *e = (*c < mins[len]) << 6;       // non-canonical encoding
-  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
-  *e |= (*c > 0x10FFFF) << 8;       // out of range?
-  *e |= (s[1] & 0xc0) >> 2;
-  *e |= (s[2] & 0xc0) >> 4;
-  *e |= (s[3]) >> 6;
-  *e ^= 0x2a;  // top two bits of each tail byte correct?
-  *e >>= shifte[len];
-
-  return next;
+FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
+  for (auto p = s.data(), end = p + s.size(); p != end;) {
+    auto cp = uint32_t();
+    auto error = 0;
+    p = utf8_decode(p, &cp, &error);
+    if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
+    if (cp <= 0xFFFF) {
+      buffer_.push_back(static_cast<wchar_t>(cp));
+    } else {
+      cp -= 0x10000;
+      buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
+      buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
+    }
+  }
+  buffer_.push_back(0);
 }
 
 #if FMT_USE_WINDOWS_H
-
-FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
-  static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
-  if (s.size() > INT_MAX)
-    FMT_THROW(windows_error(ERROR_INVALID_PARAMETER, ERROR_MSG));
-  int s_size = static_cast<int>(s.size());
-  if (s_size == 0) {
-    // MultiByteToWideChar does not support zero length, handle separately.
-    buffer_.resize(1);
-    buffer_[0] = 0;
-    return;
-  }
-
-  int length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(),
-                                   s_size, nullptr, 0);
-  if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
-  buffer_.resize(length + 1);
-  length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size,
-                               &buffer_[0], length);
-  if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
-  buffer_[length] = 0;
-}
-
 FMT_FUNC internal::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
   if (int error_code = convert(s)) {
     FMT_THROW(windows_error(error_code,
@@ -1389,7 +1381,6 @@ FMT_FUNC void internal::format_windows_error(internal::buffer& out,
   FMT_CATCH(...) {}
   format_error_code(out, error_code, message);
 }
-
 #endif  // FMT_USE_WINDOWS_H
 
 FMT_FUNC void format_system_error(internal::buffer& out, int error_code,
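Note (reviewer sketch, not part of the patch): a quick check of the surrogate-pair arithmetic used in the new constructor, applied to U+10437, the code point exercised by the new UTF8ToUTF16 test below. After subtracting 0x10000, the high ten bits select the lead surrogate and the low ten bits the trail surrogate.

    #include <cassert>
    #include <cstdint>

    // Worked example: encode U+10437 (DESERET SMALL LETTER YEE) as a surrogate pair.
    inline void surrogate_pair_example() {
      uint32_t cp = 0x10437;
      cp -= 0x10000;                         // 0x00437
      uint32_t high = 0xD800 + (cp >> 10);   // 0xD800 + 0x001 = 0xD801
      uint32_t low = 0xDC00 + (cp & 0x3FF);  // 0xDC00 + 0x037 = 0xDC37
      assert(high == 0xD801 && low == 0xDC37);  // matches L"\xD801\xDC37" in the test
      (void)high;
      (void)low;
    }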
diff --git a/include/fmt/format.h b/include/fmt/format.h
index 8a4ab7fb..6662e334 100644
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@@ -940,29 +940,28 @@ inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
   return internal::copy_str(buffer, buffer + num_digits, out);
 }
 
-#ifndef _WIN32
-#  define FMT_USE_WINDOWS_H 0
-#elif !defined(FMT_USE_WINDOWS_H)
-#  define FMT_USE_WINDOWS_H 1
-#endif
-
-// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
-// All the functionality that relies on it will be disabled too.
-#if FMT_USE_WINDOWS_H
 // A converter from UTF-8 to UTF-16.
-// It is only provided for Windows since other systems support UTF-8 natively.
 class utf8_to_utf16 {
  private:
  wmemory_buffer buffer_;
 
  public:
  FMT_API explicit utf8_to_utf16(string_view s);
-  operator wstring_view() const { return wstring_view(&buffer_[0], size()); }
+  operator wstring_view() const { return {&buffer_[0], size()}; }
   size_t size() const { return buffer_.size() - 1; }
   const wchar_t* c_str() const { return &buffer_[0]; }
-  std::wstring str() const { return std::wstring(&buffer_[0], size()); }
+  std::wstring str() const { return {&buffer_[0], size()}; }
 };
 
+// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
+// All the functionality that relies on it will be disabled too.
+#ifndef _WIN32
+#  define FMT_USE_WINDOWS_H 0
+#elif !defined(FMT_USE_WINDOWS_H)
+#  define FMT_USE_WINDOWS_H 1
+#endif
+
+#if FMT_USE_WINDOWS_H
 // A converter from UTF-16 to UTF-8.
 // It is only provided for Windows since other systems support UTF-8 natively.
 class utf16_to_utf8 {
diff --git a/test/format-test.cc b/test/format-test.cc
index 7f718e7a..195ac570 100644
--- a/test/format-test.cc
+++ b/test/format-test.cc
@@ -400,6 +400,23 @@ TEST(MemoryBufferTest, ExceptionInDeallocate) {
   EXPECT_CALL(alloc, deallocate(&mem2[0], 2 * size));
 }
 
+TEST(UtilTest, UTF8ToUTF16) {
+  fmt::internal::utf8_to_utf16 u("лошадка");
+  EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", u.str());
+  EXPECT_EQ(7, u.size());
+  // U+10437 { DESERET SMALL LETTER YEE }
+  EXPECT_EQ(L"\xD801\xDC37", fmt::internal::utf8_to_utf16("𐐷").str());
+  EXPECT_THROW_MSG(fmt::internal::utf8_to_utf16("\xc3\x28"), std::runtime_error,
+                   "invalid utf8");
+}
+
+TEST(UtilTest, UTF8ToUTF16EmptyString) {
+  std::string s = "";
+  fmt::internal::utf8_to_utf16 u(s.c_str());
+  EXPECT_EQ(L"", u.str());
+  EXPECT_EQ(s.size(), u.size());
+}
+
 #ifdef _WIN32
 TEST(UtilTest, UTF16ToUTF8) {
   std::string s = "ёжик";
@@ -415,20 +432,6 @@ TEST(UtilTest, UTF16ToUTF8EmptyString) {
   EXPECT_EQ(s.size(), u.size());
 }
 
-TEST(UtilTest, UTF8ToUTF16) {
-  std::string s = "лошадка";
-  fmt::internal::utf8_to_utf16 u(s.c_str());
-  EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", u.str());
-  EXPECT_EQ(7, u.size());
-}
-
-TEST(UtilTest, UTF8ToUTF16EmptyString) {
-  std::string s = "";
-  fmt::internal::utf8_to_utf16 u(s.c_str());
-  EXPECT_EQ(L"", u.str());
-  EXPECT_EQ(s.size(), u.size());
-}
-
 template <typename Converter, typename Char>
 void check_utf_conversion_error(
     const char* message,
@@ -450,13 +453,6 @@ TEST(UtilTest, UTF16ToUTF8Error) {
       "cannot convert string from UTF-16 to UTF-8");
 }
 
-TEST(UtilTest, UTF8ToUTF16Error) {
-  const char* message = "cannot convert string from UTF-8 to UTF-16";
-  check_utf_conversion_error(message);
-  check_utf_conversion_error(
-      message, fmt::string_view("foo", INT_MAX + 1u));
-}
-
 TEST(UtilTest, UTF16ToUTF8Convert) {
   fmt::internal::utf16_to_utf8 u;
   EXPECT_EQ(ERROR_INVALID_PARAMETER, u.convert(fmt::wstring_view(0, 1)));
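Note (reviewer sketch, not part of the patch): a small usage sketch of the converter that the new tests above exercise, now that format.h declares utf8_to_utf16 outside the FMT_USE_WINDOWS_H block. fmt::internal is an implementation namespace and the helper name here is made up, so treat this as illustration rather than a supported public API.

    #include <fmt/format.h>
    #include <string>

    // Hypothetical helper: convert UTF-8 input to UTF-16 code units in a std::wstring.
    std::wstring widen(fmt::string_view utf8) {
      // Throws std::runtime_error("invalid utf8") on a bad byte sequence.
      fmt::internal::utf8_to_utf16 converter(utf8);
      // The class also exposes c_str(), size() and operator wstring_view.
      return converter.str();
    }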
@@ -1237,8 +1233,7 @@ TEST(FormatterTest, Precision) {
                    format_error,
                    "precision not allowed for this argument type");
   EXPECT_THROW_MSG(format("{:.{}e}", 42.0, fmt::internal::max_value<int>()),
-                   format_error,
-                   "number is too big");
+                   format_error, "number is too big");
 
   EXPECT_EQ("st", format("{0:.2}", "str"));
 }
@@ -1875,8 +1870,8 @@ TEST(FormatTest, Dynamic) {
   args.emplace_back(fmt::internal::make_arg(1.5f));
 
   std::string result = fmt::vformat(
-      "{} and {} and {}", fmt::basic_format_args(
-                              args.data(), static_cast(args.size())));
+      "{} and {} and {}",
+      fmt::basic_format_args(args.data(), static_cast(args.size())));
   EXPECT_EQ("42 and abc1 and 1.5", result);
 }
 
@@ -2266,9 +2261,7 @@ struct test_format_specs_handler {
   FMT_CONSTEXPR void on_precision(int p) { precision = p; }
   FMT_CONSTEXPR void on_dynamic_precision(fmt::internal::auto_id) {}
-  FMT_CONSTEXPR void on_dynamic_precision(int index) {
-    precision_ref = index;
-  }
+  FMT_CONSTEXPR void on_dynamic_precision(int index) { precision_ref = index; }
   FMT_CONSTEXPR void on_dynamic_precision(string_view) {}
   FMT_CONSTEXPR void end_precision() {}
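Note (reviewer sketch, not part of the patch): the Dynamic test touched above builds its argument list by hand through internal helpers; for reference, the public-API way to format a runtime-assembled argument list looks roughly like this.

    #include <fmt/format.h>
    #include <string>

    std::string dynamic_example() {
      // make_format_args packages the arguments; vformat consumes the type-erased list.
      return fmt::vformat("{} and {} and {}", fmt::make_format_args(42, "abc1", 1.5f));
    }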