Share the UTF-8 lead-byte length table between code_point_length and
utf8_decode, and add tests for the branchless decoder.

  * include/fmt/core.h: move the length lookup table out of
    code_point_length into the new helper code_point_length_impl(Char).
  * include/fmt/format.h: utf8_decode calls code_point_length_impl(*s),
    always advances by at least one byte (s + len + !len), converts each
    byte to unsigned char before masking, and no longer needs prefix_masks.
  * test/format-impl-test.cc: add a utf8_encode helper and five decoder tests.
    The decode_all and out_of_range loops use inclusive upper bounds so that
    U+10FFFF and 0x1FFFFF are exercised as well.

diff --git a/include/fmt/core.h b/include/fmt/core.h
index b0eef0ae..930b6bb6 100644
--- a/include/fmt/core.h
+++ b/include/fmt/core.h
@@ -2297,12 +2297,19 @@ constexpr auto to_ascii(Char c) -> underlying_t<Char> {
   return c;
 }
 
+// Returns the length in bytes of the UTF-8 sequence introduced by the lead
+// byte `begin`, looked up by its top five bits: 1 for ASCII, 2 to 4 for
+// multi-byte lead bytes, 0 for 0x80-0xBF and 0xF8-0xFF (cannot start one).
+template <typename Char>
+FMT_CONSTEXPR auto code_point_length_impl(Char begin) -> int {
+  return "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4"
+      [static_cast<unsigned char>(begin) >> 3];
+}
+
 template <typename Char>
 FMT_CONSTEXPR auto code_point_length(const Char* begin) -> int {
   if (const_check(sizeof(Char) != 1)) return 1;
-  auto lengths =
-      "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4";
-  int len = lengths[static_cast<unsigned char>(*begin) >> 3];
+  int len = code_point_length_impl(*begin);
 
   // Compute the pointer to the next character early so that the next
   // iteration can start working on the next character. Neither Clang
diff --git a/include/fmt/format.h b/include/fmt/format.h
index e146476e..7c607dbd 100644
--- a/include/fmt/format.h
+++ b/include/fmt/format.h
@@ -602,25 +602,28 @@ FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline(InputIt begin, InputIt end,
  */
 FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
     -> const char* {
-  constexpr const int prefix_masks[] = {0x00, 0x80, 0xe0, 0xf0, 0xf8};
   constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
   constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
   constexpr const int shiftc[] = {0, 18, 12, 6, 0};
   constexpr const int shifte[] = {0, 6, 4, 2, 0};
 
-  int len = code_point_length(s);
-  const char* next = s + len;
+  int len = code_point_length_impl(*s);
+  // Compute the pointer to the next character early so that the next
+  // iteration can start working on the next character. Neither Clang
+  // nor GCC figure out this reordering on their own.
+  const char* next = s + len + !len;
+
+  using uchar = unsigned char;
 
   // Assume a four-byte character and load four bytes. Unused bits are
   // shifted out.
-  *c = uint32_t(s[0] & masks[len]) << 18;
-  *c |= uint32_t(s[1] & 0x3f) << 12;
-  *c |= uint32_t(s[2] & 0x3f) << 6;
-  *c |= uint32_t(s[3] & 0x3f) << 0;
+  *c = uint32_t(uchar(s[0]) & masks[len]) << 18;
+  *c |= uint32_t(uchar(s[1]) & 0x3f) << 12;
+  *c |= uint32_t(uchar(s[2]) & 0x3f) << 6;
+  *c |= uint32_t(uchar(s[3]) & 0x3f) << 0;
   *c >>= shiftc[len];
 
   // Accumulate the various error conditions.
-  using uchar = unsigned char;
   *e = (*c < mins[len]) << 6;       // non-canonical encoding
   *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
   *e |= (*c > 0x10FFFF) << 8;       // out of range?
@@ -629,8 +632,6 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
   *e |= uchar(s[3]) >> 6;
   *e ^= 0x2a;  // top two bits of each tail byte correct?
   *e >>= shifte[len];
-  *e |= ((uchar(s[0]) & prefix_masks[len]) !=
-         uchar((prefix_masks[len] << 1) & 0xFF));  // first byte correct?
 
   return next;
 }
diff --git a/test/format-impl-test.cc b/test/format-impl-test.cc
index e6c6b652..46486a6f 100644
--- a/test/format-impl-test.cc
+++ b/test/format-impl-test.cc
@@ -430,3 +430,118 @@ TEST(format_impl_test, write_console_signature) {
   (void)p;
 }
 #endif
+
+// A public domain branchless UTF-8 decoder by Christopher Wellons:
+// https://github.com/skeeto/branchless-utf8
+constexpr bool unicode_is_surrogate(uint32_t c) {
+  return c >= 0xD800U && c <= 0xDFFFU;
+}
+
+FMT_CONSTEXPR char* utf8_encode(char* s, uint32_t c) {
+  if (c >= (1UL << 16)) {
+    s[0] = static_cast<char>(0xf0 | (c >> 18));
+    s[1] = static_cast<char>(0x80 | ((c >> 12) & 0x3f));
+    s[2] = static_cast<char>(0x80 | ((c >> 6) & 0x3f));
+    s[3] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
+    return s + 4;
+  } else if (c >= (1UL << 11)) {
+    s[0] = static_cast<char>(0xe0 | (c >> 12));
+    s[1] = static_cast<char>(0x80 | ((c >> 6) & 0x3f));
+    s[2] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
+    return s + 3;
+  } else if (c >= (1UL << 7)) {
+    s[0] = static_cast<char>(0xc0 | (c >> 6));
+    s[1] = static_cast<char>(0x80 | ((c >> 0) & 0x3f));
+    return s + 2;
+  } else {
+    s[0] = static_cast<char>(c);
+    return s + 1;
+  }
+}
+
+// Make sure it can decode every character
+TEST(format_impl_test, utf8_decode_decode_all) {
+  for (uint32_t i = 0; i <= 0x10ffff; i++) {
+    if (!unicode_is_surrogate(i)) {
+      int e;
+      uint32_t c;
+      char buf[8] = {0};
+      char* end = utf8_encode(buf, i);
+      const char* res = fmt::detail::utf8_decode(buf, &c, &e);
+      EXPECT_EQ(end, res);
+      EXPECT_EQ(c, i);
+      EXPECT_EQ(e, 0);
+    }
+  }
+}
+
+// Reject everything outside of U+0000..U+10FFFF
+TEST(format_impl_test, utf8_decode_out_of_range) {
+  for (uint32_t i = 0x110000; i <= 0x1fffff; i++) {
+    int e;
+    uint32_t c;
+    char buf[8] = {0};
+    utf8_encode(buf, i);
+    const char* end = fmt::detail::utf8_decode(buf, &c, &e);
+    EXPECT_NE(e, 0);
+    EXPECT_EQ(end - buf, 4);
+  }
+}
+
+// Does it reject all surrogate halves?
+TEST(format_impl_test, utf8_decode_surrogate_halves) {
+  for (uint32_t i = 0xd800; i <= 0xdfff; i++) {
+    int e;
+    uint32_t c;
+    char buf[8] = {0};
+    utf8_encode(buf, i);
+    fmt::detail::utf8_decode(buf, &c, &e);
+    EXPECT_NE(e, 0);
+  }
+}
+
+// How about non-canonical encodings?
+TEST(format_impl_test, utf8_decode_non_canonical_encodings) {
+  int e;
+  uint32_t c;
+  const char* end;
+
+  char buf2[8] = {char(0xc0), char(0xA4)};
+  end = fmt::detail::utf8_decode(buf2, &c, &e);
+  EXPECT_NE(e, 0);           // non-canonical len 2
+  EXPECT_EQ(end, buf2 + 2);  // non-canonical recover 2
+
+  char buf3[8] = {char(0xe0), char(0x80), char(0xA4)};
+  end = fmt::detail::utf8_decode(buf3, &c, &e);
+  EXPECT_NE(e, 0);           // non-canonical len 3
+  EXPECT_EQ(end, buf3 + 3);  // non-canonical recover 3
+
+  char buf4[8] = {char(0xf0), char(0x80), char(0x80), char(0xA4)};
+  end = fmt::detail::utf8_decode(buf4, &c, &e);
+  EXPECT_NE(e, 0);           // non-canonical encoding len 4
+  EXPECT_EQ(end, buf4 + 4);  // non-canonical recover 4
+}
+
+// Let's try some bogus byte sequences
+TEST(format_impl_test, utf8_decode_bogus_byte_sequences) {
+  int e;
+  uint32_t c;
+
+  // Invalid first byte
+  char buf0[4] = {char(0xff)};
+  auto len = fmt::detail::utf8_decode(buf0, &c, &e) - buf0;
+  EXPECT_NE(e, 0);    // "bogus [ff] 0x%02x U+%04lx", e, (unsigned long)c);
+  EXPECT_EQ(len, 1);  // "bogus [ff] recovery %d", len);
+
+  // Invalid first byte
+  char buf1[4] = {char(0x80)};
+  len = fmt::detail::utf8_decode(buf1, &c, &e) - buf1;
+  EXPECT_NE(e, 0);    // "bogus [80] 0x%02x U+%04lx", e, (unsigned long)c);
+  EXPECT_EQ(len, 1);  // "bogus [80] recovery %d", len);
+
+  // Looks like a two-byte sequence but second byte is wrong
+  char buf2[4] = {char(0xc0), char(0x0a)};
+  len = fmt::detail::utf8_decode(buf2, &c, &e) - buf2;
+  EXPECT_NE(e, 0);    // "bogus [c0 0a] 0x%02x U+%04lx", e, (unsigned long)c
+  EXPECT_EQ(len, 2);  // "bogus [c0 0a] recovery %d", len);
+}