diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 09a1e749..76e65cc1 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -2589,54 +2589,6 @@ int snprintf_float(T value, int precision, float_specs specs, } } -// A public domain branchless UTF-8 decoder by Christopher Wellons: -// https://github.com/skeeto/branchless-utf8 -/* Decode the next character, c, from buf, reporting errors in e. - * - * Since this is a branchless decoder, four bytes will be read from the - * buffer regardless of the actual length of the next character. This - * means the buffer _must_ have at least three bytes of zero padding - * following the end of the data stream. - * - * Errors are reported in e, which will be non-zero if the parsed - * character was somehow invalid: invalid byte sequence, non-canonical - * encoding, or a surrogate half. - * - * The function returns a pointer to the next character. When an error - * occurs, this pointer will be a guess that depends on the particular - * error, but it will always advance at least one byte. - */ -inline const char* utf8_decode(const char* buf, uint32_t* c, int* e) { - static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; - static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; - static const int shiftc[] = {0, 18, 12, 6, 0}; - static const int shifte[] = {0, 6, 4, 2, 0}; - - int len = code_point_length(buf); - const char* next = buf + len; - - // Assume a four-byte character and load four bytes. Unused bits are - // shifted out. - auto s = reinterpret_cast(buf); - *c = uint32_t(s[0] & masks[len]) << 18; - *c |= uint32_t(s[1] & 0x3f) << 12; - *c |= uint32_t(s[2] & 0x3f) << 6; - *c |= uint32_t(s[3] & 0x3f) << 0; - *c >>= shiftc[len]; - - // Accumulate the various error conditions. - *e = (*c < mins[len]) << 6; // non-canonical encoding - *e |= ((*c >> 11) == 0x1b) << 7; // surrogate half? - *e |= (*c > 0x10FFFF) << 8; // out of range? - *e |= (s[1] & 0xc0) >> 2; - *e |= (s[2] & 0xc0) >> 4; - *e |= (s[3]) >> 6; - *e ^= 0x2a; // top two bits of each tail byte correct? - *e >>= shifte[len]; - - return next; -} - struct stringifier { template FMT_INLINE std::string operator()(T value) const { return to_string(value); @@ -2678,10 +2630,7 @@ template <> struct formatter { }; FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) { - auto transcode = [this](const char* p) { - auto cp = uint32_t(); - auto error = 0; - p = utf8_decode(p, &cp, &error); + for_each_codepoint(s, [this](uint32_t cp, int error) { if (error != 0) FMT_THROW(std::runtime_error("invalid utf8")); if (cp <= 0xFFFF) { buffer_.push_back(static_cast(cp)); @@ -2690,21 +2639,7 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) { buffer_.push_back(static_cast(0xD800 + (cp >> 10))); buffer_.push_back(static_cast(0xDC00 + (cp & 0x3FF))); } - return p; - }; - auto p = s.data(); - const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars. - if (s.size() >= block_size) { - for (auto end = p + s.size() - block_size + 1; p < end;) p = transcode(p); - } - if (auto num_chars_left = s.data() + s.size() - p) { - char buf[2 * block_size - 1] = {}; - memcpy(buf, p, to_unsigned(num_chars_left)); - p = buf; - do { - p = transcode(p); - } while (p - buf < num_chars_left); - } + }); buffer_.push_back(0); } diff --git a/include/fmt/format.h b/include/fmt/format.h index 827b3776..29e0106a 100644 --- a/include/fmt/format.h +++ b/include/fmt/format.h @@ -539,42 +539,6 @@ class truncating_iterator truncating_iterator& operator*() { return *this; } }; -template -inline size_t count_code_points(basic_string_view s) { - return s.size(); -} - -// Counts the number of code points in a UTF-8 string. -FMT_CONSTEXPR inline size_t count_code_points(basic_string_view s) { - const char* data = s.data(); - size_t num_code_points = 0; - for (size_t i = 0, size = s.size(); i != size; ++i) { - if ((data[i] & 0xc0) != 0x80) ++num_code_points; - } - return num_code_points; -} - -inline size_t count_code_points(basic_string_view s) { - return count_code_points(basic_string_view( - reinterpret_cast(s.data()), s.size())); -} - -template -inline size_t code_point_index(basic_string_view s, size_t n) { - size_t size = s.size(); - return n < size ? n : size; -} - -// Calculates the index of the nth code point in a UTF-8 string. -inline size_t code_point_index(basic_string_view s, size_t n) { - const char8_type* data = s.data(); - size_t num_code_points = 0; - for (size_t i = 0, size = s.size(); i != size; ++i) { - if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i; - } - return s.size(); -} - // is spectacularly slow to compile in C++20 so use a simple fill_n // instead (#1998). template @@ -634,6 +598,130 @@ inline counting_iterator copy_str(InputIt begin, InputIt end, return it + (end - begin); } +template +FMT_CONSTEXPR int code_point_length(const Char* begin) { + if (const_check(sizeof(Char) != 1)) return 1; + constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0}; + int len = lengths[static_cast(*begin) >> 3]; + + // Compute the pointer to the next character early so that the next + // iteration can start working on the next character. Neither Clang + // nor GCC figure out this reordering on their own. + return len + !len; +} + +// A public domain branchless UTF-8 decoder by Christopher Wellons: +// https://github.com/skeeto/branchless-utf8 +/* Decode the next character, c, from s, reporting errors in e. + * + * Since this is a branchless decoder, four bytes will be read from the + * buffer regardless of the actual length of the next character. This + * means the buffer _must_ have at least three bytes of zero padding + * following the end of the data stream. + * + * Errors are reported in e, which will be non-zero if the parsed + * character was somehow invalid: invalid byte sequence, non-canonical + * encoding, or a surrogate half. + * + * The function returns a pointer to the next character. When an error + * occurs, this pointer will be a guess that depends on the particular + * error, but it will always advance at least one byte. + */ +FMT_CONSTEXPR inline const char* utf8_decode(const char* s, uint32_t* c, + int* e) { + constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; + constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; + constexpr const int shiftc[] = {0, 18, 12, 6, 0}; + constexpr const int shifte[] = {0, 6, 4, 2, 0}; + + int len = code_point_length(s); + const char* next = s + len; + + // Assume a four-byte character and load four bytes. Unused bits are + // shifted out. + *c = uint32_t(s[0] & masks[len]) << 18; + *c |= uint32_t(s[1] & 0x3f) << 12; + *c |= uint32_t(s[2] & 0x3f) << 6; + *c |= uint32_t(s[3] & 0x3f) << 0; + *c >>= shiftc[len]; + + // Accumulate the various error conditions. + using uchar = unsigned char; + *e = (*c < mins[len]) << 6; // non-canonical encoding + *e |= ((*c >> 11) == 0x1b) << 7; // surrogate half? + *e |= (*c > 0x10FFFF) << 8; // out of range? + *e |= (uchar(s[1]) & 0xc0) >> 2; + *e |= (uchar(s[2]) & 0xc0) >> 4; + *e |= uchar(s[3]) >> 6; + *e ^= 0x2a; // top two bits of each tail byte correct? + *e >>= shifte[len]; + + return next; +} + +template +FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) { + auto decode = [f](const char* p) { + auto cp = uint32_t(); + auto error = 0; + p = utf8_decode(p, &cp, &error); + f(cp, error); + return p; + }; + auto p = s.data(); + const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars. + if (s.size() >= block_size) { + for (auto end = p + s.size() - block_size + 1; p < end;) p = decode(p); + } + if (auto num_chars_left = s.data() + s.size() - p) { + char buf[2 * block_size - 1] = {}; + copy_str(p, p + num_chars_left, buf); + p = buf; + do { + p = decode(p); + } while (p - buf < num_chars_left); + } +} + +template +inline size_t compute_width(basic_string_view s) { + return s.size(); +} + +// Computes approximate display width of a UTF-8 string. +FMT_CONSTEXPR inline size_t compute_width(string_view s) { + size_t num_code_points = 0; + // It is not a lambda for compatibility with C++14. + struct count_code_points { + size_t* count; + FMT_CONSTEXPR void operator()(uint32_t, int) const { ++*count; } + }; + for_each_codepoint(s, count_code_points{&num_code_points}); + return num_code_points; +} + +inline size_t compute_width(basic_string_view s) { + return compute_width(basic_string_view( + reinterpret_cast(s.data()), s.size())); +} + +template +inline size_t code_point_index(basic_string_view s, size_t n) { + size_t size = s.size(); + return n < size ? n : size; +} + +// Calculates the index of the nth code point in a UTF-8 string. +inline size_t code_point_index(basic_string_view s, size_t n) { + const char8_type* data = s.data(); + size_t num_code_points = 0; + for (size_t i = 0, size = s.size(); i != size; ++i) { + if ((data[i] & 0xc0) != 0x80 && ++num_code_points > n) return i; + } + return s.size(); +} + template using is_fast_float = bool_constant::is_iec559 && sizeof(T) <= sizeof(double)>; @@ -1674,7 +1762,7 @@ FMT_CONSTEXPR OutputIt write(OutputIt out, basic_string_view s, if (specs.precision >= 0 && to_unsigned(specs.precision) < size) size = code_point_index(s, to_unsigned(specs.precision)); auto width = specs.width != 0 - ? count_code_points(basic_string_view(data, size)) + ? compute_width(basic_string_view(data, size)) : 0; using iterator = remove_reference_t; return write_padded(out, specs, size, width, [=](iterator it) { @@ -2274,9 +2362,8 @@ class arg_formatter_base { template void write(const Ch* s, size_t size, const format_specs& specs) { - auto width = specs.width != 0 - ? count_code_points(basic_string_view(s, size)) - : 0; + auto width = + specs.width != 0 ? compute_width(basic_string_view(s, size)) : 0; out_ = write_padded(out_, specs, size, width, [=](reserve_iterator it) { return copy_str(s, s + size, it); }); @@ -2879,19 +2966,6 @@ template struct precision_adapter { SpecHandler& handler; }; -template -FMT_CONSTEXPR int code_point_length(const Char* begin) { - if (const_check(sizeof(Char) != 1)) return 1; - constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0}; - int len = lengths[static_cast(*begin) >> 3]; - - // Compute the pointer to the next character early so that the next - // iteration can start working on the next character. Neither Clang - // nor GCC figure out this reordering on their own. - return len + !len; -} - template constexpr bool is_ascii_letter(Char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } @@ -3742,8 +3816,9 @@ template inline const void* ptr(const std::shared_ptr& p) { #if !FMT_MSC_VER // MSVC lets function pointers decay to void pointers, so this // overload is unnecessary. -template inline const void* ptr(T (*fn)(Args...)) { - return detail::bit_cast(fn); +template +inline const void* ptr(T (*fn)(Args...)) { + return detail::bit_cast(fn); } #endif diff --git a/test/format-impl-test.cc b/test/format-impl-test.cc index 1290c077..201aa03b 100644 --- a/test/format-impl-test.cc +++ b/test/format-impl-test.cc @@ -408,9 +408,9 @@ TEST(FormatTest, FormatErrorCode) { } } -TEST(FormatTest, CountCodePoints) { +TEST(FormatTest, ComputeWidth) { EXPECT_EQ(4, - fmt::detail::count_code_points( + fmt::detail::compute_width( fmt::basic_string_view( reinterpret_cast("ёжик")))); } diff --git a/test/format-test.cc b/test/format-test.cc index 684dd1b8..470aafb8 100644 --- a/test/format-test.cc +++ b/test/format-test.cc @@ -1467,8 +1467,7 @@ TEST(FormatterTest, FormatUCharString) { EXPECT_EQ("test", format("{0:s}", ptr)); } -void function_pointer_test(int, double, std::string) { -} +void function_pointer_test(int, double, std::string) {} TEST(FormatterTest, FormatPointer) { check_unknown_types(reinterpret_cast(0x1234), "p", "pointer"); @@ -1482,8 +1481,9 @@ TEST(FormatterTest, FormatPointer) { EXPECT_EQ(format("{}", fmt::ptr(up.get())), format("{}", fmt::ptr(up))); std::shared_ptr sp(new int(1)); EXPECT_EQ(format("{}", fmt::ptr(sp.get())), format("{}", fmt::ptr(sp))); - EXPECT_EQ(format("{}", fmt::detail::bit_cast(&function_pointer_test)), - format("{}", fmt::ptr(function_pointer_test))); + EXPECT_EQ( + format("{}", fmt::detail::bit_cast(&function_pointer_test)), + format("{}", fmt::ptr(function_pointer_test))); EXPECT_EQ("0x0", format("{}", nullptr)); } @@ -2565,7 +2565,7 @@ TEST(FormatTest, FormatUTF8Precision) { str_type str(reinterpret_cast( u8"caf\u00e9s")); // cafés auto result = fmt::format(format, str); - EXPECT_EQ(fmt::detail::count_code_points(result), 4); + EXPECT_EQ(fmt::detail::compute_width(result), 4); EXPECT_EQ(result.size(), 5); EXPECT_EQ(from_u8str(result), from_u8str(str.substr(0, 5))); }