diff --git a/include/fmt/format-inl.h b/include/fmt/format-inl.h index 94a36d1b..8f247a47 100644 --- a/include/fmt/format-inl.h +++ b/include/fmt/format-inl.h @@ -2525,8 +2525,8 @@ template <> struct formatter { }; FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) { - for_each_codepoint(s, [this](uint32_t cp, int error) { - if (error != 0) FMT_THROW(std::runtime_error("invalid utf8")); + for_each_codepoint(s, [this](uint32_t cp, string_view) { + if (cp == invalid_code_point) FMT_THROW(std::runtime_error("invalid utf8")); if (cp <= 0xFFFF) { buffer_.push_back(static_cast(cp)); } else { @@ -2534,6 +2534,7 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) { buffer_.push_back(static_cast(0xD800 + (cp >> 10))); buffer_.push_back(static_cast(0xDC00 + (cp & 0x3FF))); } + return true; }); buffer_.push_back(0); } diff --git a/include/fmt/format.h b/include/fmt/format.h index ee3d7f75..117c91dd 100644 --- a/include/fmt/format.h +++ b/include/fmt/format.h @@ -483,27 +483,38 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e) return next; } +enum { invalid_code_point = ~uint32_t() }; + +// Invokes f(cp, sv) for every code point cp in s with sv being the string view +// corresponding to the code point. cp is invalid_code_point on error. template FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) { - auto decode = [f](const char* p) { + auto decode = [f](const char* buf_ptr, const char* ptr) { auto cp = uint32_t(); auto error = 0; - p = utf8_decode(p, &cp, &error); - f(cp, error); - return p; + auto end = utf8_decode(buf_ptr, &cp, &error); + bool result = f(error ? invalid_code_point : cp, + string_view(ptr, to_unsigned(end - buf_ptr))); + return result ? end : nullptr; }; auto p = s.data(); const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars. if (s.size() >= block_size) { - for (auto end = p + s.size() - block_size + 1; p < end;) p = decode(p); + for (auto end = p + s.size() - block_size + 1; p < end;) { + p = decode(p, p); + if (!p) return; + } } if (auto num_chars_left = s.data() + s.size() - p) { char buf[2 * block_size - 1] = {}; copy_str(p, p + num_chars_left, buf); - p = buf; + const char* buf_ptr = buf; do { - p = decode(p); - } while (p - buf < num_chars_left); + auto end = decode(buf_ptr, p); + if (!end) return; + p += end - buf_ptr; + buf_ptr = end; + } while (buf_ptr - buf < num_chars_left); } } @@ -518,10 +529,10 @@ FMT_CONSTEXPR inline size_t compute_width(string_view s) { // It is not a lambda for compatibility with C++14. struct count_code_points { size_t* count; - FMT_CONSTEXPR void operator()(uint32_t cp, int error) const { + FMT_CONSTEXPR auto operator()(uint32_t cp, string_view) const -> bool { *count += detail::to_unsigned( 1 + - (error == 0 && cp >= 0x1100 && + (cp >= 0x1100 && (cp <= 0x115f || // Hangul Jamo init. consonants cp == 0x2329 || // LEFT-POINTING ANGLE BRACKET cp == 0x232a || // RIGHT-POINTING ANGLE BRACKET @@ -539,6 +550,7 @@ FMT_CONSTEXPR inline size_t compute_width(string_view s) { (cp >= 0x1f300 && cp <= 0x1f64f) || // Supplemental Symbols and Pictographs: (cp >= 0x1f900 && cp <= 0x1f9ff)))); + return true; } }; for_each_codepoint(s, count_code_points{&num_code_points}); diff --git a/include/fmt/ranges.h b/include/fmt/ranges.h index 44f29bc2..8ee5a0ee 100644 --- a/include/fmt/ranges.h +++ b/include/fmt/ranges.h @@ -227,17 +227,65 @@ template OutputIt write_delimiter(OutputIt out) { return out; } -template inline bool is_printable_ascii(Char c) { - return c >= 0x20 && c < 0x7e; +inline auto is_printable(uint32_t cp) -> bool { + if (0x2a6de <= cp && cp < 0x2a700) return false; + if (0x2b735 <= cp && cp < 0x2b740) return false; + if (0x2b81e <= cp && cp < 0x2b820) return false; + if (0x2cea2 <= cp && cp < 0x2ceb0) return false; + if (0x2ebe1 <= cp && cp < 0x2f800) return false; + if (0x2fa1e <= cp && cp < 0x30000) return false; + if (0x3134b <= cp && cp < 0xe0100) return false; + if (0xe01f0 <= cp && cp < 0x110000) return false; + return true; } -template < - typename Char, typename OutputIt, typename T, - FMT_ENABLE_IF(is_std_string_like::type>::value)> -OutputIt write_range_entry(OutputIt out, const T& str) { +inline auto needs_escape(uint32_t cp) -> bool { + return cp < 0x20 || cp == 0x7f || cp == '"' || cp == '\\' || + !is_printable(cp); +} + +template struct find_escape_result { + const Char* begin; + const Char* end; + uint32_t cp; +}; + +template +auto find_escape(const Char* begin, const Char* end) + -> find_escape_result { + for (; begin != end; ++begin) { + auto cp = static_cast::type>(*begin); + if (needs_escape(cp)) return {begin, begin + 1, cp}; + } + return {begin, nullptr, 0}; +} + +auto find_escape(const char* begin, const char* end) + -> find_escape_result { + if (!is_utf8()) return find_escape(begin, end); + auto result = find_escape_result{end, nullptr, 0}; + for_each_codepoint(string_view(begin, to_unsigned(end - begin)), + [&](uint32_t cp, string_view sv) { + if (needs_escape(cp)) { + result = {sv.begin(), sv.end(), cp}; + return false; + } + return true; + }); + return result; +} + +template +auto write_range_entry(OutputIt out, basic_string_view str) -> OutputIt { *out++ = '"'; - for (Char c : basic_string_view(str)) { - switch (c) { + auto begin = str.begin(), end = str.end(); + do { + auto escape = find_escape(begin, end); + out = copy_str(begin, escape.begin, out); + begin = escape.end; + if (!begin) break; + auto c = static_cast(escape.cp); + switch (escape.cp) { case '\n': *out++ = '\\'; c = 'n'; @@ -256,13 +304,14 @@ OutputIt write_range_entry(OutputIt out, const T& str) { *out++ = '\\'; break; default: - if (is_printable_ascii(c)) break; - if (sizeof(Char) != 1 && c >= 0x80) break; - out = format_to(out, "\\x{:02x}", c); + for (Char escape_char : basic_string_view( + escape.begin, to_unsigned(escape.end - escape.begin))) { + out = format_to(out, "\\x{:02x}", escape_char); + } continue; } *out++ = c; - } + } while (begin != end); *out++ = '"'; return out; } diff --git a/test/ranges-test.cc b/test/ranges-test.cc index 21924cb9..2d6ce814 100644 --- a/test/ranges-test.cc +++ b/test/ranges-test.cc @@ -264,7 +264,8 @@ TEST(ranges_test, join_range) { #endif // FMT_RANGES_TEST_ENABLE_JOIN TEST(ranges_test, escape_string) { - EXPECT_EQ(fmt::format("{}", std::vector{"\n\r\t\"\\"}), - "[\"\\n\\r\\t\\\"\\\\\"]"); - EXPECT_EQ(fmt::format("{}", std::vector{"\x7"}), "[\"\\x07\"]"); -} \ No newline at end of file + using vec = std::vector; + EXPECT_EQ(fmt::format("{}", vec{"\n\r\t\"\\"}), "[\"\\n\\r\\t\\\"\\\\\"]"); + EXPECT_EQ(fmt::format("{}", vec{"\x07"}), "[\"\\x07\"]"); + EXPECT_EQ(fmt::format("{}", vec{"\x7f"}), "[\"\\x7f\"]"); +}