More escaping

This commit is contained in:
Victor Zverovich
2021-08-21 09:35:05 -07:00
parent f69a572538
commit 6397095ca4
4 changed files with 91 additions and 28 deletions

View File

@ -2525,8 +2525,8 @@ template <> struct formatter<detail::bigint> {
}; };
FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) { FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
for_each_codepoint(s, [this](uint32_t cp, int error) { for_each_codepoint(s, [this](uint32_t cp, string_view) {
if (error != 0) FMT_THROW(std::runtime_error("invalid utf8")); if (cp == invalid_code_point) FMT_THROW(std::runtime_error("invalid utf8"));
if (cp <= 0xFFFF) { if (cp <= 0xFFFF) {
buffer_.push_back(static_cast<wchar_t>(cp)); buffer_.push_back(static_cast<wchar_t>(cp));
} else { } else {
@ -2534,6 +2534,7 @@ FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10))); buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF))); buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
} }
return true;
}); });
buffer_.push_back(0); buffer_.push_back(0);
} }

View File

@ -483,27 +483,38 @@ FMT_CONSTEXPR inline auto utf8_decode(const char* s, uint32_t* c, int* e)
return next; return next;
} }
enum { invalid_code_point = ~uint32_t() };
// Invokes f(cp, sv) for every code point cp in s with sv being the string view
// corresponding to the code point. cp is invalid_code_point on error.
template <typename F> template <typename F>
FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) { FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) {
auto decode = [f](const char* p) { auto decode = [f](const char* buf_ptr, const char* ptr) {
auto cp = uint32_t(); auto cp = uint32_t();
auto error = 0; auto error = 0;
p = utf8_decode(p, &cp, &error); auto end = utf8_decode(buf_ptr, &cp, &error);
f(cp, error); bool result = f(error ? invalid_code_point : cp,
return p; string_view(ptr, to_unsigned(end - buf_ptr)));
return result ? end : nullptr;
}; };
auto p = s.data(); auto p = s.data();
const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars. const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars.
if (s.size() >= block_size) { if (s.size() >= block_size) {
for (auto end = p + s.size() - block_size + 1; p < end;) p = decode(p); for (auto end = p + s.size() - block_size + 1; p < end;) {
p = decode(p, p);
if (!p) return;
}
} }
if (auto num_chars_left = s.data() + s.size() - p) { if (auto num_chars_left = s.data() + s.size() - p) {
char buf[2 * block_size - 1] = {}; char buf[2 * block_size - 1] = {};
copy_str<char>(p, p + num_chars_left, buf); copy_str<char>(p, p + num_chars_left, buf);
p = buf; const char* buf_ptr = buf;
do { do {
p = decode(p); auto end = decode(buf_ptr, p);
} while (p - buf < num_chars_left); if (!end) return;
p += end - buf_ptr;
buf_ptr = end;
} while (buf_ptr - buf < num_chars_left);
} }
} }
@ -518,10 +529,10 @@ FMT_CONSTEXPR inline size_t compute_width(string_view s) {
// It is not a lambda for compatibility with C++14. // It is not a lambda for compatibility with C++14.
struct count_code_points { struct count_code_points {
size_t* count; size_t* count;
FMT_CONSTEXPR void operator()(uint32_t cp, int error) const { FMT_CONSTEXPR auto operator()(uint32_t cp, string_view) const -> bool {
*count += detail::to_unsigned( *count += detail::to_unsigned(
1 + 1 +
(error == 0 && cp >= 0x1100 && (cp >= 0x1100 &&
(cp <= 0x115f || // Hangul Jamo init. consonants (cp <= 0x115f || // Hangul Jamo init. consonants
cp == 0x2329 || // LEFT-POINTING ANGLE BRACKET cp == 0x2329 || // LEFT-POINTING ANGLE BRACKET
cp == 0x232a || // RIGHT-POINTING ANGLE BRACKET cp == 0x232a || // RIGHT-POINTING ANGLE BRACKET
@ -539,6 +550,7 @@ FMT_CONSTEXPR inline size_t compute_width(string_view s) {
(cp >= 0x1f300 && cp <= 0x1f64f) || (cp >= 0x1f300 && cp <= 0x1f64f) ||
// Supplemental Symbols and Pictographs: // Supplemental Symbols and Pictographs:
(cp >= 0x1f900 && cp <= 0x1f9ff)))); (cp >= 0x1f900 && cp <= 0x1f9ff))));
return true;
} }
}; };
for_each_codepoint(s, count_code_points{&num_code_points}); for_each_codepoint(s, count_code_points{&num_code_points});

View File

@ -227,17 +227,65 @@ template <typename OutputIt> OutputIt write_delimiter(OutputIt out) {
return out; return out;
} }
template <typename Char> inline bool is_printable_ascii(Char c) { inline auto is_printable(uint32_t cp) -> bool {
return c >= 0x20 && c < 0x7e; if (0x2a6de <= cp && cp < 0x2a700) return false;
if (0x2b735 <= cp && cp < 0x2b740) return false;
if (0x2b81e <= cp && cp < 0x2b820) return false;
if (0x2cea2 <= cp && cp < 0x2ceb0) return false;
if (0x2ebe1 <= cp && cp < 0x2f800) return false;
if (0x2fa1e <= cp && cp < 0x30000) return false;
if (0x3134b <= cp && cp < 0xe0100) return false;
if (0xe01f0 <= cp && cp < 0x110000) return false;
return true;
} }
template < inline auto needs_escape(uint32_t cp) -> bool {
typename Char, typename OutputIt, typename T, return cp < 0x20 || cp == 0x7f || cp == '"' || cp == '\\' ||
FMT_ENABLE_IF(is_std_string_like<typename std::decay<T>::type>::value)> !is_printable(cp);
OutputIt write_range_entry(OutputIt out, const T& str) { }
template <typename Char> struct find_escape_result {
const Char* begin;
const Char* end;
uint32_t cp;
};
template <typename Char>
auto find_escape(const Char* begin, const Char* end)
-> find_escape_result<Char> {
for (; begin != end; ++begin) {
auto cp = static_cast<typename std::make_unsigned<Char>::type>(*begin);
if (needs_escape(cp)) return {begin, begin + 1, cp};
}
return {begin, nullptr, 0};
}
auto find_escape(const char* begin, const char* end)
-> find_escape_result<char> {
if (!is_utf8()) return find_escape<char>(begin, end);
auto result = find_escape_result<char>{end, nullptr, 0};
for_each_codepoint(string_view(begin, to_unsigned(end - begin)),
[&](uint32_t cp, string_view sv) {
if (needs_escape(cp)) {
result = {sv.begin(), sv.end(), cp};
return false;
}
return true;
});
return result;
}
template <typename Char, typename OutputIt>
auto write_range_entry(OutputIt out, basic_string_view<Char> str) -> OutputIt {
*out++ = '"'; *out++ = '"';
for (Char c : basic_string_view<Char>(str)) { auto begin = str.begin(), end = str.end();
switch (c) { do {
auto escape = find_escape(begin, end);
out = copy_str<Char>(begin, escape.begin, out);
begin = escape.end;
if (!begin) break;
auto c = static_cast<Char>(escape.cp);
switch (escape.cp) {
case '\n': case '\n':
*out++ = '\\'; *out++ = '\\';
c = 'n'; c = 'n';
@ -256,13 +304,14 @@ OutputIt write_range_entry(OutputIt out, const T& str) {
*out++ = '\\'; *out++ = '\\';
break; break;
default: default:
if (is_printable_ascii(c)) break; for (Char escape_char : basic_string_view<Char>(
if (sizeof(Char) != 1 && c >= 0x80) break; escape.begin, to_unsigned(escape.end - escape.begin))) {
out = format_to(out, "\\x{:02x}", c); out = format_to(out, "\\x{:02x}", escape_char);
}
continue; continue;
} }
*out++ = c; *out++ = c;
} } while (begin != end);
*out++ = '"'; *out++ = '"';
return out; return out;
} }

View File

@ -264,7 +264,8 @@ TEST(ranges_test, join_range) {
#endif // FMT_RANGES_TEST_ENABLE_JOIN #endif // FMT_RANGES_TEST_ENABLE_JOIN
TEST(ranges_test, escape_string) { TEST(ranges_test, escape_string) {
EXPECT_EQ(fmt::format("{}", std::vector<std::string>{"\n\r\t\"\\"}), using vec = std::vector<std::string>;
"[\"\\n\\r\\t\\\"\\\\\"]"); EXPECT_EQ(fmt::format("{}", vec{"\n\r\t\"\\"}), "[\"\\n\\r\\t\\\"\\\\\"]");
EXPECT_EQ(fmt::format("{}", std::vector<std::string>{"\x7"}), "[\"\\x07\"]"); EXPECT_EQ(fmt::format("{}", vec{"\x07"}), "[\"\\x07\"]");
} EXPECT_EQ(fmt::format("{}", vec{"\x7f"}), "[\"\\x7f\"]");
}