Make path formatting lossless with WTF-8

This commit is contained in:
Victor Zverovich
2025-11-05 09:19:10 -10:00
parent a4c7e17133
commit 1122268510
3 changed files with 29 additions and 16 deletions

View File

@@ -1311,7 +1311,13 @@ class utf8_to_utf16 {
inline auto str() const -> std::wstring { return {&buffer_[0], size()}; }
};
enum class to_utf8_error_policy { abort, replace };
enum class to_utf8_error_policy { abort, replace, wtf };
// Appends a three-byte UTF-8-style sequence for `cp` to `buf`:
//   lead byte  0xE0 | bits 12 and above,
//   then       0x80 | bits 11..6,
//   then       0x80 | bits 5..0.
// No range validation is done here; callers decide which values are passed
// (ordinary BMP code points, or an unpaired surrogate under the wtf policy).
inline void to_utf8_3bytes(buffer<char>& buf, uint32_t cp) {
  const auto lead_byte = 0xe0 | (cp >> 12);
  const auto middle_byte = 0x80 | ((cp & 0xfff) >> 6);
  const auto last_byte = 0x80 | (cp & 0x3f);
  buf.push_back(static_cast<char>(lead_byte));
  buf.push_back(static_cast<char>(middle_byte));
  buf.push_back(static_cast<char>(last_byte));
}
// A converter from UTF-16/UTF-32 (host endian) to UTF-8.
template <typename WChar, typename Buffer = memory_buffer> class to_utf8 {
@@ -1353,8 +1359,16 @@ template <typename WChar, typename Buffer = memory_buffer> class to_utf8 {
// Handle a surrogate pair.
++p;
if (p == s.end() || (c & 0xfc00) != 0xd800 || (*p & 0xfc00) != 0xdc00) {
if (policy == to_utf8_error_policy::abort) return false;
switch (policy) {
case to_utf8_error_policy::abort:
return false;
case to_utf8_error_policy::replace:
buf.append(string_view("\xEF\xBF\xBD"));
break;
case to_utf8_error_policy::wtf:
to_utf8_3bytes(buf, c);
break;
}
--p;
continue;
}
@@ -1366,9 +1380,7 @@ template <typename WChar, typename Buffer = memory_buffer> class to_utf8 {
buf.push_back(static_cast<char>(0xc0 | (c >> 6)));
buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
} else if ((c >= 0x800 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff)) {
buf.push_back(static_cast<char>(0xe0 | (c >> 12)));
buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
to_utf8_3bytes(buf, c);
} else if (c >= 0x10000 && c <= 0x10ffff) {
buf.push_back(static_cast<char>(0xf0 | (c >> 18)));
buf.push_back(static_cast<char>(0x80 | ((c & 0x3ffff) >> 12)));

View File

@@ -84,11 +84,13 @@ namespace detail {
template <typename Char, typename PathChar>
auto get_path_string(const std::filesystem::path& p,
const std::basic_string<PathChar>& native) {
if constexpr (std::is_same_v<Char, char> && std::is_same_v<PathChar, wchar_t>)
return to_utf8<wchar_t>(native, to_utf8_error_policy::replace);
else
if constexpr (std::is_same_v<Char, char> &&
std::is_same_v<PathChar, wchar_t>) {
return to_utf8<wchar_t>(native, to_utf8_error_policy::wtf);
} else {
return p.string<Char>();
}
}
template <typename Char, typename PathChar>
void write_escaped_path(basic_memory_buffer<Char>& quoted,

View File

@@ -39,13 +39,12 @@ TEST(std_test, path) {
EXPECT_EQ(fmt::format("{}", path(L"\x0428\x0447\x0443\x0447\x044B\x043D\x0448"
L"\x0447\x044B\x043D\x0430")),
"Шчучыншчына");
EXPECT_EQ(fmt::format("{}", path(L"\xd800")), "�");
EXPECT_EQ(fmt::format("{}", path(L"HEAD \xd800 TAIL")), "HEAD � TAIL");
EXPECT_EQ(fmt::format("{}", path(L"HEAD \xD83D\xDE00 TAIL")),
"HEAD \xF0\x9F\x98\x80 TAIL");
EXPECT_EQ(fmt::format("{}", path(L"HEAD \xD83D\xD83D\xDE00 TAIL")),
"HEAD �\xF0\x9F\x98\x80 TAIL");
EXPECT_EQ(fmt::format("{:?}", path(L"\xd800")), "\"\\ud800\"");
EXPECT_EQ(fmt::format("{}", path(L"\xD800")), "\xED\xA0\x80");
EXPECT_EQ(fmt::format("{}", path(L"[\xD800]")), "[\xED\xA0\x80]");
EXPECT_EQ(fmt::format("{}", path(L"[\xD83D\xDE00]")), "[\xF0\x9F\x98\x80]");
EXPECT_EQ(fmt::format("{}", path(L"[\xD83D\xD83D\xDE00]")),
"[\xED\xA0\xBD\xF0\x9F\x98\x80]");
EXPECT_EQ(fmt::format("{:?}", path(L"\xD800")), "\"\\ud800\"");
# endif
}