Unification utf16/utf32 to utf8 conversion

Signed-off-by: Vladislav Shchapov <vladislav@shchapov.ru>
This commit is contained in:
Vladislav Shchapov
2023-05-02 21:29:02 +05:00
committed by Victor Zverovich
parent e84b00e014
commit dde8cf3bb7
7 changed files with 88 additions and 142 deletions

View File

@@ -1418,6 +1418,68 @@ class utf8_to_utf16 {
auto str() const -> std::wstring { return {&buffer_[0], size()}; }
};
// A converter from UTF-16/UTF-32 (host endian) to UTF-8.
template <typename WChar, typename Buffer = memory_buffer>
class unicode_to_utf8 {
private:
Buffer buffer_;
public:
unicode_to_utf8() {}
explicit unicode_to_utf8(basic_string_view<WChar> s) {
static_assert(sizeof(WChar) == 2 || sizeof(WChar) == 4,
"Expect utf16 or utf32");
if (!convert(s))
FMT_THROW(std::runtime_error(sizeof(WChar) == 2 ? "invalid utf16"
: "invalid utf32"));
}
operator string_view() const { return string_view(&buffer_[0], size()); }
size_t size() const { return buffer_.size() - 1; }
const char* c_str() const { return &buffer_[0]; }
std::string str() const { return std::string(&buffer_[0], size()); }
// Performs conversion returning a bool instead of throwing exception on
// conversion error. This method may still throw in case of memory allocation
// error.
bool convert(basic_string_view<WChar> s) {
if (!convert(buffer_, s)) return false;
buffer_.push_back(0);
return true;
}
static bool convert(Buffer& buf, basic_string_view<WChar> s) {
for (auto p = s.begin(); p != s.end(); ++p) {
uint32_t c = static_cast<uint32_t>(*p);
if (sizeof(WChar) == 2 && c >= 0xd800 && c <= 0xdfff) {
// surrogate pair
++p;
if (p == s.end() || (c & 0xfc00) != 0xd800 || (*p & 0xfc00) != 0xdc00) {
return false;
}
c = (c << 10) + static_cast<uint32_t>(*p) - 0x35fdc00;
}
if (c < 0x80) {
buf.push_back(static_cast<char>(c));
} else if (c < 0x800) {
buf.push_back(static_cast<char>(0xc0 | (c >> 6)));
buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
} else if ((c >= 0x800 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xffff)) {
buf.push_back(static_cast<char>(0xe0 | (c >> 12)));
buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
} else if (c >= 0x10000 && c <= 0x10ffff) {
buf.push_back(static_cast<char>(0xf0 | (c >> 18)));
buf.push_back(static_cast<char>(0x80 | ((c & 0x3ffff) >> 12)));
buf.push_back(static_cast<char>(0x80 | ((c & 0xfff) >> 6)));
buf.push_back(static_cast<char>(0x80 | (c & 0x3f)));
} else {
return false;
}
}
return true;
}
};
// Computes 128-bit result of multiplication of two 64-bit unsigned integers.
inline uint128_fallback umul128(uint64_t x, uint64_t y) noexcept {
#if FMT_USE_INT128