mirror of
https://github.com/fmtlib/fmt.git
synced 2025-07-29 18:27:40 +02:00
Implement utf8_to_utf16 using utf8_decode
This commit is contained in:
@ -1214,6 +1214,61 @@ int snprintf_float(T value, int precision, float_specs specs,
|
||||
return exp - fraction_size;
|
||||
}
|
||||
}
|
||||
|
||||
// A public domain branchless UTF-8 decoder by Christopher Wellons:
// https://github.com/skeeto/branchless-utf8
/* Decode the next character, c, from buf, reporting errors in e.
 *
 * Since this is a branchless decoder, four bytes will be read from the
 * buffer regardless of the actual length of the next character. This
 * means the buffer _must_ have at least three bytes of zero padding
 * following the end of the data stream.
 *
 * Errors are reported in e, which will be non-zero if the parsed
 * character was somehow invalid: invalid byte sequence, non-canonical
 * encoding, or a surrogate half.
 *
 * The function returns a pointer to the next character. When an error
 * occurs, this pointer will be a guess that depends on the particular
 * error, but it will always advance at least one byte.
 */
FMT_FUNC const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
  // Sequence length indexed by the top five bits of the lead byte. A length
  // of 0 marks a byte that cannot start a sequence.
  static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                                 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                                 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
  // Payload bits of the lead byte, indexed by sequence length.
  static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
  // Smallest code point that may be encoded with each length. The entry for
  // length 0 is larger than any value the decoder can assemble, so an invalid
  // lead byte always raises the non-canonical error bit.
  static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
  // Right shift that drops the bits loaded from bytes beyond the sequence.
  static const int shiftc[] = {0, 18, 12, 6, 0};
  // Right shift that drops the tail-byte checks for bytes beyond the sequence.
  static const int shifte[] = {0, 6, 4, 2, 0};

  auto s = reinterpret_cast<const unsigned char*>(buf);
  int len = lengths[s[0] >> 3];

  // Compute the pointer to the next character early so that the next
  // iteration can start working on the next character. Neither Clang
  // nor GCC figure out this reordering on their own.
  // `!len` makes an invalid lead byte (len == 0) still advance by one.
  const char* next = buf + len + !len;

  // Assume a four-byte character and load four bytes. Unused bits are
  // shifted out.
  *c = uint32_t(s[0] & masks[len]) << 18;
  *c |= uint32_t(s[1] & 0x3f) << 12;
  *c |= uint32_t(s[2] & 0x3f) << 6;
  *c |= uint32_t(s[3] & 0x3f) << 0;
  *c >>= shiftc[len];

  // Accumulate the various error conditions.
  *e = (*c < mins[len]) << 6;       // non-canonical encoding
  *e |= ((*c >> 11) == 0x1b) << 7;  // surrogate half?
  *e |= (*c > 0x10FFFF) << 8;       // out of range?
  // Pack the top two bits of the three tail bytes into bits 5..0. Every
  // well-formed tail byte contributes the pattern 10, giving 0x2a in total,
  // so the XOR below leaves zero exactly when all of them are well formed.
  *e |= (s[1] & 0xc0) >> 2;
  *e |= (s[2] & 0xc0) >> 4;
  *e |= (s[3]) >> 6;
  *e ^= 0x2a;  // top two bits of each tail byte correct?
  *e >>= shifte[len];

  return next;
}
|
||||
} // namespace internal
|
||||
|
||||
template <> struct formatter<internal::bigint> {
|
||||
@ -1240,87 +1295,24 @@ template <> struct formatter<internal::bigint> {
|
||||
}
|
||||
};
|
||||
|
||||
// A public domain branchless UTF-8 decoder:
|
||||
// https://github.com/skeeto/branchless-utf8
|
||||
/* Decode the next character, C, from BUF, reporting errors in E.
|
||||
*
|
||||
* Since this is a branchless decoder, four bytes will be read from the
|
||||
* buffer regardless of the actual length of the next character. This
|
||||
* means the buffer _must_ have at least three bytes of zero padding
|
||||
* following the end of the data stream.
|
||||
*
|
||||
* Errors are reported in E, which will be non-zero if the parsed
|
||||
* character was somehow invalid: invalid byte sequence, non-canonical
|
||||
* encoding, or a surrogate half.
|
||||
*
|
||||
* The function returns a pointer to the next character. When an error
|
||||
* occurs, this pointer will be a guess that depends on the particular
|
||||
* error, but it will always advance at least one byte.
|
||||
*/
|
||||
static void* utf8_decode(void* buf, uint32_t* c, int* e) {
|
||||
static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
|
||||
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
|
||||
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
|
||||
static const int shiftc[] = {0, 18, 12, 6, 0};
|
||||
static const int shifte[] = {0, 6, 4, 2, 0};
|
||||
|
||||
auto s = reinterpret_cast<unsigned char*>(buf);
|
||||
int len = lengths[s[0] >> 3];
|
||||
|
||||
/* Compute the pointer to the next character early so that the next
|
||||
* iteration can start working on the next character. Neither Clang
|
||||
* nor GCC figure out this reordering on their own.
|
||||
*/
|
||||
unsigned char* next = s + len + !len;
|
||||
|
||||
/* Assume a four-byte character and load four bytes. Unused bits are
|
||||
* shifted out.
|
||||
*/
|
||||
*c = (uint32_t)(s[0] & masks[len]) << 18;
|
||||
*c |= (uint32_t)(s[1] & 0x3f) << 12;
|
||||
*c |= (uint32_t)(s[2] & 0x3f) << 6;
|
||||
*c |= (uint32_t)(s[3] & 0x3f) << 0;
|
||||
*c >>= shiftc[len];
|
||||
|
||||
/* Accumulate the various error conditions. */
|
||||
*e = (*c < mins[len]) << 6; // non-canonical encoding
|
||||
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
|
||||
*e |= (*c > 0x10FFFF) << 8; // out of range?
|
||||
*e |= (s[1] & 0xc0) >> 2;
|
||||
*e |= (s[2] & 0xc0) >> 4;
|
||||
*e |= (s[3]) >> 6;
|
||||
*e ^= 0x2a; // top two bits of each tail byte correct?
|
||||
*e >>= shifte[len];
|
||||
|
||||
return next;
|
||||
// Converts the UTF-8 string `s` to UTF-16 code units and stores them in
// buffer_, followed by a terminating zero. Code points above 0xFFFF are
// written as a surrogate pair. Throws std::runtime_error("invalid utf8") if
// the decoder reports an error for any sequence.
FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
  // Decodes one code point starting at p, appends its UTF-16 form to buffer_
  // and returns the position at which the next code point starts.
  auto transcode = [this](const char* p) -> const char* {
    auto cp = uint32_t();
    auto error = 0;
    p = utf8_decode(p, &cp, &error);
    if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
    if (cp <= 0xFFFF) {
      buffer_.push_back(static_cast<wchar_t>(cp));
    } else {
      cp -= 0x10000;
      buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
      buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
    }
    return p;
  };

  // utf8_decode unconditionally reads four bytes starting at its argument.
  const size_t block_size = 4;
  auto p = s.data();
  if (s.size() >= block_size) {
    // Decode directly from the input only while a full block is readable,
    // so the decoder never touches memory past the end of `s`.
    for (auto end = p + s.size() - block_size + 1; p < end;) p = transcode(p);
  }

  // At most block_size - 1 bytes remain. Decode them from a zero-padded
  // scratch copy: the padding keeps every four-byte read in bounds and makes
  // a sequence truncated by the end of the input decode as an error instead
  // of consuming bytes that are not part of `s`.
  auto num_chars_left = s.data() + s.size() - p;
  if (num_chars_left != 0) {
    char padded[2 * block_size - 1] = {};
    for (decltype(num_chars_left) i = 0; i < num_chars_left; ++i)
      padded[i] = p[i];
    p = padded;
    do {
      p = transcode(p);
    } while (p - padded < num_chars_left);
  }
  buffer_.push_back(0);
}
|
||||
|
||||
#if FMT_USE_WINDOWS_H
|
||||
|
||||
// Converts the UTF-8 string `s` to UTF-16 with the Win32 MultiByteToWideChar
// API and stores the zero-terminated result in buffer_. Throws windows_error
// when `s` is longer than INT_MAX or when either API call returns 0.
FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
  static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
  // The API takes the input length as an int, so larger inputs are rejected.
  if (s.size() > INT_MAX)
    FMT_THROW(windows_error(ERROR_INVALID_PARAMETER, ERROR_MSG));
  int input_size = static_cast<int>(s.size());
  if (input_size == 0) {
    // MultiByteToWideChar does not support zero length, so the empty string
    // is produced by hand: a buffer holding only the terminator.
    buffer_.resize(1);
    buffer_[0] = 0;
    return;
  }

  // First call: no output buffer is supplied; the returned value is used
  // below as the number of wide characters to make room for.
  int wide_length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
                                        s.data(), input_size, nullptr, 0);
  if (wide_length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));

  // One extra element for the terminating zero.
  buffer_.resize(wide_length + 1);

  // Second call: perform the conversion into buffer_.
  wide_length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(),
                                    input_size, &buffer_[0], wide_length);
  if (wide_length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
  buffer_[wide_length] = 0;
}
|
||||
|
||||
FMT_FUNC internal::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
|
||||
if (int error_code = convert(s)) {
|
||||
FMT_THROW(windows_error(error_code,
|
||||
@ -1389,7 +1381,6 @@ FMT_FUNC void internal::format_windows_error(internal::buffer<char>& out,
|
||||
FMT_CATCH(...) {}
|
||||
format_error_code(out, error_code, message);
|
||||
}
|
||||
|
||||
#endif // FMT_USE_WINDOWS_H
|
||||
|
||||
FMT_FUNC void format_system_error(internal::buffer<char>& out, int error_code,
|
||||
|
@ -940,29 +940,28 @@ inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
|
||||
return internal::copy_str<Char>(buffer, buffer + num_digits, out);
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
# define FMT_USE_WINDOWS_H 0
|
||||
#elif !defined(FMT_USE_WINDOWS_H)
|
||||
# define FMT_USE_WINDOWS_H 1
|
||||
#endif
|
||||
|
||||
// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
|
||||
// All the functionality that relies on it will be disabled too.
|
||||
#if FMT_USE_WINDOWS_H
|
||||
// A converter from UTF-8 to UTF-16.
|
||||
// It is only provided for Windows since other systems support UTF-8 natively.
|
||||
class utf8_to_utf16 {
|
||||
private:
|
||||
wmemory_buffer buffer_;
|
||||
|
||||
public:
|
||||
FMT_API explicit utf8_to_utf16(string_view s);
|
||||
operator wstring_view() const { return wstring_view(&buffer_[0], size()); }
|
||||
operator wstring_view() const { return {&buffer_[0], size()}; }
|
||||
size_t size() const { return buffer_.size() - 1; }
|
||||
const wchar_t* c_str() const { return &buffer_[0]; }
|
||||
std::wstring str() const { return std::wstring(&buffer_[0], size()); }
|
||||
std::wstring str() const { return {&buffer_[0], size()}; }
|
||||
};
|
||||
|
||||
// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
|
||||
// All the functionality that relies on it will be disabled too.
|
||||
#ifndef _WIN32
|
||||
# define FMT_USE_WINDOWS_H 0
|
||||
#elif !defined(FMT_USE_WINDOWS_H)
|
||||
# define FMT_USE_WINDOWS_H 1
|
||||
#endif
|
||||
|
||||
#if FMT_USE_WINDOWS_H
|
||||
// A converter from UTF-16 to UTF-8.
|
||||
// It is only provided for Windows since other systems support UTF-8 natively.
|
||||
class utf16_to_utf8 {
|
||||
|
@ -400,6 +400,23 @@ TEST(MemoryBufferTest, ExceptionInDeallocate) {
|
||||
EXPECT_CALL(alloc, deallocate(&mem2[0], 2 * size));
|
||||
}
|
||||
|
||||
// Checks utf8_to_utf16 on two-byte (Cyrillic) input, on a four-byte sequence
// that must come out as a surrogate pair, and on a malformed sequence that
// must throw.
TEST(UtilTest, UTF8ToUTF16) {
  using fmt::internal::utf8_to_utf16;

  // Seven Cyrillic letters, each encoded with two UTF-8 bytes.
  utf8_to_utf16 cyrillic("лошадка");
  EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", cyrillic.str());
  EXPECT_EQ(7, cyrillic.size());

  // U+10437 { DESERET SMALL LETTER YEE }
  EXPECT_EQ(L"\xD801\xDC37", utf8_to_utf16("𐐷").str());

  // 0xc3 starts a two-byte sequence, but 0x28 is not a continuation byte.
  EXPECT_THROW_MSG(utf8_to_utf16("\xc3\x28"), std::runtime_error,
                   "invalid utf8");
}
|
||||
|
||||
// Converting an empty string must give an empty wide string of size zero.
TEST(UtilTest, UTF8ToUTF16EmptyString) {
  std::string empty = "";
  fmt::internal::utf8_to_utf16 converted(empty.c_str());
  EXPECT_EQ(L"", converted.str());
  EXPECT_EQ(empty.size(), converted.size());
}
|
||||
|
||||
#ifdef _WIN32
|
||||
TEST(UtilTest, UTF16ToUTF8) {
|
||||
std::string s = "ёжик";
|
||||
@ -415,20 +432,6 @@ TEST(UtilTest, UTF16ToUTF8EmptyString) {
|
||||
EXPECT_EQ(s.size(), u.size());
|
||||
}
|
||||
|
||||
// Converts a Cyrillic word held in a std::string and checks both the
// resulting wide string and its length.
TEST(UtilTest, UTF8ToUTF16) {
  std::string utf8 = "лошадка";
  fmt::internal::utf8_to_utf16 converted(utf8.c_str());
  // Seven letters in, seven wide characters out.
  EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", converted.str());
  EXPECT_EQ(7, converted.size());
}
|
||||
|
||||
// The empty string must convert to an empty wide string with matching size.
TEST(UtilTest, UTF8ToUTF16EmptyString) {
  std::string input = "";
  fmt::internal::utf8_to_utf16 result(input.c_str());
  EXPECT_EQ(L"", result.str());
  EXPECT_EQ(input.size(), result.size());
}
|
||||
|
||||
template <typename Converter, typename Char>
|
||||
void check_utf_conversion_error(
|
||||
const char* message,
|
||||
@ -450,13 +453,6 @@ TEST(UtilTest, UTF16ToUTF8Error) {
|
||||
"cannot convert string from UTF-16 to UTF-8");
|
||||
}
|
||||
|
||||
// Runs the shared conversion-error check against utf8_to_utf16, once with the
// helper's default input and once with a view whose declared length is
// INT_MAX + 1.
// NOTE(review): the helper's definition is not fully visible here; presumably
// it expects the conversion to fail with the given message. Confirm.
TEST(UtilTest, UTF8ToUTF16Error) {
  using converter = fmt::internal::utf8_to_utf16;
  const char* expected = "cannot convert string from UTF-8 to UTF-16";
  check_utf_conversion_error<converter, char>(expected);
  check_utf_conversion_error<converter, char>(
      expected, fmt::string_view("foo", INT_MAX + 1u));
}
|
||||
|
||||
TEST(UtilTest, UTF16ToUTF8Convert) {
|
||||
fmt::internal::utf16_to_utf8 u;
|
||||
EXPECT_EQ(ERROR_INVALID_PARAMETER, u.convert(fmt::wstring_view(0, 1)));
|
||||
@ -1237,8 +1233,7 @@ TEST(FormatterTest, Precision) {
|
||||
format_error,
|
||||
"precision not allowed for this argument type");
|
||||
EXPECT_THROW_MSG(format("{:.{}e}", 42.0, fmt::internal::max_value<int>()),
|
||||
format_error,
|
||||
"number is too big");
|
||||
format_error, "number is too big");
|
||||
|
||||
EXPECT_EQ("st", format("{0:.2}", "str"));
|
||||
}
|
||||
@ -1875,8 +1870,8 @@ TEST(FormatTest, Dynamic) {
|
||||
args.emplace_back(fmt::internal::make_arg<ctx>(1.5f));
|
||||
|
||||
std::string result = fmt::vformat(
|
||||
"{} and {} and {}", fmt::basic_format_args<ctx>(
|
||||
args.data(), static_cast<int>(args.size())));
|
||||
"{} and {} and {}",
|
||||
fmt::basic_format_args<ctx>(args.data(), static_cast<int>(args.size())));
|
||||
|
||||
EXPECT_EQ("42 and abc1 and 1.5", result);
|
||||
}
|
||||
@ -2266,9 +2261,7 @@ struct test_format_specs_handler {
|
||||
|
||||
FMT_CONSTEXPR void on_precision(int p) { precision = p; }
|
||||
FMT_CONSTEXPR void on_dynamic_precision(fmt::internal::auto_id) {}
|
||||
FMT_CONSTEXPR void on_dynamic_precision(int index) {
|
||||
precision_ref = index;
|
||||
}
|
||||
FMT_CONSTEXPR void on_dynamic_precision(int index) { precision_ref = index; }
|
||||
FMT_CONSTEXPR void on_dynamic_precision(string_view) {}
|
||||
|
||||
FMT_CONSTEXPR void end_precision() {}
|
||||
|
Reference in New Issue
Block a user