mirror of
https://github.com/fmtlib/fmt.git
synced 2025-07-31 11:17:35 +02:00
Implement utf8_to_utf16 using utf8_decode
This commit is contained in:
@ -1214,6 +1214,61 @@ int snprintf_float(T value, int precision, float_specs specs,
|
|||||||
return exp - fraction_size;
|
return exp - fraction_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A public domain branchless UTF-8 decoder by Christopher Wellons:
|
||||||
|
// https://github.com/skeeto/branchless-utf8
|
||||||
|
/* Decode the next character, c, from buf, reporting errors in e.
|
||||||
|
*
|
||||||
|
* Since this is a branchless decoder, four bytes will be read from the
|
||||||
|
* buffer regardless of the actual length of the next character. This
|
||||||
|
* means the buffer _must_ have at least three bytes of zero padding
|
||||||
|
* following the end of the data stream.
|
||||||
|
*
|
||||||
|
* Errors are reported in e, which will be non-zero if the parsed
|
||||||
|
* character was somehow invalid: invalid byte sequence, non-canonical
|
||||||
|
* encoding, or a surrogate half.
|
||||||
|
*
|
||||||
|
* The function returns a pointer to the next character. When an error
|
||||||
|
* occurs, this pointer will be a guess that depends on the particular
|
||||||
|
* error, but it will always advance at least one byte.
|
||||||
|
*/
|
||||||
|
FMT_FUNC const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
|
||||||
|
static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
|
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
|
||||||
|
0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
|
||||||
|
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
|
||||||
|
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
|
||||||
|
static const int shiftc[] = {0, 18, 12, 6, 0};
|
||||||
|
static const int shifte[] = {0, 6, 4, 2, 0};
|
||||||
|
|
||||||
|
auto s = reinterpret_cast<const unsigned char*>(buf);
|
||||||
|
int len = lengths[s[0] >> 3];
|
||||||
|
|
||||||
|
// Compute the pointer to the next character early so that the next
|
||||||
|
// iteration can start working on the next character. Neither Clang
|
||||||
|
// nor GCC figure out this reordering on their own.
|
||||||
|
const char* next = buf + len + !len;
|
||||||
|
|
||||||
|
// Assume a four-byte character and load four bytes. Unused bits are
|
||||||
|
// shifted out.
|
||||||
|
*c = uint32_t(s[0] & masks[len]) << 18;
|
||||||
|
*c |= uint32_t(s[1] & 0x3f) << 12;
|
||||||
|
*c |= uint32_t(s[2] & 0x3f) << 6;
|
||||||
|
*c |= uint32_t(s[3] & 0x3f) << 0;
|
||||||
|
*c >>= shiftc[len];
|
||||||
|
|
||||||
|
// Accumulate the various error conditions.
|
||||||
|
*e = (*c < mins[len]) << 6; // non-canonical encoding
|
||||||
|
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
|
||||||
|
*e |= (*c > 0x10FFFF) << 8; // out of range?
|
||||||
|
*e |= (s[1] & 0xc0) >> 2;
|
||||||
|
*e |= (s[2] & 0xc0) >> 4;
|
||||||
|
*e |= (s[3]) >> 6;
|
||||||
|
*e ^= 0x2a; // top two bits of each tail byte correct?
|
||||||
|
*e >>= shifte[len];
|
||||||
|
|
||||||
|
return next;
|
||||||
|
}
|
||||||
} // namespace internal
|
} // namespace internal
|
||||||
|
|
||||||
template <> struct formatter<internal::bigint> {
|
template <> struct formatter<internal::bigint> {
|
||||||
@ -1240,87 +1295,24 @@ template <> struct formatter<internal::bigint> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// A public domain branchless UTF-8 decoder:
|
FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
|
||||||
// https://github.com/skeeto/branchless-utf8
|
for (auto p = s.data(), end = p + s.size(); p != end;) {
|
||||||
/* Decode the next character, C, from BUF, reporting errors in E.
|
auto cp = uint32_t();
|
||||||
*
|
auto error = 0;
|
||||||
* Since this is a branchless decoder, four bytes will be read from the
|
p = utf8_decode(p, &cp, &error);
|
||||||
* buffer regardless of the actual length of the next character. This
|
if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
|
||||||
* means the buffer _must_ have at least three bytes of zero padding
|
if (cp <= 0xFFFF) {
|
||||||
* following the end of the data stream.
|
buffer_.push_back(static_cast<wchar_t>(cp));
|
||||||
*
|
} else {
|
||||||
* Errors are reported in E, which will be non-zero if the parsed
|
cp -= 0x10000;
|
||||||
* character was somehow invalid: invalid byte sequence, non-canonical
|
buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
|
||||||
* encoding, or a surrogate half.
|
buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
|
||||||
*
|
}
|
||||||
* The function returns a pointer to the next character. When an error
|
}
|
||||||
* occurs, this pointer will be a guess that depends on the particular
|
buffer_.push_back(0);
|
||||||
* error, but it will always advance at least one byte.
|
|
||||||
*/
|
|
||||||
static void* utf8_decode(void* buf, uint32_t* c, int* e) {
|
|
||||||
static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
|
||||||
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
|
|
||||||
0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
|
|
||||||
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
|
|
||||||
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
|
|
||||||
static const int shiftc[] = {0, 18, 12, 6, 0};
|
|
||||||
static const int shifte[] = {0, 6, 4, 2, 0};
|
|
||||||
|
|
||||||
auto s = reinterpret_cast<unsigned char*>(buf);
|
|
||||||
int len = lengths[s[0] >> 3];
|
|
||||||
|
|
||||||
/* Compute the pointer to the next character early so that the next
|
|
||||||
* iteration can start working on the next character. Neither Clang
|
|
||||||
* nor GCC figure out this reordering on their own.
|
|
||||||
*/
|
|
||||||
unsigned char* next = s + len + !len;
|
|
||||||
|
|
||||||
/* Assume a four-byte character and load four bytes. Unused bits are
|
|
||||||
* shifted out.
|
|
||||||
*/
|
|
||||||
*c = (uint32_t)(s[0] & masks[len]) << 18;
|
|
||||||
*c |= (uint32_t)(s[1] & 0x3f) << 12;
|
|
||||||
*c |= (uint32_t)(s[2] & 0x3f) << 6;
|
|
||||||
*c |= (uint32_t)(s[3] & 0x3f) << 0;
|
|
||||||
*c >>= shiftc[len];
|
|
||||||
|
|
||||||
/* Accumulate the various error conditions. */
|
|
||||||
*e = (*c < mins[len]) << 6; // non-canonical encoding
|
|
||||||
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
|
|
||||||
*e |= (*c > 0x10FFFF) << 8; // out of range?
|
|
||||||
*e |= (s[1] & 0xc0) >> 2;
|
|
||||||
*e |= (s[2] & 0xc0) >> 4;
|
|
||||||
*e |= (s[3]) >> 6;
|
|
||||||
*e ^= 0x2a; // top two bits of each tail byte correct?
|
|
||||||
*e >>= shifte[len];
|
|
||||||
|
|
||||||
return next;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if FMT_USE_WINDOWS_H
|
#if FMT_USE_WINDOWS_H
|
||||||
|
|
||||||
FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
|
|
||||||
static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
|
|
||||||
if (s.size() > INT_MAX)
|
|
||||||
FMT_THROW(windows_error(ERROR_INVALID_PARAMETER, ERROR_MSG));
|
|
||||||
int s_size = static_cast<int>(s.size());
|
|
||||||
if (s_size == 0) {
|
|
||||||
// MultiByteToWideChar does not support zero length, handle separately.
|
|
||||||
buffer_.resize(1);
|
|
||||||
buffer_[0] = 0;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(),
|
|
||||||
s_size, nullptr, 0);
|
|
||||||
if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
|
|
||||||
buffer_.resize(length + 1);
|
|
||||||
length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size,
|
|
||||||
&buffer_[0], length);
|
|
||||||
if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
|
|
||||||
buffer_[length] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
FMT_FUNC internal::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
|
FMT_FUNC internal::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
|
||||||
if (int error_code = convert(s)) {
|
if (int error_code = convert(s)) {
|
||||||
FMT_THROW(windows_error(error_code,
|
FMT_THROW(windows_error(error_code,
|
||||||
@ -1389,7 +1381,6 @@ FMT_FUNC void internal::format_windows_error(internal::buffer<char>& out,
|
|||||||
FMT_CATCH(...) {}
|
FMT_CATCH(...) {}
|
||||||
format_error_code(out, error_code, message);
|
format_error_code(out, error_code, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // FMT_USE_WINDOWS_H
|
#endif // FMT_USE_WINDOWS_H
|
||||||
|
|
||||||
FMT_FUNC void format_system_error(internal::buffer<char>& out, int error_code,
|
FMT_FUNC void format_system_error(internal::buffer<char>& out, int error_code,
|
||||||
|
@ -940,29 +940,28 @@ inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
|
|||||||
return internal::copy_str<Char>(buffer, buffer + num_digits, out);
|
return internal::copy_str<Char>(buffer, buffer + num_digits, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef _WIN32
|
|
||||||
# define FMT_USE_WINDOWS_H 0
|
|
||||||
#elif !defined(FMT_USE_WINDOWS_H)
|
|
||||||
# define FMT_USE_WINDOWS_H 1
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
|
|
||||||
// All the functionality that relies on it will be disabled too.
|
|
||||||
#if FMT_USE_WINDOWS_H
|
|
||||||
// A converter from UTF-8 to UTF-16.
|
// A converter from UTF-8 to UTF-16.
|
||||||
// It is only provided for Windows since other systems support UTF-8 natively.
|
|
||||||
class utf8_to_utf16 {
|
class utf8_to_utf16 {
|
||||||
private:
|
private:
|
||||||
wmemory_buffer buffer_;
|
wmemory_buffer buffer_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
FMT_API explicit utf8_to_utf16(string_view s);
|
FMT_API explicit utf8_to_utf16(string_view s);
|
||||||
operator wstring_view() const { return wstring_view(&buffer_[0], size()); }
|
operator wstring_view() const { return {&buffer_[0], size()}; }
|
||||||
size_t size() const { return buffer_.size() - 1; }
|
size_t size() const { return buffer_.size() - 1; }
|
||||||
const wchar_t* c_str() const { return &buffer_[0]; }
|
const wchar_t* c_str() const { return &buffer_[0]; }
|
||||||
std::wstring str() const { return std::wstring(&buffer_[0], size()); }
|
std::wstring str() const { return {&buffer_[0], size()}; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
|
||||||
|
// All the functionality that relies on it will be disabled too.
|
||||||
|
#ifndef _WIN32
|
||||||
|
# define FMT_USE_WINDOWS_H 0
|
||||||
|
#elif !defined(FMT_USE_WINDOWS_H)
|
||||||
|
# define FMT_USE_WINDOWS_H 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if FMT_USE_WINDOWS_H
|
||||||
// A converter from UTF-16 to UTF-8.
|
// A converter from UTF-16 to UTF-8.
|
||||||
// It is only provided for Windows since other systems support UTF-8 natively.
|
// It is only provided for Windows since other systems support UTF-8 natively.
|
||||||
class utf16_to_utf8 {
|
class utf16_to_utf8 {
|
||||||
|
@ -400,6 +400,23 @@ TEST(MemoryBufferTest, ExceptionInDeallocate) {
|
|||||||
EXPECT_CALL(alloc, deallocate(&mem2[0], 2 * size));
|
EXPECT_CALL(alloc, deallocate(&mem2[0], 2 * size));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(UtilTest, UTF8ToUTF16) {
|
||||||
|
fmt::internal::utf8_to_utf16 u("лошадка");
|
||||||
|
EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", u.str());
|
||||||
|
EXPECT_EQ(7, u.size());
|
||||||
|
// U+10437 { DESERET SMALL LETTER YEE }
|
||||||
|
EXPECT_EQ(L"\xD801\xDC37", fmt::internal::utf8_to_utf16("𐐷").str());
|
||||||
|
EXPECT_THROW_MSG(fmt::internal::utf8_to_utf16("\xc3\x28"), std::runtime_error,
|
||||||
|
"invalid utf8");
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(UtilTest, UTF8ToUTF16EmptyString) {
|
||||||
|
std::string s = "";
|
||||||
|
fmt::internal::utf8_to_utf16 u(s.c_str());
|
||||||
|
EXPECT_EQ(L"", u.str());
|
||||||
|
EXPECT_EQ(s.size(), u.size());
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
TEST(UtilTest, UTF16ToUTF8) {
|
TEST(UtilTest, UTF16ToUTF8) {
|
||||||
std::string s = "ёжик";
|
std::string s = "ёжик";
|
||||||
@ -415,20 +432,6 @@ TEST(UtilTest, UTF16ToUTF8EmptyString) {
|
|||||||
EXPECT_EQ(s.size(), u.size());
|
EXPECT_EQ(s.size(), u.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(UtilTest, UTF8ToUTF16) {
|
|
||||||
std::string s = "лошадка";
|
|
||||||
fmt::internal::utf8_to_utf16 u(s.c_str());
|
|
||||||
EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", u.str());
|
|
||||||
EXPECT_EQ(7, u.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(UtilTest, UTF8ToUTF16EmptyString) {
|
|
||||||
std::string s = "";
|
|
||||||
fmt::internal::utf8_to_utf16 u(s.c_str());
|
|
||||||
EXPECT_EQ(L"", u.str());
|
|
||||||
EXPECT_EQ(s.size(), u.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename Converter, typename Char>
|
template <typename Converter, typename Char>
|
||||||
void check_utf_conversion_error(
|
void check_utf_conversion_error(
|
||||||
const char* message,
|
const char* message,
|
||||||
@ -450,13 +453,6 @@ TEST(UtilTest, UTF16ToUTF8Error) {
|
|||||||
"cannot convert string from UTF-16 to UTF-8");
|
"cannot convert string from UTF-16 to UTF-8");
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(UtilTest, UTF8ToUTF16Error) {
|
|
||||||
const char* message = "cannot convert string from UTF-8 to UTF-16";
|
|
||||||
check_utf_conversion_error<fmt::internal::utf8_to_utf16, char>(message);
|
|
||||||
check_utf_conversion_error<fmt::internal::utf8_to_utf16, char>(
|
|
||||||
message, fmt::string_view("foo", INT_MAX + 1u));
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(UtilTest, UTF16ToUTF8Convert) {
|
TEST(UtilTest, UTF16ToUTF8Convert) {
|
||||||
fmt::internal::utf16_to_utf8 u;
|
fmt::internal::utf16_to_utf8 u;
|
||||||
EXPECT_EQ(ERROR_INVALID_PARAMETER, u.convert(fmt::wstring_view(0, 1)));
|
EXPECT_EQ(ERROR_INVALID_PARAMETER, u.convert(fmt::wstring_view(0, 1)));
|
||||||
@ -1237,8 +1233,7 @@ TEST(FormatterTest, Precision) {
|
|||||||
format_error,
|
format_error,
|
||||||
"precision not allowed for this argument type");
|
"precision not allowed for this argument type");
|
||||||
EXPECT_THROW_MSG(format("{:.{}e}", 42.0, fmt::internal::max_value<int>()),
|
EXPECT_THROW_MSG(format("{:.{}e}", 42.0, fmt::internal::max_value<int>()),
|
||||||
format_error,
|
format_error, "number is too big");
|
||||||
"number is too big");
|
|
||||||
|
|
||||||
EXPECT_EQ("st", format("{0:.2}", "str"));
|
EXPECT_EQ("st", format("{0:.2}", "str"));
|
||||||
}
|
}
|
||||||
@ -1875,8 +1870,8 @@ TEST(FormatTest, Dynamic) {
|
|||||||
args.emplace_back(fmt::internal::make_arg<ctx>(1.5f));
|
args.emplace_back(fmt::internal::make_arg<ctx>(1.5f));
|
||||||
|
|
||||||
std::string result = fmt::vformat(
|
std::string result = fmt::vformat(
|
||||||
"{} and {} and {}", fmt::basic_format_args<ctx>(
|
"{} and {} and {}",
|
||||||
args.data(), static_cast<int>(args.size())));
|
fmt::basic_format_args<ctx>(args.data(), static_cast<int>(args.size())));
|
||||||
|
|
||||||
EXPECT_EQ("42 and abc1 and 1.5", result);
|
EXPECT_EQ("42 and abc1 and 1.5", result);
|
||||||
}
|
}
|
||||||
@ -2266,9 +2261,7 @@ struct test_format_specs_handler {
|
|||||||
|
|
||||||
FMT_CONSTEXPR void on_precision(int p) { precision = p; }
|
FMT_CONSTEXPR void on_precision(int p) { precision = p; }
|
||||||
FMT_CONSTEXPR void on_dynamic_precision(fmt::internal::auto_id) {}
|
FMT_CONSTEXPR void on_dynamic_precision(fmt::internal::auto_id) {}
|
||||||
FMT_CONSTEXPR void on_dynamic_precision(int index) {
|
FMT_CONSTEXPR void on_dynamic_precision(int index) { precision_ref = index; }
|
||||||
precision_ref = index;
|
|
||||||
}
|
|
||||||
FMT_CONSTEXPR void on_dynamic_precision(string_view) {}
|
FMT_CONSTEXPR void on_dynamic_precision(string_view) {}
|
||||||
|
|
||||||
FMT_CONSTEXPR void end_precision() {}
|
FMT_CONSTEXPR void end_precision() {}
|
||||||
|
Reference in New Issue
Block a user