Implement utf8_to_utf16 using utf8_decode

This commit is contained in:
Victor Zverovich
2019-12-15 07:45:57 -08:00
parent 0012917f69
commit ec2463c905
3 changed files with 102 additions and 119 deletions

View File

@ -1214,6 +1214,61 @@ int snprintf_float(T value, int precision, float_specs specs,
return exp - fraction_size;
}
}
// A public domain branchless UTF-8 decoder by Christopher Wellons:
// https://github.com/skeeto/branchless-utf8
/* Decode the next character, c, from buf, reporting errors in e.
*
* Since this is a branchless decoder, four bytes will be read from the
* buffer regardless of the actual length of the next character. This
* means the buffer _must_ have at least three bytes of zero padding
* following the end of the data stream.
*
* Errors are reported in e, which will be non-zero if the parsed
* character was somehow invalid: invalid byte sequence, non-canonical
* encoding, or a surrogate half.
*
* The function returns a pointer to the next character. When an error
* occurs, this pointer will be a guess that depends on the particular
* error, but it will always advance at least one byte.
*/
FMT_FUNC const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
static const int shiftc[] = {0, 18, 12, 6, 0};
static const int shifte[] = {0, 6, 4, 2, 0};
auto s = reinterpret_cast<const unsigned char*>(buf);
int len = lengths[s[0] >> 3];
// Compute the pointer to the next character early so that the next
// iteration can start working on the next character. Neither Clang
// nor GCC figure out this reordering on their own.
const char* next = buf + len + !len;
// Assume a four-byte character and load four bytes. Unused bits are
// shifted out.
*c = uint32_t(s[0] & masks[len]) << 18;
*c |= uint32_t(s[1] & 0x3f) << 12;
*c |= uint32_t(s[2] & 0x3f) << 6;
*c |= uint32_t(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
// Accumulate the various error conditions.
*e = (*c < mins[len]) << 6; // non-canonical encoding
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (*c > 0x10FFFF) << 8; // out of range?
*e |= (s[1] & 0xc0) >> 2;
*e |= (s[2] & 0xc0) >> 4;
*e |= (s[3]) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
return next;
}
} // namespace internal
template <> struct formatter<internal::bigint> {
@ -1240,87 +1295,24 @@ template <> struct formatter<internal::bigint> {
}
};
// A public domain branchless UTF-8 decoder:
// https://github.com/skeeto/branchless-utf8
/* Decode the next character, C, from BUF, reporting errors in E.
*
* Since this is a branchless decoder, four bytes will be read from the
* buffer regardless of the actual length of the next character. This
* means the buffer _must_ have at least three bytes of zero padding
* following the end of the data stream.
*
* Errors are reported in E, which will be non-zero if the parsed
* character was somehow invalid: invalid byte sequence, non-canonical
* encoding, or a surrogate half.
*
* The function returns a pointer to the next character. When an error
* occurs, this pointer will be a guess that depends on the particular
* error, but it will always advance at least one byte.
*/
static void* utf8_decode(void* buf, uint32_t* c, int* e) {
static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
static const int shiftc[] = {0, 18, 12, 6, 0};
static const int shifte[] = {0, 6, 4, 2, 0};
auto s = reinterpret_cast<unsigned char*>(buf);
int len = lengths[s[0] >> 3];
/* Compute the pointer to the next character early so that the next
* iteration can start working on the next character. Neither Clang
* nor GCC figure out this reordering on their own.
*/
unsigned char* next = s + len + !len;
/* Assume a four-byte character and load four bytes. Unused bits are
* shifted out.
*/
*c = (uint32_t)(s[0] & masks[len]) << 18;
*c |= (uint32_t)(s[1] & 0x3f) << 12;
*c |= (uint32_t)(s[2] & 0x3f) << 6;
*c |= (uint32_t)(s[3] & 0x3f) << 0;
*c >>= shiftc[len];
/* Accumulate the various error conditions. */
*e = (*c < mins[len]) << 6; // non-canonical encoding
*e |= ((*c >> 11) == 0x1b) << 7; // surrogate half?
*e |= (*c > 0x10FFFF) << 8; // out of range?
*e |= (s[1] & 0xc0) >> 2;
*e |= (s[2] & 0xc0) >> 4;
*e |= (s[3]) >> 6;
*e ^= 0x2a; // top two bits of each tail byte correct?
*e >>= shifte[len];
return next;
FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
for (auto p = s.data(), end = p + s.size(); p != end;) {
auto cp = uint32_t();
auto error = 0;
p = utf8_decode(p, &cp, &error);
if (error != 0) FMT_THROW(std::runtime_error("invalid utf8"));
if (cp <= 0xFFFF) {
buffer_.push_back(static_cast<wchar_t>(cp));
} else {
cp -= 0x10000;
buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
}
}
buffer_.push_back(0);
}
#if FMT_USE_WINDOWS_H
FMT_FUNC internal::utf8_to_utf16::utf8_to_utf16(string_view s) {
static const char ERROR_MSG[] = "cannot convert string from UTF-8 to UTF-16";
if (s.size() > INT_MAX)
FMT_THROW(windows_error(ERROR_INVALID_PARAMETER, ERROR_MSG));
int s_size = static_cast<int>(s.size());
if (s_size == 0) {
// MultiByteToWideChar does not support zero length, handle separately.
buffer_.resize(1);
buffer_[0] = 0;
return;
}
int length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(),
s_size, nullptr, 0);
if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
buffer_.resize(length + 1);
length = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, s.data(), s_size,
&buffer_[0], length);
if (length == 0) FMT_THROW(windows_error(GetLastError(), ERROR_MSG));
buffer_[length] = 0;
}
FMT_FUNC internal::utf16_to_utf8::utf16_to_utf8(wstring_view s) {
if (int error_code = convert(s)) {
FMT_THROW(windows_error(error_code,
@ -1389,7 +1381,6 @@ FMT_FUNC void internal::format_windows_error(internal::buffer<char>& out,
FMT_CATCH(...) {}
format_error_code(out, error_code, message);
}
#endif // FMT_USE_WINDOWS_H
FMT_FUNC void format_system_error(internal::buffer<char>& out, int error_code,

View File

@ -940,29 +940,28 @@ inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
return internal::copy_str<Char>(buffer, buffer + num_digits, out);
}
#ifndef _WIN32
# define FMT_USE_WINDOWS_H 0
#elif !defined(FMT_USE_WINDOWS_H)
# define FMT_USE_WINDOWS_H 1
#endif
// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
// All the functionality that relies on it will be disabled too.
#if FMT_USE_WINDOWS_H
// A converter from UTF-8 to UTF-16.
// It is only provided for Windows since other systems support UTF-8 natively.
class utf8_to_utf16 {
private:
wmemory_buffer buffer_;
public:
FMT_API explicit utf8_to_utf16(string_view s);
operator wstring_view() const { return wstring_view(&buffer_[0], size()); }
operator wstring_view() const { return {&buffer_[0], size()}; }
size_t size() const { return buffer_.size() - 1; }
const wchar_t* c_str() const { return &buffer_[0]; }
std::wstring str() const { return std::wstring(&buffer_[0], size()); }
std::wstring str() const { return {&buffer_[0], size()}; }
};
// Define FMT_USE_WINDOWS_H to 0 to disable use of windows.h.
// All the functionality that relies on it will be disabled too.
#ifndef _WIN32
# define FMT_USE_WINDOWS_H 0
#elif !defined(FMT_USE_WINDOWS_H)
# define FMT_USE_WINDOWS_H 1
#endif
#if FMT_USE_WINDOWS_H
// A converter from UTF-16 to UTF-8.
// It is only provided for Windows since other systems support UTF-8 natively.
class utf16_to_utf8 {

View File

@ -400,6 +400,23 @@ TEST(MemoryBufferTest, ExceptionInDeallocate) {
EXPECT_CALL(alloc, deallocate(&mem2[0], 2 * size));
}
TEST(UtilTest, UTF8ToUTF16) {
fmt::internal::utf8_to_utf16 u("лошадка");
EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", u.str());
EXPECT_EQ(7, u.size());
// U+10437 { DESERET SMALL LETTER YEE }
EXPECT_EQ(L"\xD801\xDC37", fmt::internal::utf8_to_utf16("𐐷").str());
EXPECT_THROW_MSG(fmt::internal::utf8_to_utf16("\xc3\x28"), std::runtime_error,
"invalid utf8");
}
TEST(UtilTest, UTF8ToUTF16EmptyString) {
std::string s = "";
fmt::internal::utf8_to_utf16 u(s.c_str());
EXPECT_EQ(L"", u.str());
EXPECT_EQ(s.size(), u.size());
}
#ifdef _WIN32
TEST(UtilTest, UTF16ToUTF8) {
std::string s = "ёжик";
@ -415,20 +432,6 @@ TEST(UtilTest, UTF16ToUTF8EmptyString) {
EXPECT_EQ(s.size(), u.size());
}
TEST(UtilTest, UTF8ToUTF16) {
std::string s = "лошадка";
fmt::internal::utf8_to_utf16 u(s.c_str());
EXPECT_EQ(L"\x043B\x043E\x0448\x0430\x0434\x043A\x0430", u.str());
EXPECT_EQ(7, u.size());
}
TEST(UtilTest, UTF8ToUTF16EmptyString) {
std::string s = "";
fmt::internal::utf8_to_utf16 u(s.c_str());
EXPECT_EQ(L"", u.str());
EXPECT_EQ(s.size(), u.size());
}
template <typename Converter, typename Char>
void check_utf_conversion_error(
const char* message,
@ -450,13 +453,6 @@ TEST(UtilTest, UTF16ToUTF8Error) {
"cannot convert string from UTF-16 to UTF-8");
}
TEST(UtilTest, UTF8ToUTF16Error) {
const char* message = "cannot convert string from UTF-8 to UTF-16";
check_utf_conversion_error<fmt::internal::utf8_to_utf16, char>(message);
check_utf_conversion_error<fmt::internal::utf8_to_utf16, char>(
message, fmt::string_view("foo", INT_MAX + 1u));
}
TEST(UtilTest, UTF16ToUTF8Convert) {
fmt::internal::utf16_to_utf8 u;
EXPECT_EQ(ERROR_INVALID_PARAMETER, u.convert(fmt::wstring_view(0, 1)));
@ -1237,8 +1233,7 @@ TEST(FormatterTest, Precision) {
format_error,
"precision not allowed for this argument type");
EXPECT_THROW_MSG(format("{:.{}e}", 42.0, fmt::internal::max_value<int>()),
format_error,
"number is too big");
format_error, "number is too big");
EXPECT_EQ("st", format("{0:.2}", "str"));
}
@ -1875,8 +1870,8 @@ TEST(FormatTest, Dynamic) {
args.emplace_back(fmt::internal::make_arg<ctx>(1.5f));
std::string result = fmt::vformat(
"{} and {} and {}", fmt::basic_format_args<ctx>(
args.data(), static_cast<int>(args.size())));
"{} and {} and {}",
fmt::basic_format_args<ctx>(args.data(), static_cast<int>(args.size())));
EXPECT_EQ("42 and abc1 and 1.5", result);
}
@ -2266,9 +2261,7 @@ struct test_format_specs_handler {
FMT_CONSTEXPR void on_precision(int p) { precision = p; }
FMT_CONSTEXPR void on_dynamic_precision(fmt::internal::auto_id) {}
FMT_CONSTEXPR void on_dynamic_precision(int index) {
precision_ref = index;
}
FMT_CONSTEXPR void on_dynamic_precision(int index) { precision_ref = index; }
FMT_CONSTEXPR void on_dynamic_precision(string_view) {}
FMT_CONSTEXPR void end_precision() {}