Tidy up utf8_checker and tests

This commit is contained in:
Vinnie Falco
2017-08-13 20:46:01 -07:00
parent 66b657a85f
commit f570593a01
3 changed files with 99 additions and 45 deletions

View File

@ -15,6 +15,7 @@ WebSocket:
* Refactor read_op + fail_op * Refactor read_op + fail_op
* Websocket close will automatically drain * Websocket close will automatically drain
* Autobahn|Testsuite fixes * Autobahn|Testsuite fixes
* Tidy up utf8_checker and tests
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------

View File

@ -30,9 +30,9 @@ namespace detail {
template<class = void> template<class = void>
class utf8_checker_t class utf8_checker_t
{ {
std::size_t need_ = 0; std::size_t need_ = 0; // chars we need to finish the code point
std::uint8_t* p_ = have_; std::uint8_t* p_ = cp_; // current position in temp buffer
std::uint8_t have_[4]; std::uint8_t cp_[4]; // a temp buffer for the code point
public: public:
/** Prepare to process text as valid utf8 /** Prepare to process text as valid utf8
@ -67,7 +67,7 @@ utf8_checker_t<_>::
reset() reset()
{ {
need_ = 0; need_ = 0;
p_ = have_; p_ = cp_;
} }
template<class _> template<class _>
@ -105,21 +105,21 @@ write(std::uint8_t const* in, std::size_t size)
auto const valid = auto const valid =
[](std::uint8_t const*& p) [](std::uint8_t const*& p)
{ {
if (p[0] < 128) if(p[0] < 128)
{ {
++p; ++p;
return true; return true;
} }
if ((p[0] & 0x60) == 0x40) if((p[0] & 0x60) == 0x40)
{ {
if ((p[1] & 0xc0) != 0x80) if((p[1] & 0xc0) != 0x80)
return false; return false;
p += 2; p += 2;
return true; return true;
} }
if ((p[0] & 0xf0) == 0xe0) if((p[0] & 0xf0) == 0xe0)
{ {
if ((p[1] & 0xc0) != 0x80 || if((p[1] & 0xc0) != 0x80 ||
(p[2] & 0xc0) != 0x80 || (p[2] & 0xc0) != 0x80 ||
(p[0] == 224 && p[1] < 160) || (p[0] == 224 && p[1] < 160) ||
(p[0] == 237 && p[1] > 159)) (p[0] == 237 && p[1] > 159))
@ -127,9 +127,9 @@ write(std::uint8_t const* in, std::size_t size)
p += 3; p += 3;
return true; return true;
} }
if ((p[0] & 0xf8) == 0xf0) if((p[0] & 0xf8) == 0xf0)
{ {
if (p[0] > 244 || if(p[0] > 244 ||
(p[1] & 0xc0) != 0x80 || (p[1] & 0xc0) != 0x80 ||
(p[2] & 0xc0) != 0x80 || (p[2] & 0xc0) != 0x80 ||
(p[3] & 0xc0) != 0x80 || (p[3] & 0xc0) != 0x80 ||
@ -144,26 +144,26 @@ write(std::uint8_t const* in, std::size_t size)
auto const valid_have = auto const valid_have =
[&]() [&]()
{ {
if ((have_[0] & 0x60) == 0x40) if((cp_[0] & 0x60) == 0x40)
return have_[0] <= 223; return cp_[0] <= 223;
if ((have_[0] & 0xf0) == 0xe0) if((cp_[0] & 0xf0) == 0xe0)
{ {
if (p_ - have_ > 1 && if(p_ - cp_ > 1 &&
((have_[1] & 0xc0) != 0x80 || ((cp_[1] & 0xc0) != 0x80 ||
(have_[0] == 224 && have_[1] < 160) || (cp_[0] == 224 && cp_[1] < 160) ||
(have_[0] == 237 && have_[1] > 159))) (cp_[0] == 237 && cp_[1] > 159)))
return false; return false;
return true; return true;
} }
if ((have_[0] & 0xf8) == 0xf0) if((cp_[0] & 0xf8) == 0xf0)
{ {
auto const n = p_ - have_; auto const n = p_ - cp_;
if (n > 2 && (have_[2] & 0xc0) != 0x80) if(n > 2 && (cp_[2] & 0xc0) != 0x80)
return false; return false;
if (n > 1 && if(n > 1 &&
((have_[1] & 0xc0) != 0x80 || ((cp_[1] & 0xc0) != 0x80 ||
(have_[0] == 240 && have_[1] < 144) || (cp_[0] == 240 && cp_[1] < 144) ||
(have_[0] == 244 && have_[1] > 143))) (cp_[0] == 244 && cp_[1] > 143)))
return false; return false;
} }
return true; return true;
@ -171,51 +171,69 @@ write(std::uint8_t const* in, std::size_t size)
auto const needed = auto const needed =
[](std::uint8_t const v) [](std::uint8_t const v)
{ {
if (v < 128) if(v < 128)
return 1; return 1;
if (v < 194) if(v < 194)
return 0; return 0;
if (v < 224) if(v < 224)
return 2; return 2;
if (v < 240) if(v < 240)
return 3; return 3;
if (v < 245) if(v < 245)
return 4; return 4;
return 0; return 0;
}; };
auto const end = in + size; auto const end = in + size;
if (need_ > 0)
// Finish up any incomplete code point
if(need_ > 0)
{ {
// Calculate what we have
auto n = (std::min)(size, need_); auto n = (std::min)(size, need_);
size -= n; size -= n;
need_ -= n; need_ -= n;
// Add characters to the code point
while(n--) while(n--)
*p_++ = *in++; *p_++ = *in++;
BOOST_ASSERT(p_ <= cp_ + 5);
// Still incomplete?
if(need_ > 0) if(need_ > 0)
{ {
// Incomplete code point
BOOST_ASSERT(in == end); BOOST_ASSERT(in == end);
// Do partial validation on the incomplete
// code point, this is called "Fail fast"
// in Autobahn|Testsuite parlance.
return valid_have(); return valid_have();
} }
std::uint8_t const* p = &have_[0];
if (! valid(p)) // Complete code point, validate it
std::uint8_t const* p = &cp_[0];
if(! valid(p))
return false; return false;
p_ = have_; p_ = cp_;
} }
if(size <= sizeof(std::size_t)) if(size <= sizeof(std::size_t))
goto slow; goto slow;
// align in to sizeof(std::size_t) boundary // Align `in` to sizeof(std::size_t) boundary
{ {
auto const in0 = in; auto const in0 = in;
auto last = reinterpret_cast<std::uint8_t const*>( auto last = reinterpret_cast<std::uint8_t const*>(
((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) / ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
sizeof(std::size_t)) * sizeof(std::size_t)); sizeof(std::size_t)) * sizeof(std::size_t));
// Check one character at a time for low-ASCII
while(in < last) while(in < last)
{ {
if(*in & 0x80) if(*in & 0x80)
{ {
// Not low-ASCII so switch to slow loop
size = size - (in - in0); size = size - (in - in0);
goto slow; goto slow;
} }
@ -224,7 +242,7 @@ write(std::uint8_t const* in, std::size_t size)
size = size - (in - in0); size = size - (in - in0);
} }
// fast loop // Fast loop: Process 4 or 8 low-ASCII characters at a time
{ {
auto const in0 = in; auto const in0 = in;
auto last = in + size - 7; auto last = in + size - 7;
@ -246,6 +264,7 @@ write(std::uint8_t const* in, std::size_t size)
} }
in += sizeof(std::size_t); in += sizeof(std::size_t);
} }
// There's at least one more full code point left
last += 4; last += 4;
while(in < last) while(in < last)
if(! valid(in)) if(! valid(in))
@ -253,8 +272,8 @@ write(std::uint8_t const* in, std::size_t size)
goto tail; goto tail;
} }
// slow loop: one code point at a time
slow: slow:
// Slow loop: Full validation on one code point at a time
{ {
auto last = in + size - 3; auto last = in + size - 3;
while(in < last) while(in < last)
@ -263,24 +282,45 @@ slow:
} }
tail: tail:
// Handle the remaining bytes. The last
// characters could split a code point so
// we save the partial code point for later.
//
// On entry to the loop, `in` points to the
// beginning of a code point.
//
for(;;) for(;;)
{ {
// Number of chars left
auto n = end - in; auto n = end - in;
if(! n) if(! n)
break; break;
// Chars we need to finish this code point
auto const need = needed(*in); auto const need = needed(*in);
if (need == 0) if(need == 0)
return false; return false;
if(need <= n) if(need <= n)
{ {
// Check a whole code point
if(! valid(in)) if(! valid(in))
return false; return false;
} }
else else
{ {
// Calculate how many chars we need
// to finish this partial code point
need_ = need - n; need_ = need - n;
// Save the partial code point
while(n--) while(n--)
*p_++ = *in++; *p_++ = *in++;
BOOST_ASSERT(in == end);
BOOST_ASSERT(p_ <= cp_ + 5);
// Do partial validation on the incomplete
// code point, this is called "Fail fast"
// in Autobahn|Testsuite parlance.
return valid_have(); return valid_have();
} }
} }

View File

@ -42,14 +42,12 @@ public:
BEAST_EXPECT(utf8.finish()); BEAST_EXPECT(utf8.finish());
// Invalid range 128-193 // Invalid range 128-193
for(auto it = std::next(buf.begin(), 128); for(unsigned char c = 128; c < 194; ++c)
it != std::next(buf.begin(), 194); ++it) BEAST_EXPECT(! utf8.write(&c, 1));
BEAST_EXPECT(! utf8.write(&(*it), 1));
// Invalid range 245-255 // Invalid range 245-255
for(auto it = std::next(buf.begin(), 245); for(unsigned char c = 245; c; ++c)
it != buf.end(); ++it) BEAST_EXPECT(! utf8.write(&c, 1));
BEAST_EXPECT(! utf8.write(&(*it), 1));
// Invalid sequence // Invalid sequence
std::fill(buf.begin(), buf.end(), '\xff'); std::fill(buf.begin(), buf.end(), '\xff');
@ -79,6 +77,7 @@ public:
// Second byte invalid range 0-127 // Second byte invalid range 0-127
buf[1] = static_cast<std::uint8_t>(j); buf[1] = static_cast<std::uint8_t>(j);
BEAST_EXPECT(! utf8.write(buf, 2)); BEAST_EXPECT(! utf8.write(buf, 2));
utf8.reset();
} }
for(auto j = 192; j <= 255; ++j) for(auto j = 192; j <= 255; ++j)
@ -86,6 +85,7 @@ public:
// Second byte invalid range 192-255 // Second byte invalid range 192-255
buf[1] = static_cast<std::uint8_t>(j); buf[1] = static_cast<std::uint8_t>(j);
BEAST_EXPECT(! utf8.write(buf, 2)); BEAST_EXPECT(! utf8.write(buf, 2));
utf8.reset();
} }
// Segmented sequence second byte invalid // Segmented sequence second byte invalid
@ -134,6 +134,7 @@ public:
// Second byte invalid range 0-127 or 0-159 // Second byte invalid range 0-127 or 0-159
buf[1] = static_cast<std::uint8_t>(l); buf[1] = static_cast<std::uint8_t>(l);
BEAST_EXPECT(! utf8.write(buf, 3)); BEAST_EXPECT(! utf8.write(buf, 3));
utf8.reset();
if (l > 127) if (l > 127)
{ {
// Segmented sequence second byte invalid // Segmented sequence second byte invalid
@ -149,7 +150,8 @@ public:
{ {
// Second byte invalid range 160-255 or 192-255 // Second byte invalid range 160-255 or 192-255
buf[1] = static_cast<std::uint8_t>(l); buf[1] = static_cast<std::uint8_t>(l);
BEAST_EXPECT(!utf8.write(buf, 3)); BEAST_EXPECT(! utf8.write(buf, 3));
utf8.reset();
if (l > 159) if (l > 159)
{ {
// Segmented sequence second byte invalid // Segmented sequence second byte invalid
@ -166,6 +168,7 @@ public:
// Third byte invalid range 0-127 // Third byte invalid range 0-127
buf[2] = static_cast<std::uint8_t>(k); buf[2] = static_cast<std::uint8_t>(k);
BEAST_EXPECT(! utf8.write(buf, 3)); BEAST_EXPECT(! utf8.write(buf, 3));
utf8.reset();
} }
for(auto k = 192; k <= 255; ++k) for(auto k = 192; k <= 255; ++k)
@ -173,6 +176,7 @@ public:
// Third byte invalid range 192-255 // Third byte invalid range 192-255
buf[2] = static_cast<std::uint8_t>(k); buf[2] = static_cast<std::uint8_t>(k);
BEAST_EXPECT(! utf8.write(buf, 3)); BEAST_EXPECT(! utf8.write(buf, 3));
utf8.reset();
} }
// Segmented sequence third byte invalid // Segmented sequence third byte invalid
@ -186,6 +190,7 @@ public:
// Second byte invalid range 0-127 or 0-159 // Second byte invalid range 0-127 or 0-159
buf[1] = static_cast<std::uint8_t>(j); buf[1] = static_cast<std::uint8_t>(j);
BEAST_EXPECT(! utf8.write(buf, 3)); BEAST_EXPECT(! utf8.write(buf, 3));
utf8.reset();
} }
for(auto j = e + 1; j <= 255; ++j) for(auto j = e + 1; j <= 255; ++j)
@ -193,6 +198,7 @@ public:
// Second byte invalid range 160-255 or 192-255 // Second byte invalid range 160-255 or 192-255
buf[1] = static_cast<std::uint8_t>(j); buf[1] = static_cast<std::uint8_t>(j);
BEAST_EXPECT(! utf8.write(buf, 3)); BEAST_EXPECT(! utf8.write(buf, 3));
utf8.reset();
} }
// Segmented sequence second byte invalid // Segmented sequence second byte invalid
@ -251,6 +257,7 @@ public:
{ {
buf[1] = static_cast<std::uint8_t>(r); buf[1] = static_cast<std::uint8_t>(r);
BEAST_EXPECT(! utf8.write(buf, 4)); BEAST_EXPECT(! utf8.write(buf, 4));
utf8.reset();
if (r > 127) if (r > 127)
{ {
// Segmented sequence second byte invalid // Segmented sequence second byte invalid
@ -267,6 +274,7 @@ public:
{ {
buf[1] = static_cast<std::uint8_t>(r); buf[1] = static_cast<std::uint8_t>(r);
BEAST_EXPECT(! utf8.write(buf, 4)); BEAST_EXPECT(! utf8.write(buf, 4));
utf8.reset();
// Segmented sequence second byte invalid // Segmented sequence second byte invalid
BEAST_EXPECT(! utf8.write(buf, 2)); BEAST_EXPECT(! utf8.write(buf, 2));
utf8.reset(); utf8.reset();
@ -280,6 +288,7 @@ public:
{ {
buf[3] = static_cast<std::uint8_t>(r); buf[3] = static_cast<std::uint8_t>(r);
BEAST_EXPECT(! utf8.write(buf, 4)); BEAST_EXPECT(! utf8.write(buf, 4));
utf8.reset();
} }
// Segmented sequence fourth byte invalid // Segmented sequence fourth byte invalid
@ -293,6 +302,7 @@ public:
{ {
buf[2] = static_cast<std::uint8_t>(r); buf[2] = static_cast<std::uint8_t>(r);
BEAST_EXPECT(! utf8.write(buf, 4)); BEAST_EXPECT(! utf8.write(buf, 4));
utf8.reset();
} }
// Segmented sequence third byte invalid // Segmented sequence third byte invalid
@ -306,6 +316,7 @@ public:
{ {
buf[1] = static_cast<std::uint8_t>(r); buf[1] = static_cast<std::uint8_t>(r);
BEAST_EXPECT(! utf8.write(buf, 4)); BEAST_EXPECT(! utf8.write(buf, 4));
utf8.reset();
} }
// Second byte invalid range 144-255 or 192-255 // Second byte invalid range 144-255 or 192-255
@ -313,6 +324,7 @@ public:
{ {
buf[1] = static_cast<std::uint8_t>(r); buf[1] = static_cast<std::uint8_t>(r);
BEAST_EXPECT(! utf8.write(buf, 4)); BEAST_EXPECT(! utf8.write(buf, 4));
utf8.reset();
} }
// Segmented sequence second byte invalid // Segmented sequence second byte invalid
@ -326,6 +338,7 @@ public:
{ {
buf[0] = static_cast<std::uint8_t>(r); buf[0] = static_cast<std::uint8_t>(r);
BEAST_EXPECT(! utf8.write(buf, 4)); BEAST_EXPECT(! utf8.write(buf, 4));
utf8.reset();
} }
} }