mirror of
https://github.com/boostorg/beast.git
synced 2025-07-30 21:07:26 +02:00
Tidy up utf8_checker and tests
This commit is contained in:
@ -15,6 +15,7 @@ WebSocket:
|
|||||||
* Refactor read_op + fail_op
|
* Refactor read_op + fail_op
|
||||||
* Websocket close will automatically drain
|
* Websocket close will automatically drain
|
||||||
* Autobahn|Testsuite fixes
|
* Autobahn|Testsuite fixes
|
||||||
|
* Tidy up utf8_checker and tests
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
@ -30,9 +30,9 @@ namespace detail {
|
|||||||
template<class = void>
|
template<class = void>
|
||||||
class utf8_checker_t
|
class utf8_checker_t
|
||||||
{
|
{
|
||||||
std::size_t need_ = 0;
|
std::size_t need_ = 0; // chars we need to finish the code point
|
||||||
std::uint8_t* p_ = have_;
|
std::uint8_t* p_ = cp_; // current position in temp buffer
|
||||||
std::uint8_t have_[4];
|
std::uint8_t cp_[4]; // a temp buffer for the code point
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/** Prepare to process text as valid utf8
|
/** Prepare to process text as valid utf8
|
||||||
@ -67,7 +67,7 @@ utf8_checker_t<_>::
|
|||||||
reset()
|
reset()
|
||||||
{
|
{
|
||||||
need_ = 0;
|
need_ = 0;
|
||||||
p_ = have_;
|
p_ = cp_;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class _>
|
template<class _>
|
||||||
@ -105,21 +105,21 @@ write(std::uint8_t const* in, std::size_t size)
|
|||||||
auto const valid =
|
auto const valid =
|
||||||
[](std::uint8_t const*& p)
|
[](std::uint8_t const*& p)
|
||||||
{
|
{
|
||||||
if (p[0] < 128)
|
if(p[0] < 128)
|
||||||
{
|
{
|
||||||
++p;
|
++p;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((p[0] & 0x60) == 0x40)
|
if((p[0] & 0x60) == 0x40)
|
||||||
{
|
{
|
||||||
if ((p[1] & 0xc0) != 0x80)
|
if((p[1] & 0xc0) != 0x80)
|
||||||
return false;
|
return false;
|
||||||
p += 2;
|
p += 2;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((p[0] & 0xf0) == 0xe0)
|
if((p[0] & 0xf0) == 0xe0)
|
||||||
{
|
{
|
||||||
if ((p[1] & 0xc0) != 0x80 ||
|
if((p[1] & 0xc0) != 0x80 ||
|
||||||
(p[2] & 0xc0) != 0x80 ||
|
(p[2] & 0xc0) != 0x80 ||
|
||||||
(p[0] == 224 && p[1] < 160) ||
|
(p[0] == 224 && p[1] < 160) ||
|
||||||
(p[0] == 237 && p[1] > 159))
|
(p[0] == 237 && p[1] > 159))
|
||||||
@ -127,9 +127,9 @@ write(std::uint8_t const* in, std::size_t size)
|
|||||||
p += 3;
|
p += 3;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((p[0] & 0xf8) == 0xf0)
|
if((p[0] & 0xf8) == 0xf0)
|
||||||
{
|
{
|
||||||
if (p[0] > 244 ||
|
if(p[0] > 244 ||
|
||||||
(p[1] & 0xc0) != 0x80 ||
|
(p[1] & 0xc0) != 0x80 ||
|
||||||
(p[2] & 0xc0) != 0x80 ||
|
(p[2] & 0xc0) != 0x80 ||
|
||||||
(p[3] & 0xc0) != 0x80 ||
|
(p[3] & 0xc0) != 0x80 ||
|
||||||
@ -144,26 +144,26 @@ write(std::uint8_t const* in, std::size_t size)
|
|||||||
auto const valid_have =
|
auto const valid_have =
|
||||||
[&]()
|
[&]()
|
||||||
{
|
{
|
||||||
if ((have_[0] & 0x60) == 0x40)
|
if((cp_[0] & 0x60) == 0x40)
|
||||||
return have_[0] <= 223;
|
return cp_[0] <= 223;
|
||||||
if ((have_[0] & 0xf0) == 0xe0)
|
if((cp_[0] & 0xf0) == 0xe0)
|
||||||
{
|
{
|
||||||
if (p_ - have_ > 1 &&
|
if(p_ - cp_ > 1 &&
|
||||||
((have_[1] & 0xc0) != 0x80 ||
|
((cp_[1] & 0xc0) != 0x80 ||
|
||||||
(have_[0] == 224 && have_[1] < 160) ||
|
(cp_[0] == 224 && cp_[1] < 160) ||
|
||||||
(have_[0] == 237 && have_[1] > 159)))
|
(cp_[0] == 237 && cp_[1] > 159)))
|
||||||
return false;
|
return false;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ((have_[0] & 0xf8) == 0xf0)
|
if((cp_[0] & 0xf8) == 0xf0)
|
||||||
{
|
{
|
||||||
auto const n = p_ - have_;
|
auto const n = p_ - cp_;
|
||||||
if (n > 2 && (have_[2] & 0xc0) != 0x80)
|
if(n > 2 && (cp_[2] & 0xc0) != 0x80)
|
||||||
return false;
|
return false;
|
||||||
if (n > 1 &&
|
if(n > 1 &&
|
||||||
((have_[1] & 0xc0) != 0x80 ||
|
((cp_[1] & 0xc0) != 0x80 ||
|
||||||
(have_[0] == 240 && have_[1] < 144) ||
|
(cp_[0] == 240 && cp_[1] < 144) ||
|
||||||
(have_[0] == 244 && have_[1] > 143)))
|
(cp_[0] == 244 && cp_[1] > 143)))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -171,51 +171,69 @@ write(std::uint8_t const* in, std::size_t size)
|
|||||||
auto const needed =
|
auto const needed =
|
||||||
[](std::uint8_t const v)
|
[](std::uint8_t const v)
|
||||||
{
|
{
|
||||||
if (v < 128)
|
if(v < 128)
|
||||||
return 1;
|
return 1;
|
||||||
if (v < 194)
|
if(v < 194)
|
||||||
return 0;
|
return 0;
|
||||||
if (v < 224)
|
if(v < 224)
|
||||||
return 2;
|
return 2;
|
||||||
if (v < 240)
|
if(v < 240)
|
||||||
return 3;
|
return 3;
|
||||||
if (v < 245)
|
if(v < 245)
|
||||||
return 4;
|
return 4;
|
||||||
return 0;
|
return 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
auto const end = in + size;
|
auto const end = in + size;
|
||||||
if (need_ > 0)
|
|
||||||
|
// Finish up any incomplete code point
|
||||||
|
if(need_ > 0)
|
||||||
{
|
{
|
||||||
|
// Number of input bytes we can apply to the pending code point
|
||||||
auto n = (std::min)(size, need_);
|
auto n = (std::min)(size, need_);
|
||||||
size -= n;
|
size -= n;
|
||||||
need_ -= n;
|
need_ -= n;
|
||||||
|
|
||||||
|
// Add characters to the code point
|
||||||
while(n--)
|
while(n--)
|
||||||
*p_++ = *in++;
|
*p_++ = *in++;
|
||||||
|
BOOST_ASSERT(p_ <= cp_ + 5);
|
||||||
|
|
||||||
|
// Still incomplete?
|
||||||
if(need_ > 0)
|
if(need_ > 0)
|
||||||
{
|
{
|
||||||
|
// Incomplete code point
|
||||||
BOOST_ASSERT(in == end);
|
BOOST_ASSERT(in == end);
|
||||||
|
|
||||||
|
// Do partial validation on the incomplete
|
||||||
|
// code point; this is called "Fail fast"
|
||||||
|
// in Autobahn|Testsuite parlance.
|
||||||
return valid_have();
|
return valid_have();
|
||||||
}
|
}
|
||||||
std::uint8_t const* p = &have_[0];
|
|
||||||
if (! valid(p))
|
// Complete code point, validate it
|
||||||
|
std::uint8_t const* p = &cp_[0];
|
||||||
|
if(! valid(p))
|
||||||
return false;
|
return false;
|
||||||
p_ = have_;
|
p_ = cp_;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(size <= sizeof(std::size_t))
|
if(size <= sizeof(std::size_t))
|
||||||
goto slow;
|
goto slow;
|
||||||
|
|
||||||
// align in to sizeof(std::size_t) boundary
|
// Align `in` to sizeof(std::size_t) boundary
|
||||||
{
|
{
|
||||||
auto const in0 = in;
|
auto const in0 = in;
|
||||||
auto last = reinterpret_cast<std::uint8_t const*>(
|
auto last = reinterpret_cast<std::uint8_t const*>(
|
||||||
((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
|
((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
|
||||||
sizeof(std::size_t)) * sizeof(std::size_t));
|
sizeof(std::size_t)) * sizeof(std::size_t));
|
||||||
|
|
||||||
|
// Check one character at a time for low-ASCII
|
||||||
while(in < last)
|
while(in < last)
|
||||||
{
|
{
|
||||||
if(*in & 0x80)
|
if(*in & 0x80)
|
||||||
{
|
{
|
||||||
|
// Not low-ASCII so switch to slow loop
|
||||||
size = size - (in - in0);
|
size = size - (in - in0);
|
||||||
goto slow;
|
goto slow;
|
||||||
}
|
}
|
||||||
@ -224,7 +242,7 @@ write(std::uint8_t const* in, std::size_t size)
|
|||||||
size = size - (in - in0);
|
size = size - (in - in0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// fast loop
|
// Fast loop: Process 4 or 8 low-ASCII characters at a time
|
||||||
{
|
{
|
||||||
auto const in0 = in;
|
auto const in0 = in;
|
||||||
auto last = in + size - 7;
|
auto last = in + size - 7;
|
||||||
@ -246,6 +264,7 @@ write(std::uint8_t const* in, std::size_t size)
|
|||||||
}
|
}
|
||||||
in += sizeof(std::size_t);
|
in += sizeof(std::size_t);
|
||||||
}
|
}
|
||||||
|
// There's at least one more full code point left
|
||||||
last += 4;
|
last += 4;
|
||||||
while(in < last)
|
while(in < last)
|
||||||
if(! valid(in))
|
if(! valid(in))
|
||||||
@ -253,8 +272,8 @@ write(std::uint8_t const* in, std::size_t size)
|
|||||||
goto tail;
|
goto tail;
|
||||||
}
|
}
|
||||||
|
|
||||||
// slow loop: one code point at a time
|
|
||||||
slow:
|
slow:
|
||||||
|
// Slow loop: Full validation on one code point at a time
|
||||||
{
|
{
|
||||||
auto last = in + size - 3;
|
auto last = in + size - 3;
|
||||||
while(in < last)
|
while(in < last)
|
||||||
@ -263,24 +282,45 @@ slow:
|
|||||||
}
|
}
|
||||||
|
|
||||||
tail:
|
tail:
|
||||||
|
// Handle the remaining bytes. The last
|
||||||
|
// characters could split a code point so
|
||||||
|
// we save the partial code point for later.
|
||||||
|
//
|
||||||
|
// On entry to the loop, `in` points to the
|
||||||
|
// beginning of a code point.
|
||||||
|
//
|
||||||
for(;;)
|
for(;;)
|
||||||
{
|
{
|
||||||
|
// Number of chars left
|
||||||
auto n = end - in;
|
auto n = end - in;
|
||||||
if(! n)
|
if(! n)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
// Chars we need to finish this code point
|
||||||
auto const need = needed(*in);
|
auto const need = needed(*in);
|
||||||
if (need == 0)
|
if(need == 0)
|
||||||
return false;
|
return false;
|
||||||
if(need <= n)
|
if(need <= n)
|
||||||
{
|
{
|
||||||
|
// Check a whole code point
|
||||||
if(! valid(in))
|
if(! valid(in))
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// Calculate how many chars we need
|
||||||
|
// to finish this partial code point
|
||||||
need_ = need - n;
|
need_ = need - n;
|
||||||
|
|
||||||
|
// Save the partial code point
|
||||||
while(n--)
|
while(n--)
|
||||||
*p_++ = *in++;
|
*p_++ = *in++;
|
||||||
|
BOOST_ASSERT(in == end);
|
||||||
|
BOOST_ASSERT(p_ <= cp_ + 5);
|
||||||
|
|
||||||
|
// Do partial validation on the incomplete
|
||||||
|
// code point; this is called "Fail fast"
|
||||||
|
// in Autobahn|Testsuite parlance.
|
||||||
return valid_have();
|
return valid_have();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -42,14 +42,12 @@ public:
|
|||||||
BEAST_EXPECT(utf8.finish());
|
BEAST_EXPECT(utf8.finish());
|
||||||
|
|
||||||
// Invalid range 128-193
|
// Invalid range 128-193
|
||||||
for(auto it = std::next(buf.begin(), 128);
|
for(unsigned char c = 128; c < 194; ++c)
|
||||||
it != std::next(buf.begin(), 194); ++it)
|
BEAST_EXPECT(! utf8.write(&c, 1));
|
||||||
BEAST_EXPECT(! utf8.write(&(*it), 1));
|
|
||||||
|
|
||||||
// Invalid range 245-255
|
// Invalid range 245-255
|
||||||
for(auto it = std::next(buf.begin(), 245);
|
for(unsigned char c = 245; c; ++c)
|
||||||
it != buf.end(); ++it)
|
BEAST_EXPECT(! utf8.write(&c, 1));
|
||||||
BEAST_EXPECT(! utf8.write(&(*it), 1));
|
|
||||||
|
|
||||||
// Invalid sequence
|
// Invalid sequence
|
||||||
std::fill(buf.begin(), buf.end(), '\xff');
|
std::fill(buf.begin(), buf.end(), '\xff');
|
||||||
@ -79,6 +77,7 @@ public:
|
|||||||
// Second byte invalid range 0-127
|
// Second byte invalid range 0-127
|
||||||
buf[1] = static_cast<std::uint8_t>(j);
|
buf[1] = static_cast<std::uint8_t>(j);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 2));
|
BEAST_EXPECT(! utf8.write(buf, 2));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
for(auto j = 192; j <= 255; ++j)
|
for(auto j = 192; j <= 255; ++j)
|
||||||
@ -86,6 +85,7 @@ public:
|
|||||||
// Second byte invalid range 192-255
|
// Second byte invalid range 192-255
|
||||||
buf[1] = static_cast<std::uint8_t>(j);
|
buf[1] = static_cast<std::uint8_t>(j);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 2));
|
BEAST_EXPECT(! utf8.write(buf, 2));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Segmented sequence second byte invalid
|
// Segmented sequence second byte invalid
|
||||||
@ -134,6 +134,7 @@ public:
|
|||||||
// Second byte invalid range 0-127 or 0-159
|
// Second byte invalid range 0-127 or 0-159
|
||||||
buf[1] = static_cast<std::uint8_t>(l);
|
buf[1] = static_cast<std::uint8_t>(l);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||||
|
utf8.reset();
|
||||||
if (l > 127)
|
if (l > 127)
|
||||||
{
|
{
|
||||||
// Segmented sequence second byte invalid
|
// Segmented sequence second byte invalid
|
||||||
@ -149,7 +150,8 @@ public:
|
|||||||
{
|
{
|
||||||
// Second byte invalid range 160-255 or 192-255
|
// Second byte invalid range 160-255 or 192-255
|
||||||
buf[1] = static_cast<std::uint8_t>(l);
|
buf[1] = static_cast<std::uint8_t>(l);
|
||||||
BEAST_EXPECT(!utf8.write(buf, 3));
|
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||||
|
utf8.reset();
|
||||||
if (l > 159)
|
if (l > 159)
|
||||||
{
|
{
|
||||||
// Segmented sequence second byte invalid
|
// Segmented sequence second byte invalid
|
||||||
@ -166,6 +168,7 @@ public:
|
|||||||
// Third byte invalid range 0-127
|
// Third byte invalid range 0-127
|
||||||
buf[2] = static_cast<std::uint8_t>(k);
|
buf[2] = static_cast<std::uint8_t>(k);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
for(auto k = 192; k <= 255; ++k)
|
for(auto k = 192; k <= 255; ++k)
|
||||||
@ -173,6 +176,7 @@ public:
|
|||||||
// Third byte invalid range 192-255
|
// Third byte invalid range 192-255
|
||||||
buf[2] = static_cast<std::uint8_t>(k);
|
buf[2] = static_cast<std::uint8_t>(k);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Segmented sequence third byte invalid
|
// Segmented sequence third byte invalid
|
||||||
@ -186,6 +190,7 @@ public:
|
|||||||
// Second byte invalid range 0-127 or 0-159
|
// Second byte invalid range 0-127 or 0-159
|
||||||
buf[1] = static_cast<std::uint8_t>(j);
|
buf[1] = static_cast<std::uint8_t>(j);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
for(auto j = e + 1; j <= 255; ++j)
|
for(auto j = e + 1; j <= 255; ++j)
|
||||||
@ -193,6 +198,7 @@ public:
|
|||||||
// Second byte invalid range 160-255 or 192-255
|
// Second byte invalid range 160-255 or 192-255
|
||||||
buf[1] = static_cast<std::uint8_t>(j);
|
buf[1] = static_cast<std::uint8_t>(j);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Segmented sequence second byte invalid
|
// Segmented sequence second byte invalid
|
||||||
@ -251,6 +257,7 @@ public:
|
|||||||
{
|
{
|
||||||
buf[1] = static_cast<std::uint8_t>(r);
|
buf[1] = static_cast<std::uint8_t>(r);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||||
|
utf8.reset();
|
||||||
if (r > 127)
|
if (r > 127)
|
||||||
{
|
{
|
||||||
// Segmented sequence second byte invalid
|
// Segmented sequence second byte invalid
|
||||||
@ -267,6 +274,7 @@ public:
|
|||||||
{
|
{
|
||||||
buf[1] = static_cast<std::uint8_t>(r);
|
buf[1] = static_cast<std::uint8_t>(r);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||||
|
utf8.reset();
|
||||||
// Segmented sequence second byte invalid
|
// Segmented sequence second byte invalid
|
||||||
BEAST_EXPECT(! utf8.write(buf, 2));
|
BEAST_EXPECT(! utf8.write(buf, 2));
|
||||||
utf8.reset();
|
utf8.reset();
|
||||||
@ -280,6 +288,7 @@ public:
|
|||||||
{
|
{
|
||||||
buf[3] = static_cast<std::uint8_t>(r);
|
buf[3] = static_cast<std::uint8_t>(r);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Segmented sequence fourth byte invalid
|
// Segmented sequence fourth byte invalid
|
||||||
@ -293,6 +302,7 @@ public:
|
|||||||
{
|
{
|
||||||
buf[2] = static_cast<std::uint8_t>(r);
|
buf[2] = static_cast<std::uint8_t>(r);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Segmented sequence third byte invalid
|
// Segmented sequence third byte invalid
|
||||||
@ -306,6 +316,7 @@ public:
|
|||||||
{
|
{
|
||||||
buf[1] = static_cast<std::uint8_t>(r);
|
buf[1] = static_cast<std::uint8_t>(r);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Second byte invalid range 144-255 or 192-255
|
// Second byte invalid range 144-255 or 192-255
|
||||||
@ -313,6 +324,7 @@ public:
|
|||||||
{
|
{
|
||||||
buf[1] = static_cast<std::uint8_t>(r);
|
buf[1] = static_cast<std::uint8_t>(r);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Segmented sequence second byte invalid
|
// Segmented sequence second byte invalid
|
||||||
@ -326,6 +338,7 @@ public:
|
|||||||
{
|
{
|
||||||
buf[0] = static_cast<std::uint8_t>(r);
|
buf[0] = static_cast<std::uint8_t>(r);
|
||||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||||
|
utf8.reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user