mirror of
https://github.com/boostorg/beast.git
synced 2025-08-03 14:54:32 +02:00
Optimize utf8 validation
This commit is contained in:
committed by
Vinnie Falco
parent
804038095c
commit
d91fbd0611
@@ -1,3 +1,9 @@
|
||||
1.0.0-b19
|
||||
|
||||
* Optimize utf8 validation
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
1.0.0-b18
|
||||
|
||||
* Increase optimization settings for MSVC builds
|
||||
|
@@ -9,18 +9,22 @@
|
||||
#define BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
|
||||
|
||||
#include <boost/asio/buffer.hpp>
#include <boost/assert.hpp>
#include <beast/core/buffer_concepts.hpp>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <string> // DEPRECATED

#include <beast/unit_test/dstream.hpp>
#include <iostream>
|
||||
|
||||
namespace beast {
|
||||
namespace websocket {
|
||||
namespace detail {
|
||||
|
||||
// Code adapted from
|
||||
// http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
|
||||
/*
|
||||
Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||
/* This is a modified work.
|
||||
|
||||
Original version and license:
|
||||
https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
@@ -39,100 +43,97 @@ namespace detail {
|
||||
ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *
|
||||
|
||||
Additional changes:
|
||||
Optimized for predominantly 7-bit content, 2016
|
||||
https://github.com/uWebSockets/uWebSockets/blob/755bd362649c06abff102f18e273c5792c51c1a0/src/WebSocketProtocol.h#L198
|
||||
Copyright (c) 2016 Alex Hultman and contributors
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgement in the product documentation would be
|
||||
appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
/** A UTF8 validator.
|
||||
|
||||
This validator can be used to check if a buffer containing UTF8 text is
|
||||
valid. The write function may be called incrementally with segmented UTF8
|
||||
sequences. The finish function determines if all processed text is valid.
|
||||
*/
|
||||
template<class = void>
|
||||
class utf8_checker_t
|
||||
{
|
||||
// Table for the UTF8 decode state machine
|
||||
using lut_type = std::uint8_t[400];
|
||||
static
|
||||
lut_type const&
|
||||
lut()
|
||||
{
|
||||
// 400 elements
|
||||
static std::uint8_t constexpr tab[] = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
|
||||
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
|
||||
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
|
||||
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
|
||||
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
|
||||
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
|
||||
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
|
||||
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
|
||||
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // s7..s8
|
||||
};
|
||||
return tab;
|
||||
}
|
||||
|
||||
std::uint32_t state_ = 0;
|
||||
std::uint32_t codepoint_ = 0;
|
||||
std::size_t need_ = 0;
|
||||
std::uint8_t* p_ = have_;
|
||||
std::uint8_t have_[4];
|
||||
|
||||
public:
|
||||
/** Prepare to process text as valid utf8
|
||||
*/
|
||||
void
|
||||
reset();
|
||||
|
||||
// Returns `true` on success
|
||||
bool
|
||||
write(void const* buffer, std::size_t size);
|
||||
|
||||
// Returns `true` on success
|
||||
template<class BufferSequence>
|
||||
bool
|
||||
write(BufferSequence const& bs);
|
||||
|
||||
// Returns `true` on success
|
||||
/** Check that all processed text is valid utf8
|
||||
*/
|
||||
bool
|
||||
finish();
|
||||
|
||||
/** Check if text is valid UTF8
|
||||
|
||||
@return `true` if the text is valid utf8 or false otherwise.
|
||||
*/
|
||||
bool
|
||||
write(std::uint8_t const* in, std::size_t size);
|
||||
|
||||
/** Check if text is valid UTF8
|
||||
|
||||
@return `true` if the text is valid utf8 or false otherwise.
|
||||
*/
|
||||
template<class ConstBufferSequence>
|
||||
bool
|
||||
write(ConstBufferSequence const& bs);
|
||||
};
|
||||
|
||||
template<class _>
|
||||
void
|
||||
utf8_checker_t<_>::reset()
|
||||
{
|
||||
state_ = 0;
|
||||
codepoint_ = 0;
|
||||
need_ = 0;
|
||||
p_ = have_;
|
||||
}
|
||||
|
||||
template<class _>
|
||||
bool
|
||||
utf8_checker_t<_>::write(void const* buffer, std::size_t size)
|
||||
utf8_checker_t<_>::finish()
|
||||
{
|
||||
auto p = static_cast<std::uint8_t const*>(buffer);
|
||||
auto plut = &lut()[0];
|
||||
while(size)
|
||||
{
|
||||
auto const byte = *p;
|
||||
auto const type = plut[byte];
|
||||
if(state_)
|
||||
codepoint_ = (byte & 0x3fu) | (codepoint_ << 6);
|
||||
else
|
||||
codepoint_ = (0xff >> type) & byte;
|
||||
state_ = plut[256 + state_ * 16 + type];
|
||||
if(state_ == 1)
|
||||
{
|
||||
auto const success = need_ == 0;
|
||||
reset();
|
||||
return false;
|
||||
}
|
||||
++p;
|
||||
--size;
|
||||
}
|
||||
return true;
|
||||
return success;
|
||||
}
|
||||
|
||||
template<class _>
|
||||
template<class BufferSequence>
|
||||
template<class ConstBufferSequence>
|
||||
bool
|
||||
utf8_checker_t<_>::write(BufferSequence const& bs)
|
||||
utf8_checker_t<_>::write(ConstBufferSequence const& bs)
|
||||
{
|
||||
static_assert(is_ConstBufferSequence<ConstBufferSequence>::value,
|
||||
"ConstBufferSequence requirements not met");
|
||||
using boost::asio::buffer_cast;
|
||||
using boost::asio::buffer_size;
|
||||
for(auto const& b : bs)
|
||||
if(! write(buffer_cast<void const*>(b),
|
||||
if(! write(buffer_cast<std::uint8_t const*>(b),
|
||||
buffer_size(b)))
|
||||
return false;
|
||||
return true;
|
||||
@@ -140,11 +141,166 @@ utf8_checker_t<_>::write(BufferSequence const& bs)
|
||||
|
||||
template<class _>
|
||||
bool
|
||||
utf8_checker_t<_>::finish()
|
||||
utf8_checker_t<_>::write(std::uint8_t const* in, std::size_t size)
|
||||
{
|
||||
auto const success = state_ == 0;
|
||||
reset();
|
||||
return success;
|
||||
// Validates the single UTF-8 encoded code point that starts at `in`.
//
// On success the pointer is advanced past the code point (1 to 4
// bytes) and `true` is returned. On failure `false` is returned and
// the pointer is left unchanged.
//
// No bounds checking happens here: depending on the lead byte, up to
// three bytes following in[0] are read, so the caller must guarantee
// that the whole sequence announced by the lead byte is readable.
auto const valid =
    [](std::uint8_t const*& in)
    {
        // True for a continuation byte, i.e. one of the form 10xxxxxx
        auto const is_tail =
            [](std::uint8_t const b)
            {
                return (b & 0xc0) == 0x80;
            };

        // One byte: 0xxxxxxx
        if(in[0] < 128)
        {
            ++in;
            return true;
        }

        // Two bytes: 110xxxxx 10xxxxxx
        if((in[0] & 0x60) == 0x40)
        {
            // Lead bytes 0xC0 and 0xC1 are rejected before the
            // second byte is looked at.
            if((in[0] & 0xfe) == 0xc0 ||
               ! is_tail(in[1]))
                return false;
            in += 2;
            return true;
        }

        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        if((in[0] & 0xf0) == 0xe0)
        {
            if(! is_tail(in[1]) ||
               ! is_tail(in[2]) ||
               // After 0xE0 the second byte must be at least 0xA0
               (in[0] == 224 && in[1] < 160) ||
               // After 0xED the second byte must be at most 0x9F
               (in[0] == 237 && in[1] > 159))
                return false;
            in += 3;
            return true;
        }

        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if((in[0] & 0xf8) == 0xf0)
        {
            if(in[0] > 244 ||
               ! is_tail(in[1]) ||
               ! is_tail(in[2]) ||
               ! is_tail(in[3]) ||
               // After 0xF0 the second byte must be at least 0x90
               (in[0] == 240 && in[1] < 144) ||
               // After 0xF4 the second byte must be at most 0x8F
               (in[0] == 244 && in[1] > 143))
                return false;
            in += 4;
            return true;
        }

        // Anything else (a stray continuation byte, or 0xF8 and
        // above) cannot start a code point.
        return false;
    };
|
||||
auto const valid_have =
|
||||
[&]()
|
||||
{
|
||||
if ((have_[0] & 0x60) == 0x40)
|
||||
return have_[0] <= 223;
|
||||
if ((have_[0] & 0xf0) == 0xe0)
|
||||
{
|
||||
if (p_ - have_ > 1 &&
|
||||
((have_[1] & 0xc0) != 0x80 ||
|
||||
(have_[0] == 224 && have_[1] < 160) ||
|
||||
(have_[0] == 237 && have_[1] > 159)))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
if ((have_[0] & 0xf8) == 0xf0)
|
||||
{
|
||||
auto const size = p_ - have_;
|
||||
if (have_[0] > 244 ||
|
||||
(size > 2 && (have_[2] & 0xc0) != 0x80))
|
||||
return false;
|
||||
if (size > 1 &&
|
||||
((have_[1] & 0xc0) != 0x80 ||
|
||||
(have_[0] == 240 && have_[1] < 144) ||
|
||||
(have_[0] == 244 && have_[1] > 143)))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
// Maps a lead byte to the total length in bytes of the UTF-8 sequence
// it introduces.
//
// Returns 0 for a byte that can never start a sequence: 0x80..0xC1
// and everything above 0xF4.
auto const needed =
    [](std::uint8_t const in)
    {
        if(in < 128)    // 0x00..0x7F
            return 1;
        if(in < 194)    // 0x80..0xC1
            return 0;
        if(in < 224)    // 0xC2..0xDF
            return 2;
        if(in < 240)    // 0xE0..0xEF
            return 3;
        if(in < 245)    // 0xF0..0xF4
            return 4;
        return 0;       // 0xF5..0xFF
    };
|
||||
|
||||
auto const end = in + size;
|
||||
if (need_ > 0)
|
||||
{
|
||||
auto n = std::min(size, need_);
|
||||
size -= n;
|
||||
need_ -= n;
|
||||
while(n--)
|
||||
*p_++ = *in++;
|
||||
if(need_ > 0)
|
||||
{
|
||||
BOOST_ASSERT(in == end);
|
||||
return valid_have();
|
||||
}
|
||||
std::uint8_t const* p = &have_[0];
|
||||
if (! valid(p))
|
||||
return false;
|
||||
p_ = have_;
|
||||
}
|
||||
|
||||
auto last = in + size - 7;
|
||||
while(in < last)
|
||||
{
|
||||
#if BEAST_WEBSOCKET_NO_UNALIGNED_READ
|
||||
auto constexpr align = sizeof(std::size_t) - 1;
|
||||
auto constexpr mask = static_cast<
|
||||
std::size_t>(0x8080808080808080 &
|
||||
~std::size_t{0});
|
||||
if(
|
||||
((reinterpret_cast<
|
||||
std::uintptr_t>(in) & align) == 0) &&
|
||||
(*reinterpret_cast<
|
||||
std::size_t const*>(in) & mask) == 0)
|
||||
in += sizeof(std::size_t);
|
||||
else if(! valid(in))
|
||||
return false;
|
||||
#else
|
||||
auto constexpr mask = static_cast<
|
||||
std::size_t>(0x8080808080808080 &
|
||||
~std::size_t{0});
|
||||
if(
|
||||
(*reinterpret_cast<
|
||||
std::size_t const*>(in) & mask) == 0)
|
||||
in += sizeof(std::size_t);
|
||||
else if(! valid(in))
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
last += 4;
|
||||
while(in < last)
|
||||
if(! valid(in))
|
||||
return false;
|
||||
|
||||
for(;;)
|
||||
{
|
||||
auto n = end - in;
|
||||
if(! n)
|
||||
break;
|
||||
auto const need = needed(*in);
|
||||
if (need == 0)
|
||||
return false;
|
||||
if(need <= n)
|
||||
{
|
||||
if(! valid(in))
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
need_ = need - n;
|
||||
while(n--)
|
||||
*p_++ = *in++;
|
||||
return valid_have();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
using utf8_checker = utf8_checker_t<>;
|
||||
@@ -154,7 +310,7 @@ bool
|
||||
check_utf8(char const* p, std::size_t n)
|
||||
{
|
||||
utf8_checker c;
|
||||
if(! c.write(p, n))
|
||||
if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
|
||||
return false;
|
||||
return c.finish();
|
||||
}
|
||||
|
@@ -24,7 +24,7 @@ public:
|
||||
testOneByteSequence()
|
||||
{
|
||||
utf8_checker utf8;
|
||||
std::array<std::uint8_t, 256> const buf =
|
||||
std::array<std::uint8_t, 256> buf =
|
||||
([]()
|
||||
{
|
||||
std::array<std::uint8_t, 256> values;
|
||||
@@ -47,6 +47,10 @@ public:
|
||||
for(auto it = std::next(buf.begin(), 245);
|
||||
it != buf.end(); ++it)
|
||||
BEAST_EXPECT(! utf8.write(&(*it), 1));
|
||||
|
||||
// Invalid sequence
|
||||
std::fill(buf.begin(), buf.end(), 0xFF);
|
||||
BEAST_EXPECT(! utf8.write(&buf.front(), buf.size()));
|
||||
}
|
||||
|
||||
void
|
||||
@@ -80,6 +84,11 @@ public:
|
||||
buf[1] = static_cast<std::uint8_t>(j);
|
||||
BEAST_EXPECT(! utf8.write(buf, 2));
|
||||
}
|
||||
|
||||
// Segmented sequence second byte invalid
|
||||
BEAST_EXPECT(utf8.write(buf, 1));
|
||||
BEAST_EXPECT(! utf8.write(&buf[1], 1));
|
||||
utf8.reset();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +130,11 @@ public:
|
||||
buf[2] = static_cast<std::uint8_t>(k);
|
||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||
}
|
||||
|
||||
// Segmented sequence third byte invalid
|
||||
BEAST_EXPECT(utf8.write(buf, 2));
|
||||
BEAST_EXPECT(! utf8.write(&buf[2], 1));
|
||||
utf8.reset();
|
||||
}
|
||||
|
||||
for(auto j = 0; j < b; ++j)
|
||||
@@ -136,6 +150,11 @@ public:
|
||||
buf[1] = static_cast<std::uint8_t>(j);
|
||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||
}
|
||||
|
||||
// Segmented sequence second byte invalid
|
||||
BEAST_EXPECT(utf8.write(buf, 1));
|
||||
BEAST_EXPECT(! utf8.write(&buf[1], 1));
|
||||
utf8.reset();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -154,7 +173,7 @@ public:
|
||||
std::int32_t const e = (i == 244 ? 143 : 191);
|
||||
for(auto j = b; j <= e; ++j)
|
||||
{
|
||||
// Second byte valid range 128-191 or 144-191 or 128-143
|
||||
// Second byte valid range 144-191 or 128-191 or 128-143
|
||||
buf[1] = static_cast<std::uint8_t>(j);
|
||||
|
||||
for(auto k = 128; k <= 191; ++k)
|
||||
@@ -183,6 +202,11 @@ public:
|
||||
buf[3] = static_cast<std::uint8_t>(n);
|
||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||
}
|
||||
|
||||
// Segmented sequence fourth byte invalid
|
||||
BEAST_EXPECT(utf8.write(buf, 3));
|
||||
BEAST_EXPECT(! utf8.write(&buf[3], 1));
|
||||
utf8.reset();
|
||||
}
|
||||
|
||||
for(auto k = 0; k <= 127; ++k)
|
||||
@@ -198,21 +222,31 @@ public:
|
||||
buf[2] = static_cast<std::uint8_t>(k);
|
||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||
}
|
||||
|
||||
// Segmented sequence third byte invalid
|
||||
BEAST_EXPECT(utf8.write(buf, 2));
|
||||
BEAST_EXPECT(! utf8.write(&buf[2], 1));
|
||||
utf8.reset();
|
||||
}
|
||||
|
||||
for(auto j = 0; j < b; ++j)
|
||||
{
|
||||
// Second byte invalid range 0-127 or 0-143
|
||||
buf[1] = static_cast<std::uint8_t>(j);
|
||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||
}
|
||||
|
||||
for(auto j = e + 1; j <= 255; ++j)
|
||||
{
|
||||
// Second byte invalid range 144-255 or 192-255
|
||||
buf[1] = static_cast<std::uint8_t>(j);
|
||||
BEAST_EXPECT(! utf8.write(buf, 3));
|
||||
BEAST_EXPECT(! utf8.write(buf, 4));
|
||||
}
|
||||
|
||||
// Segmented sequence second byte invalid
|
||||
BEAST_EXPECT(utf8.write(buf, 1));
|
||||
BEAST_EXPECT(! utf8.write(&buf[1], 1));
|
||||
utf8.reset();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -240,12 +274,14 @@ public:
|
||||
0xC3,0x81,0x72,0x76,0xC3,0xAD,0x7A,0x74,0xC5,0xB1,0x72,0xC5,
|
||||
0x91,0x20,0x74,0xC3,0xBC,0x6B,0xC3,0xB6,0x72,0x66,0xC3,0xBA,
|
||||
0x72,0xC3,0xB3,0x67,0xC3,0xA9,0x70
|
||||
}, {
|
||||
240, 144, 128, 128
|
||||
}
|
||||
};
|
||||
utf8_checker utf8;
|
||||
for(auto const& s : data)
|
||||
{
|
||||
static std::size_t constexpr size = 8;
|
||||
static std::size_t constexpr size = 3;
|
||||
std::size_t n = s.size();
|
||||
auto cb = consumed_buffers(
|
||||
boost::asio::const_buffers_1(
|
||||
|
Reference in New Issue
Block a user