Optimize utf8 validation

This commit is contained in:
Miguel Portilla
2016-10-28 19:43:30 -04:00
committed by Vinnie Falco
parent 804038095c
commit d91fbd0611
3 changed files with 277 additions and 79 deletions

View File

@@ -1,3 +1,9 @@
1.0.0-b19
* Optimize utf8 validation
--------------------------------------------------------------------------------
1.0.0-b18
* Increase optimization settings for MSVC builds

View File

@@ -9,18 +9,22 @@
#define BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
#include <boost/asio/buffer.hpp>
#include <boost/assert.hpp>
#include <beast/core/buffer_concepts.hpp>
#include <algorithm>
#include <cstdint>
#include <string> // DEPRECATED
#include <beast/unit_test/dstream.hpp>
#include <iostream>
namespace beast {
namespace websocket {
namespace detail {
// Code adapted from
// http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
/*
Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
/* This is a modified work.
Original version and license:
https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
@@ -39,100 +43,97 @@ namespace detail {
ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *
Additional changes:
Optimized for predominantly 7-bit content, 2016
https://github.com/uWebSockets/uWebSockets/blob/755bd362649c06abff102f18e273c5792c51c1a0/src/WebSocketProtocol.h#L198
Copyright (c) 2016 Alex Hultman and contributors
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgement in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
/** A UTF8 validator.
This validator can be used to check if a buffer containing UTF8 text is
valid. The write function may be called incrementally with segmented UTF8
sequences. The finish function determines if all processed text is valid.
*/
template<class = void>
class utf8_checker_t
{
// Table for the UTF8 decode state machine
using lut_type = std::uint8_t[400];
static
lut_type const&
lut()
{
// 400 elements
static std::uint8_t constexpr tab[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // s7..s8
};
return tab;
}
std::uint32_t state_ = 0;
std::uint32_t codepoint_ = 0;
std::size_t need_ = 0;
std::uint8_t* p_ = have_;
std::uint8_t have_[4];
public:
/** Prepare to process text as valid utf8
*/
void
reset();
// Returns `true` on success
bool
write(void const* buffer, std::size_t size);
// Returns `true` on success
template<class BufferSequence>
bool
write(BufferSequence const& bs);
// Returns `true` on success
/** Check that all processed text is valid utf8
*/
bool
finish();
/** Check if text is valid UTF8
@return `true` if the text is valid utf8 or false otherwise.
*/
bool
write(std::uint8_t const* in, std::size_t size);
/** Check if text is valid UTF8
@return `true` if the text is valid utf8 or false otherwise.
*/
template<class ConstBufferSequence>
bool
write(ConstBufferSequence const& bs);
};
template<class _>
void
utf8_checker_t<_>::reset()
{
state_ = 0;
codepoint_ = 0;
need_ = 0;
p_ = have_;
}
template<class _>
bool
utf8_checker_t<_>::write(void const* buffer, std::size_t size)
{
auto p = static_cast<std::uint8_t const*>(buffer);
auto plut = &lut()[0];
while(size)
{
auto const byte = *p;
auto const type = plut[byte];
if(state_)
codepoint_ = (byte & 0x3fu) | (codepoint_ << 6);
else
codepoint_ = (0xff >> type) & byte;
state_ = plut[256 + state_ * 16 + type];
if(state_ == 1)
utf8_checker_t<_>::finish()
{
auto const success = need_ == 0;
reset();
return false;
}
++p;
--size;
}
return true;
return success;
}
template<class _>
template<class BufferSequence>
template<class ConstBufferSequence>
bool
utf8_checker_t<_>::write(BufferSequence const& bs)
utf8_checker_t<_>::write(ConstBufferSequence const& bs)
{
static_assert(is_ConstBufferSequence<ConstBufferSequence>::value,
"ConstBufferSequence requirements not met");
using boost::asio::buffer_cast;
using boost::asio::buffer_size;
for(auto const& b : bs)
if(! write(buffer_cast<void const*>(b),
if(! write(buffer_cast<std::uint8_t const*>(b),
buffer_size(b)))
return false;
return true;
@@ -140,11 +141,166 @@ utf8_checker_t<_>::write(BufferSequence const& bs)
template<class _>
bool
utf8_checker_t<_>::finish()
utf8_checker_t<_>::write(std::uint8_t const* in, std::size_t size)
{
auto const success = state_ == 0;
reset();
return success;
// Validate one complete UTF-8 sequence beginning at `in`.
//
// On success `in` is advanced past the sequence and `true` is
// returned. On failure `in` is left untouched and `false` is
// returned. The caller must guarantee that every byte of the
// sequence announced by the lead byte is readable.
auto const valid =
    [](std::uint8_t const*& in)
    {
        // True for continuation bytes of the form 10xxxxxx
        auto const is_cont =
            [](std::uint8_t const b)
            {
                return (b & 0xc0) == 0x80;
            };

        auto const lead = in[0];

        // Single byte (7-bit) code point
        if (lead < 128)
        {
            ++in;
            return true;
        }

        // Two byte sequence: the high bit is known to be set here,
        // so this test selects lead bytes 0xC0..0xDF.
        if ((lead & 0x60) == 0x40)
        {
            // 0xC0 and 0xC1 would encode an overlong form
            if ((lead & 0xfe) != 0xc0 && is_cont(in[1]))
            {
                in += 2;
                return true;
            }
            return false;
        }

        // Three byte sequence: lead bytes 0xE0..0xEF
        if ((lead & 0xf0) == 0xe0)
        {
            // 0xE0 needs a second byte >= 0xA0 (no overlongs),
            // 0xED needs a second byte <= 0x9F (no surrogates).
            if (is_cont(in[1]) &&
                is_cont(in[2]) &&
                !(lead == 224 && in[1] < 160) &&
                !(lead == 237 && in[1] > 159))
            {
                in += 3;
                return true;
            }
            return false;
        }

        // Four byte sequence: lead bytes 0xF0..0xF7
        if ((lead & 0xf8) == 0xf0)
        {
            // Lead bytes above 0xF4 exceed U+10FFFF,
            // 0xF0 needs a second byte >= 0x90 (no overlongs),
            // 0xF4 needs a second byte <= 0x8F (max U+10FFFF).
            if (lead <= 244 &&
                is_cont(in[1]) &&
                is_cont(in[2]) &&
                is_cont(in[3]) &&
                !(lead == 240 && in[1] < 144) &&
                !(lead == 244 && in[1] > 143))
            {
                in += 4;
                return true;
            }
            return false;
        }

        // Stray continuation byte or lead byte 0xF8..0xFF
        return false;
    };
// Check the bytes of an incomplete sequence held in `have_`.
//
// Only the bytes stored so far (the range [have_, p_)) are
// examined, so an invalid sequence can be rejected before its
// remaining bytes arrive. Returns `false` when the stored prefix
// can no longer become a valid sequence, otherwise `true`.
auto const valid_have =
    [&]()
    {
        auto const lead = have_[0];
        // Number of bytes currently stored
        auto const stored = p_ - have_;

        // Two byte sequence: only the lead byte can be stored
        if ((lead & 0x60) == 0x40)
            return lead <= 223;

        // Three byte sequence
        if ((lead & 0xf0) == 0xe0)
        {
            if (stored <= 1)
                return true;
            auto const second = have_[1];
            return (second & 0xc0) == 0x80 &&
                !(lead == 224 && second < 160) &&
                !(lead == 237 && second > 159);
        }

        // Four byte sequence
        if ((lead & 0xf8) == 0xf0)
        {
            if (lead > 244)
                return false;
            // Third byte, when present, must be a continuation
            if (stored > 2 && (have_[2] & 0xc0) != 0x80)
                return false;
            // Second byte, when present, must be a continuation
            // inside the range permitted for this lead byte
            if (stored > 1)
            {
                auto const second = have_[1];
                if ((second & 0xc0) != 0x80 ||
                    (lead == 240 && second < 144) ||
                    (lead == 244 && second > 143))
                    return false;
            }
        }
        return true;
    };
// Map a lead byte to the total length in bytes of the sequence
// it introduces. Returns 0 for a byte that can never begin a
// valid sequence.
auto const needed =
    [](std::uint8_t const in)
    {
        // 128..193: continuation bytes and overlong two byte leads
        // 245..255: lead bytes beyond U+10FFFF
        if (in >= 245 || (in >= 128 && in < 194))
            return 0;
        if (in >= 240)
            return 4;
        if (in >= 224)
            return 3;
        if (in >= 194)
            return 2;
        return 1;
    };
// One past the last byte of the caller's buffer
auto const end = in + size;

// Finish a sequence that was split by the end of a previous buffer
if (need_ > 0)
{
    // Take as many of the missing bytes as this buffer can supply
    auto n = std::min(size, need_);
    size -= n;
    need_ -= n;
    while(n--)
        *p_++ = *in++;
    if(need_ > 0)
    {
        // Still incomplete: judge only the bytes stored so far
        BOOST_ASSERT(in == end);
        return valid_have();
    }
    // The stored sequence is now complete; validate it as a whole
    std::uint8_t const* p = &have_[0];
    if (! valid(p))
        return false;
    p_ = have_;
}

// Fast path: while at least 8 bytes remain, skip a whole machine
// word at a time when none of its bytes has the high bit set.
//
// The loop bounds are expressed as remaining byte counts instead
// of a precomputed `in + size - 7` pointer. For buffers shorter
// than 7 bytes that expression pointed before the start of the
// buffer, which is undefined behavior, and for an empty buffer
// with a null data pointer it could wrap to a huge address and
// send the loop into out of bounds reads.
while(end - in > 7)
{
#if BEAST_WEBSOCKET_NO_UNALIGNED_READ
    // Only read a word when `in` is suitably aligned
    auto constexpr align = sizeof(std::size_t) - 1;
    auto constexpr mask = static_cast<
        std::size_t>(0x8080808080808080 &
            ~std::size_t{0});
    if(
        ((reinterpret_cast<
            std::uintptr_t>(in) & align) == 0) &&
        (*reinterpret_cast<
            std::size_t const*>(in) & mask) == 0)
        in += sizeof(std::size_t);
    else if(! valid(in))
        return false;
#else
    auto constexpr mask = static_cast<
        std::size_t>(0x8080808080808080 &
            ~std::size_t{0});
    if(
        (*reinterpret_cast<
            std::size_t const*>(in) & mask) == 0)
        in += sizeof(std::size_t);
    else if(! valid(in))
        return false;
#endif
}

// While at least 4 bytes remain, valid() may read a full
// sequence of any length without running past the buffer
while(end - in > 3)
    if(! valid(in))
        return false;

// Fewer than 4 bytes remain: check that each sequence fits
// before validating it, and stash a trailing partial sequence
for(;;)
{
    auto n = end - in;
    if(! n)
        break;
    auto const need = needed(*in);
    if (need == 0)
        return false;
    if(need <= n)
    {
        if(! valid(in))
            return false;
    }
    else
    {
        // Remember the partial sequence for the next call
        need_ = need - n;
        while(n--)
            *p_++ = *in++;
        return valid_have();
    }
}
return true;
}
using utf8_checker = utf8_checker_t<>;
@@ -154,7 +310,7 @@ bool
check_utf8(char const* p, std::size_t n)
{
utf8_checker c;
if(! c.write(p, n))
if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
return false;
return c.finish();
}

View File

@@ -24,7 +24,7 @@ public:
testOneByteSequence()
{
utf8_checker utf8;
std::array<std::uint8_t, 256> const buf =
std::array<std::uint8_t, 256> buf =
([]()
{
std::array<std::uint8_t, 256> values;
@@ -47,6 +47,10 @@ public:
for(auto it = std::next(buf.begin(), 245);
it != buf.end(); ++it)
BEAST_EXPECT(! utf8.write(&(*it), 1));
// Invalid sequence
std::fill(buf.begin(), buf.end(), 0xFF);
BEAST_EXPECT(! utf8.write(&buf.front(), buf.size()));
}
void
@@ -80,6 +84,11 @@ public:
buf[1] = static_cast<std::uint8_t>(j);
BEAST_EXPECT(! utf8.write(buf, 2));
}
// Segmented sequence second byte invalid
BEAST_EXPECT(utf8.write(buf, 1));
BEAST_EXPECT(! utf8.write(&buf[1], 1));
utf8.reset();
}
}
@@ -121,6 +130,11 @@ public:
buf[2] = static_cast<std::uint8_t>(k);
BEAST_EXPECT(! utf8.write(buf, 3));
}
// Segmented sequence third byte invalid
BEAST_EXPECT(utf8.write(buf, 2));
BEAST_EXPECT(! utf8.write(&buf[2], 1));
utf8.reset();
}
for(auto j = 0; j < b; ++j)
@@ -136,6 +150,11 @@ public:
buf[1] = static_cast<std::uint8_t>(j);
BEAST_EXPECT(! utf8.write(buf, 3));
}
// Segmented sequence second byte invalid
BEAST_EXPECT(utf8.write(buf, 1));
BEAST_EXPECT(! utf8.write(&buf[1], 1));
utf8.reset();
}
}
@@ -154,7 +173,7 @@ public:
std::int32_t const e = (i == 244 ? 143 : 191);
for(auto j = b; j <= e; ++j)
{
// Second byte valid range 128-191 or 144-191 or 128-143
// Second byte valid range 144-191 or 128-191 or 128-143
buf[1] = static_cast<std::uint8_t>(j);
for(auto k = 128; k <= 191; ++k)
@@ -183,6 +202,11 @@ public:
buf[3] = static_cast<std::uint8_t>(n);
BEAST_EXPECT(! utf8.write(buf, 4));
}
// Segmented sequence fourth byte invalid
BEAST_EXPECT(utf8.write(buf, 3));
BEAST_EXPECT(! utf8.write(&buf[3], 1));
utf8.reset();
}
for(auto k = 0; k <= 127; ++k)
@@ -198,21 +222,31 @@ public:
buf[2] = static_cast<std::uint8_t>(k);
BEAST_EXPECT(! utf8.write(buf, 4));
}
// Segmented sequence third byte invalid
BEAST_EXPECT(utf8.write(buf, 2));
BEAST_EXPECT(! utf8.write(&buf[2], 1));
utf8.reset();
}
for(auto j = 0; j < b; ++j)
{
// Second byte invalid range 0-127 or 0-143
buf[1] = static_cast<std::uint8_t>(j);
BEAST_EXPECT(! utf8.write(buf, 3));
BEAST_EXPECT(! utf8.write(buf, 4));
}
for(auto j = e + 1; j <= 255; ++j)
{
// Second byte invalid range 144-255 or 192-255
buf[1] = static_cast<std::uint8_t>(j);
BEAST_EXPECT(! utf8.write(buf, 3));
BEAST_EXPECT(! utf8.write(buf, 4));
}
// Segmented sequence second byte invalid
BEAST_EXPECT(utf8.write(buf, 1));
BEAST_EXPECT(! utf8.write(&buf[1], 1));
utf8.reset();
}
}
@@ -240,12 +274,14 @@ public:
0xC3,0x81,0x72,0x76,0xC3,0xAD,0x7A,0x74,0xC5,0xB1,0x72,0xC5,
0x91,0x20,0x74,0xC3,0xBC,0x6B,0xC3,0xB6,0x72,0x66,0xC3,0xBA,
0x72,0xC3,0xB3,0x67,0xC3,0xA9,0x70
}, {
240, 144, 128, 128
}
};
utf8_checker utf8;
for(auto const& s : data)
{
static std::size_t constexpr size = 8;
static std::size_t constexpr size = 3;
std::size_t n = s.size();
auto cb = consumed_buffers(
boost::asio::const_buffers_1(