diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e691d5a..11759d56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +1.0.0-b19 + +* Optimize utf8 validation + +-------------------------------------------------------------------------------- + 1.0.0-b18 * Increase optimization settings for MSVC builds diff --git a/include/beast/websocket/detail/utf8_checker.hpp b/include/beast/websocket/detail/utf8_checker.hpp index 6960819d..355ec03c 100644 --- a/include/beast/websocket/detail/utf8_checker.hpp +++ b/include/beast/websocket/detail/utf8_checker.hpp @@ -9,18 +9,22 @@ #define BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP #include +#include +#include +#include #include -#include // DEPRECATED + +#include +#include namespace beast { namespace websocket { namespace detail { -// Code adapted from -// http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ -/* - Copyright (c) 2008-2009 Bjoern Hoehrmann +/* This is a modified work. + Original version and license: + https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including @@ -39,100 +43,97 @@ namespace detail { ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * + + Additional changes: + Optimized for predominantly 7-bit content, 2016 + https://github.com/uWebSockets/uWebSockets/blob/755bd362649c06abff102f18e273c5792c51c1a0/src/WebSocketProtocol.h#L198 + Copyright (c) 2016 Alex Hultman and contributors + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgement in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +/** A UTF8 validator. + + This validator can be used to check if a buffer containing UTF8 text is + valid. The write function may be called incrementally with segmented UTF8 + sequences. The finish function determines if all processed text is valid. */ template class utf8_checker_t { - // Table for the UTF8 decode state machine - using lut_type = std::uint8_t[400]; - static - lut_type const& - lut() - { - // 400 elements - static std::uint8_t constexpr tab[] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df - 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef - 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff - 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 - 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 - 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 - 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1 // s7..s8 - }; - return tab; - } - - std::uint32_t state_ = 0; - std::uint32_t codepoint_ = 0; + std::size_t need_ = 0; + std::uint8_t* p_ = have_; + std::uint8_t have_[4]; public: + /** Prepare to process text as valid utf8 + */ void reset(); - // Returns `true` on success - bool - write(void const* buffer, std::size_t size); - - // Returns `true` on success - template - bool - write(BufferSequence const& bs); - - // Returns `true` on success + /** Check that all processed text is valid utf8 + */ bool finish(); + + /** Check if text is valid UTF8 + + @return `true` if the text is valid utf8 or false otherwise. + */ + bool + write(std::uint8_t const* in, std::size_t size); + + /** Check if text is valid UTF8 + + @return `true` if the text is valid utf8 or false otherwise. + */ + template + bool + write(ConstBufferSequence const& bs); }; template void utf8_checker_t<_>::reset() { - state_ = 0; - codepoint_ = 0; + need_ = 0; + p_ = have_; } template bool -utf8_checker_t<_>::write(void const* buffer, std::size_t size) +utf8_checker_t<_>::finish() { - auto p = static_cast(buffer); - auto plut = &lut()[0]; - while(size) - { - auto const byte = *p; - auto const type = plut[byte]; - if(state_) - codepoint_ = (byte & 0x3fu) | (codepoint_ << 6); - else - codepoint_ = (0xff >> type) & byte; - state_ = plut[256 + state_ * 16 + type]; - if(state_ == 1) - { - reset(); - return false; - } - ++p; - --size; - } - return true; + auto const success = need_ == 0; + reset(); + return success; } template -template +template bool -utf8_checker_t<_>::write(BufferSequence const& bs) +utf8_checker_t<_>::write(ConstBufferSequence const& bs) { + static_assert(is_ConstBufferSequence::value, + "ConstBufferSequence requirements not met"); using boost::asio::buffer_cast; using boost::asio::buffer_size; for(auto const& b : bs) - if(! write(buffer_cast(b), + if(! write(buffer_cast(b), buffer_size(b))) return false; return true; @@ -140,11 +141,166 @@ utf8_checker_t<_>::write(BufferSequence const& bs) template bool -utf8_checker_t<_>::finish() +utf8_checker_t<_>::write(std::uint8_t const* in, std::size_t size) { - auto const success = state_ == 0; - reset(); - return success; + auto const valid = + [](std::uint8_t const*& in) + { + if (in[0] < 128) + { + ++in; + return true; + } + if ((in[0] & 0x60) == 0x40) + { + if ((in[0] & 0xfe) == 0xc0 || + (in[1] & 0xc0) != 0x80) + return false; + in += 2; + return true; + } + if ((in[0] & 0xf0) == 0xe0) + { + if ((in[1] & 0xc0) != 0x80 || + (in[2] & 0xc0) != 0x80 || + (in[0] == 224 && in[1] < 160) || + (in[0] == 237 && in[1] > 159)) + return false; + in += 3; + return true; + } + if ((in[0] & 0xf8) == 0xf0) + { + if (in[0] > 244 || + (in[1] & 0xc0) != 0x80 || + (in[2] & 0xc0) != 0x80 || + (in[3] & 0xc0) != 0x80 || + (in[0] == 240 && in[1] < 144) || + (in[0] == 244 && in[1] > 143)) + return false; + in += 4; + return true; + } + return false; + }; + auto const valid_have = + [&]() + { + if ((have_[0] & 0x60) == 0x40) + return have_[0] <= 223; + if ((have_[0] & 0xf0) == 0xe0) + { + if (p_ - have_ > 1 && + ((have_[1] & 0xc0) != 0x80 || + (have_[0] == 224 && have_[1] < 160) || + (have_[0] == 237 && have_[1] > 159))) + return false; + return true; + } + if ((have_[0] & 0xf8) == 0xf0) + { + auto const size = p_ - have_; + if (have_[0] > 244 || + (size > 2 && (have_[2] & 0xc0) != 0x80)) + return false; + if (size > 1 && + ((have_[1] & 0xc0) != 0x80 || + (have_[0] == 240 && have_[1] < 144) || + (have_[0] == 244 && have_[1] > 143))) + return false; + } + return true; + }; + auto const needed = + [](std::uint8_t const in) + { + if (in < 128) + return 1; + if (in < 194) + return 0; + if (in < 224) + return 2; + if (in < 240) + return 3; + if (in < 245) + return 4; + return 0; + }; + + auto const end = in + size; + if (need_ > 0) + { + auto n = std::min(size, need_); + size -= n; + need_ -= n; + while(n--) + *p_++ = *in++; + if(need_ > 0) + { + BOOST_ASSERT(in == end); + return valid_have(); + } + std::uint8_t const* p = &have_[0]; + if (! valid(p)) + return false; + p_ = have_; + } + + auto last = in + size - 7; + while(in < last) + { +#if BEAST_WEBSOCKET_NO_UNALIGNED_READ + auto constexpr align = sizeof(std::size_t) - 1; + auto constexpr mask = static_cast< + std::size_t>(0x8080808080808080 & + ~std::size_t{0}); + if( + ((reinterpret_cast< + std::uintptr_t>(in) & align) == 0) && + (*reinterpret_cast< + std::size_t const*>(in) & mask) == 0) + in += sizeof(std::size_t); + else if(! valid(in)) + return false; +#else + auto constexpr mask = static_cast< + std::size_t>(0x8080808080808080 & + ~std::size_t{0}); + if( + (*reinterpret_cast< + std::size_t const*>(in) & mask) == 0) + in += sizeof(std::size_t); + else if(! valid(in)) + return false; +#endif + } + last += 4; + while(in < last) + if(! valid(in)) + return false; + + for(;;) + { + auto n = end - in; + if(! n) + break; + auto const need = needed(*in); + if (need == 0) + return false; + if(need <= n) + { + if(! valid(in)) + return false; + } + else + { + need_ = need - n; + while(n--) + *p_++ = *in++; + return valid_have(); + } + } + return true; } using utf8_checker = utf8_checker_t<>; @@ -154,7 +310,7 @@ bool check_utf8(char const* p, std::size_t n) { utf8_checker c; - if(! c.write(p, n)) + if(! c.write(reinterpret_cast(p), n)) return false; return c.finish(); } diff --git a/test/websocket/utf8_checker.cpp b/test/websocket/utf8_checker.cpp index 14749fda..20ac7996 100644 --- a/test/websocket/utf8_checker.cpp +++ b/test/websocket/utf8_checker.cpp @@ -24,7 +24,7 @@ public: testOneByteSequence() { utf8_checker utf8; - std::array const buf = + std::array buf = ([]() { std::array values; @@ -47,6 +47,10 @@ public: for(auto it = std::next(buf.begin(), 245); it != buf.end(); ++it) BEAST_EXPECT(! utf8.write(&(*it), 1)); + + // Invalid sequence + std::fill(buf.begin(), buf.end(), 0xFF); + BEAST_EXPECT(! utf8.write(&buf.front(), buf.size())); } void @@ -80,6 +84,11 @@ public: buf[1] = static_cast(j); BEAST_EXPECT(! utf8.write(buf, 2)); } + + // Segmented sequence second byte invalid + BEAST_EXPECT(utf8.write(buf, 1)); + BEAST_EXPECT(! utf8.write(&buf[1], 1)); + utf8.reset(); } } @@ -121,6 +130,11 @@ public: buf[2] = static_cast(k); BEAST_EXPECT(! utf8.write(buf, 3)); } + + // Segmented sequence third byte invalid + BEAST_EXPECT(utf8.write(buf, 2)); + BEAST_EXPECT(! utf8.write(&buf[2], 1)); + utf8.reset(); } for(auto j = 0; j < b; ++j) @@ -136,6 +150,11 @@ public: buf[1] = static_cast(j); BEAST_EXPECT(! utf8.write(buf, 3)); } + + // Segmented sequence second byte invalid + BEAST_EXPECT(utf8.write(buf, 1)); + BEAST_EXPECT(! utf8.write(&buf[1], 1)); + utf8.reset(); } } @@ -154,7 +173,7 @@ public: std::int32_t const e = (i == 244 ? 143 : 191); for(auto j = b; j <= e; ++j) { - // Second byte valid range 128-191 or 144-191 or 128-143 + // Second byte valid range 144-191 or 128-191 or 128-143 buf[1] = static_cast(j); for(auto k = 128; k <= 191; ++k) @@ -183,6 +202,11 @@ public: buf[3] = static_cast(n); BEAST_EXPECT(! utf8.write(buf, 4)); } + + // Segmented sequence fourth byte invalid + BEAST_EXPECT(utf8.write(buf, 3)); + BEAST_EXPECT(! utf8.write(&buf[3], 1)); + utf8.reset(); } for(auto k = 0; k <= 127; ++k) @@ -198,21 +222,31 @@ public: buf[2] = static_cast(k); BEAST_EXPECT(! utf8.write(buf, 4)); } + + // Segmented sequence third byte invalid + BEAST_EXPECT(utf8.write(buf, 2)); + BEAST_EXPECT(! utf8.write(&buf[2], 1)); + utf8.reset(); } for(auto j = 0; j < b; ++j) { // Second byte invalid range 0-127 or 0-143 buf[1] = static_cast(j); - BEAST_EXPECT(! utf8.write(buf, 3)); + BEAST_EXPECT(! utf8.write(buf, 4)); } for(auto j = e + 1; j <= 255; ++j) { // Second byte invalid range 144-255 or 192-255 buf[1] = static_cast(j); - BEAST_EXPECT(! utf8.write(buf, 3)); + BEAST_EXPECT(! utf8.write(buf, 4)); } + + // Segmented sequence second byte invalid + BEAST_EXPECT(utf8.write(buf, 1)); + BEAST_EXPECT(! utf8.write(&buf[1], 1)); + utf8.reset(); } } @@ -240,12 +274,14 @@ public: 0xC3,0x81,0x72,0x76,0xC3,0xAD,0x7A,0x74,0xC5,0xB1,0x72,0xC5, 0x91,0x20,0x74,0xC3,0xBC,0x6B,0xC3,0xB6,0x72,0x66,0xC3,0xBA, 0x72,0xC3,0xB3,0x67,0xC3,0xA9,0x70 + }, { + 240, 144, 128, 128 } }; utf8_checker utf8; for(auto const& s : data) { - static std::size_t constexpr size = 8; + static std::size_t constexpr size = 3; std::size_t n = s.size(); auto cb = consumed_buffers( boost::asio::const_buffers_1(