Files
boost_beast/include/boost/beast/websocket/detail/utf8_checker.hpp

348 lines
8.8 KiB
C++
Raw Normal View History

//
// Copyright (c) 2016-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
//
// Official repository: https://github.com/boostorg/beast
//
2017-07-20 08:01:46 -07:00
2017-07-20 13:40:34 -07:00
#ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
#define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
2017-07-20 08:01:46 -07:00
2017-07-20 13:40:34 -07:00
#include <boost/beast/core/type_traits.hpp>
2017-07-20 08:01:46 -07:00
#include <boost/asio/buffer.hpp>
2016-10-28 19:43:30 -04:00
#include <boost/assert.hpp>
#include <algorithm>
2017-07-20 08:01:46 -07:00
#include <cstdint>
2016-10-28 19:43:30 -04:00
2017-07-20 13:40:34 -07:00
namespace boost {
2017-07-20 08:01:46 -07:00
namespace beast {
namespace websocket {
namespace detail {
2016-10-28 19:43:30 -04:00
/** A UTF8 validator.

    This validator can be used to check if a buffer containing UTF8 text is
    valid. The write function may be called incrementally with segmented UTF8
    sequences. The finish function determines if all processed text is valid.

    The object keeps a pointer into its own temporary buffer, so copying
    is implemented explicitly: a copy refers to its own buffer, never to
    the buffer of the object it was copied from.
*/
template<class = void>
class utf8_checker_t
{
    std::size_t need_ = 0;  // chars we need to finish the code point
    std::uint8_t* p_ = cp_; // current position in temp buffer
    std::uint8_t cp_[4];    // a temp buffer for the code point

    // Copy the saved partial code point of `other` into this object.
    // Only the bytes actually stored (cp_ .. p_) are read.
    void
    assign(utf8_checker_t const& other)
    {
        auto const have = other.p_ - other.cp_;
        for(auto i = have - have; i < have; ++i)
            cp_[i] = other.cp_[i];
        need_ = other.need_;
        p_ = cp_ + have;
    }

public:
    /// Construct a checker ready to validate a new text.
    utf8_checker_t() = default;

    /** Copy constructor.

        Duplicates the validation state. The internal position is
        re-based onto this object's own buffer; a member-wise copy would
        leave it pointing into `other`.
    */
    utf8_checker_t(utf8_checker_t const& other)
    {
        assign(other);
    }

    /** Copy assignment.

        Same semantics as the copy constructor. Self-assignment is safe.
    */
    utf8_checker_t&
    operator=(utf8_checker_t const& other)
    {
        assign(other);
        return *this;
    }

    /** Prepare to process text as valid utf8
    */
    void
    reset();

    /** Check that all processed text is valid utf8

        @return `true` if no code point was left incomplete.
    */
    bool
    finish();

    /** Check if text is valid UTF8

        @param in Pointer to the first byte to check.

        @param size The number of bytes pointed to by `in`.

        @return `true` if the text is valid utf8 or false otherwise.
    */
    bool
    write(std::uint8_t const* in, std::size_t size);

    /** Check if text is valid UTF8

        @param bs The sequence of buffers holding the text to check.

        @return `true` if the text is valid utf8 or false otherwise.
    */
    template<class ConstBufferSequence>
    bool
    write(ConstBufferSequence const& bs);
};
// Discard any partially collected code point so that the next call
// to write() starts at a code point boundary.
template<class _>
void
utf8_checker_t<_>::
reset()
{
    need_ = 0;  // nothing outstanding
    p_ = cp_;   // rewind to the start of the temp buffer
}
// Report whether the text seen so far ended on a code point boundary,
// then reset the checker so it can be reused.
template<class _>
bool
utf8_checker_t<_>::
finish()
{
    // A non-zero need_ means the input stopped in the middle of a
    // multi-byte code point, which makes the text invalid.
    auto const success = need_ == 0;
    reset();
    return success;
}
template<class _>
2016-10-28 19:43:30 -04:00
template<class ConstBufferSequence>
2017-07-20 08:01:46 -07:00
bool
utf8_checker_t<_>::
write(ConstBufferSequence const& bs)
2017-07-20 08:01:46 -07:00
{
static_assert(is_const_buffer_sequence<ConstBufferSequence>::value,
2016-10-28 19:43:30 -04:00
"ConstBufferSequence requirements not met");
2017-07-20 08:01:46 -07:00
using boost::asio::buffer_cast;
using boost::asio::buffer_size;
2017-06-13 11:53:06 -07:00
for(boost::asio::const_buffer b : bs)
2016-10-28 19:43:30 -04:00
if(! write(buffer_cast<std::uint8_t const*>(b),
2017-07-20 08:01:46 -07:00
buffer_size(b)))
return false;
return true;
}
// Validate `size` bytes starting at `in` as the continuation of the
// text passed to previous calls.
//
// Returns `false` as soon as the bytes seen so far cannot be part of
// well-formed UTF-8. Returns `true` otherwise; if the input ends in the
// middle of a code point the bytes received so far are saved in cp_
// and need_ records how many more are required, so that the next call
// (or finish()) can complete the check.
//
// Well-formed lead bytes are 0x00..0x7F, 0xC2..0xDF, 0xE0..0xEF and
// 0xF0..0xF4. Overlong forms, UTF-16 surrogates and values above
// U+10FFFF are rejected.
template<class _>
bool
utf8_checker_t<_>::
write(std::uint8_t const* in, std::size_t size)
{
    // Validate one complete code point starting at `p` and advance `p`
    // past it on success. The caller guarantees that every byte this
    // function inspects (at most 4, and never more than the length
    // announced by the lead byte) is readable.
    auto const valid =
        [](std::uint8_t const*& p)
        {
            if(p[0] < 128)
            {
                ++p;
                return true;
            }
            if((p[0] & 0xe0) == 0xc0)
            {
                // 0xC0 and 0xC1 could only encode values below
                // U+0080 (overlong form), which is ill-formed.
                if(p[0] < 0xc2 ||
                    (p[1] & 0xc0) != 0x80)
                    return false;
                p += 2;
                return true;
            }
            if((p[0] & 0xf0) == 0xe0)
            {
                if((p[1] & 0xc0) != 0x80 ||
                    (p[2] & 0xc0) != 0x80 ||
                    (p[0] == 224 && p[1] < 160) ||  // overlong
                    (p[0] == 237 && p[1] > 159))    // surrogate
                    return false;
                p += 3;
                return true;
            }
            if((p[0] & 0xf8) == 0xf0)
            {
                if(p[0] > 244 ||                    // above U+10FFFF
                    (p[1] & 0xc0) != 0x80 ||
                    (p[2] & 0xc0) != 0x80 ||
                    (p[3] & 0xc0) != 0x80 ||
                    (p[0] == 240 && p[1] < 144) ||  // overlong
                    (p[0] == 244 && p[1] > 143))    // above U+10FFFF
                    return false;
                p += 4;
                return true;
            }
            // Stray continuation byte or invalid lead (0xF8..0xFF)
            return false;
        };

    // Partial validation of the incomplete code point stored in
    // cp_ .. p_. Returns `false` if the bytes collected so far can
    // already be proven ill-formed.
    auto const valid_have =
        [&]()
        {
            auto const n = p_ - cp_;
            if((cp_[0] & 0xe0) == 0xc0)
                return cp_[0] >= 0xc2;
            if((cp_[0] & 0xf0) == 0xe0)
            {
                if(n > 1 &&
                    ((cp_[1] & 0xc0) != 0x80 ||
                     (cp_[0] == 224 && cp_[1] < 160) ||
                     (cp_[0] == 237 && cp_[1] > 159)))
                    return false;
                return true;
            }
            if((cp_[0] & 0xf8) == 0xf0)
            {
                if(cp_[0] > 244)
                    return false;
                if(n > 2 && (cp_[2] & 0xc0) != 0x80)
                    return false;
                if(n > 1 &&
                    ((cp_[1] & 0xc0) != 0x80 ||
                     (cp_[0] == 240 && cp_[1] < 144) ||
                     (cp_[0] == 244 && cp_[1] > 143)))
                    return false;
            }
            return true;
        };

    // Total length in bytes of the code point introduced by lead
    // byte `v`, or 0 if `v` can never start a well-formed code point.
    auto const needed =
        [](std::uint8_t const v)
        {
            if(v < 128)
                return 1;
            if(v < 0xc2)    // continuation byte or overlong lead
                return 0;
            if(v < 224)
                return 2;
            if(v < 240)
                return 3;
            if(v < 245)
                return 4;
            return 0;       // 0xF5..0xFF are never valid
        };

    auto const end = in + size;

    // Finish up any incomplete code point
    if(need_ > 0)
    {
        // Calculate what we have
        auto n = (std::min)(size, need_);
        size -= n;
        need_ -= n;

        // Add characters to the code point
        while(n--)
            *p_++ = *in++;
        BOOST_ASSERT(p_ <= cp_ + 4);

        // Still incomplete?
        if(need_ > 0)
        {
            // Incomplete code point
            BOOST_ASSERT(in == end);

            // Do partial validation on the incomplete
            // code point, this is called "Fail fast"
            // in Autobahn|Testsuite parlance.
            return valid_have();
        }

        // Complete code point, validate it. The temp buffer is
        // rewound first so it is empty whether or not validation
        // succeeds.
        std::uint8_t const* p = &cp_[0];
        p_ = cp_;
        if(! valid(p))
            return false;
    }

    // From here on `end - in` is the number of unprocessed bytes. All
    // loop bounds are expressed through it so that no pointer outside
    // of [in, end] is ever formed, even for an empty or null buffer.

    if(size <= sizeof(std::size_t))
        goto slow;

    // Align `in` to sizeof(std::size_t) boundary. More than
    // sizeof(std::size_t) bytes remain, so `last` is inside the buffer.
    {
        auto const last = reinterpret_cast<std::uint8_t const*>(
            ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
                sizeof(std::size_t)) * sizeof(std::size_t));

        // Check one character at a time for low-ASCII
        while(in < last)
        {
            // Not low-ASCII so switch to slow loop
            if(*in & 0x80)
                goto slow;
            ++in;
        }
    }

    // Fast loop: Process 4 or 8 low-ASCII characters at a time
    {
        auto constexpr mask = static_cast<
            std::size_t>(0x8080808080808080 & ~std::size_t{0});
        while(end - in > 7)
        {
            // Technically UB (type punning) but works on all known
            // platforms; `in` is suitably aligned here.
            if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
                break;
            in += sizeof(std::size_t);
        }
    }

slow:
    // Slow loop: Full validation on one code point at a time.
    // At least 4 bytes are readable on every iteration, which is the
    // most that `valid` will inspect.
    while(end - in > 3)
        if(! valid(in))
            return false;

    // Handle the remaining bytes. The last
    // characters could split a code point so
    // we save the partial code point for later.
    //
    // On entry to the loop, `in` points to the
    // beginning of a code point.
    //
    while(in != end)
    {
        // Number of chars left
        auto n = end - in;

        // Chars we need to finish this code point
        auto const need = needed(*in);
        if(need == 0)
            return false;

        if(need <= n)
        {
            // Check a whole code point
            if(! valid(in))
                return false;
        }
        else
        {
            // Calculate how many chars we need
            // to finish this partial code point
            need_ = static_cast<std::size_t>(need - n);

            // Save the partial code point
            while(n--)
                *p_++ = *in++;
            BOOST_ASSERT(in == end);
            BOOST_ASSERT(p_ <= cp_ + 4);

            // Do partial validation on the incomplete
            // code point, this is called "Fail fast"
            // in Autobahn|Testsuite parlance.
            return valid_have();
        }
    }
    return true;
}
// The checker type used throughout the implementation.
using utf8_checker = utf8_checker_t<>;

// Check a complete, contiguous character buffer in one call.
//
// Returns `true` only if the `n` bytes at `p` are accepted by
// utf8_checker::write and do not end in the middle of a code point.
template<class = void>
bool
check_utf8(char const* p, std::size_t n)
{
    utf8_checker c;
    if(! c.write(reinterpret_cast<std::uint8_t const*>(p), n))
        return false;
    return c.finish();
}
} // detail
} // websocket
} // beast
2017-07-20 13:40:34 -07:00
} // boost
2017-07-20 08:01:46 -07:00
#endif