2017-07-20 08:01:46 -07:00
|
|
|
//
|
2017-07-24 09:42:36 -07:00
|
|
|
// Copyright (c) 2016-2017 Vinnie Falco (vinnie dot falco at gmail dot com)
|
2017-07-20 08:01:46 -07:00
|
|
|
//
|
|
|
|
|
// Distributed under the Boost Software License, Version 1.0. (See accompanying
|
|
|
|
|
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
|
//
|
2017-07-20 13:40:34 -07:00
|
|
|
// Official repository: https://github.com/boostorg/beast
|
|
|
|
|
//
|
2017-07-20 08:01:46 -07:00
|
|
|
|
2017-07-20 13:40:34 -07:00
|
|
|
#ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
|
|
|
|
|
#define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_HPP
|
2017-07-20 08:01:46 -07:00
|
|
|
|
2017-07-20 13:40:34 -07:00
|
|
|
#include <boost/beast/core/type_traits.hpp>
#include <boost/asio/buffer.hpp>
#include <boost/assert.hpp>
#include <algorithm>
#include <cstdint>
#include <cstring>
|
2016-10-28 19:43:30 -04:00
|
|
|
|
2017-07-20 13:40:34 -07:00
|
|
|
namespace boost {
|
2017-07-20 08:01:46 -07:00
|
|
|
namespace beast {
|
|
|
|
|
namespace websocket {
|
|
|
|
|
namespace detail {
|
|
|
|
|
|
2016-10-28 19:43:30 -04:00
|
|
|
/** A UTF8 validator.
|
|
|
|
|
|
|
|
|
|
This validator can be used to check if a buffer containing UTF8 text is
|
|
|
|
|
valid. The write function may be called incrementally with segmented UTF8
|
|
|
|
|
sequences. The finish function determines if all processed text is valid.
|
2017-07-20 08:01:46 -07:00
|
|
|
*/
|
|
|
|
|
template<class = void>
|
|
|
|
|
class utf8_checker_t
|
|
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
std::size_t need_ = 0; // chars we need to finish the code point
|
|
|
|
|
std::uint8_t* p_ = cp_; // current position in temp buffer
|
|
|
|
|
std::uint8_t cp_[4]; // a temp buffer for the code point
|
2017-07-20 08:01:46 -07:00
|
|
|
|
|
|
|
|
public:
|
2016-10-28 19:43:30 -04:00
|
|
|
/** Prepare to process text as valid utf8
|
|
|
|
|
*/
|
2017-07-20 08:01:46 -07:00
|
|
|
void
|
|
|
|
|
reset();
|
|
|
|
|
|
2016-10-28 19:43:30 -04:00
|
|
|
/** Check that all processed text is valid utf8
|
|
|
|
|
*/
|
2017-07-20 08:01:46 -07:00
|
|
|
bool
|
2016-10-28 19:43:30 -04:00
|
|
|
finish();
|
|
|
|
|
|
|
|
|
|
/** Check if text is valid UTF8
|
2017-07-20 08:01:46 -07:00
|
|
|
|
2016-10-28 19:43:30 -04:00
|
|
|
@return `true` if the text is valid utf8 or false otherwise.
|
|
|
|
|
*/
|
2017-07-20 08:01:46 -07:00
|
|
|
bool
|
2016-10-28 19:43:30 -04:00
|
|
|
write(std::uint8_t const* in, std::size_t size);
|
|
|
|
|
|
|
|
|
|
/** Check if text is valid UTF8
|
2017-07-20 08:01:46 -07:00
|
|
|
|
2016-10-28 19:43:30 -04:00
|
|
|
@return `true` if the text is valid utf8 or false otherwise.
|
|
|
|
|
*/
|
|
|
|
|
template<class ConstBufferSequence>
|
2017-07-20 08:01:46 -07:00
|
|
|
bool
|
2016-10-28 19:43:30 -04:00
|
|
|
write(ConstBufferSequence const& bs);
|
2017-07-20 08:01:46 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
template<class _>
|
|
|
|
|
void
|
2017-05-10 12:03:00 -07:00
|
|
|
utf8_checker_t<_>::
|
|
|
|
|
reset()
|
2017-07-20 08:01:46 -07:00
|
|
|
{
|
2016-10-28 19:43:30 -04:00
|
|
|
need_ = 0;
|
2017-08-13 20:46:01 -07:00
|
|
|
p_ = cp_;
|
2017-07-20 08:01:46 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<class _>
|
|
|
|
|
bool
|
2017-05-10 12:03:00 -07:00
|
|
|
utf8_checker_t<_>::
|
|
|
|
|
finish()
|
2017-07-20 08:01:46 -07:00
|
|
|
{
|
2016-10-28 19:43:30 -04:00
|
|
|
auto const success = need_ == 0;
|
|
|
|
|
reset();
|
|
|
|
|
return success;
|
2017-07-20 08:01:46 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<class _>
|
2016-10-28 19:43:30 -04:00
|
|
|
template<class ConstBufferSequence>
|
2017-07-20 08:01:46 -07:00
|
|
|
bool
|
2017-05-10 12:03:00 -07:00
|
|
|
utf8_checker_t<_>::
|
|
|
|
|
write(ConstBufferSequence const& bs)
|
2017-07-20 08:01:46 -07:00
|
|
|
{
|
2017-09-07 07:39:52 -07:00
|
|
|
static_assert(boost::asio::is_const_buffer_sequence<ConstBufferSequence>::value,
|
2016-10-28 19:43:30 -04:00
|
|
|
"ConstBufferSequence requirements not met");
|
2017-09-07 07:39:52 -07:00
|
|
|
for(auto b : beast::detail::buffers_range(bs))
|
|
|
|
|
if(! write(reinterpret_cast<
|
|
|
|
|
std::uint8_t const*>(b.data()),
|
|
|
|
|
b.size()))
|
2017-07-20 08:01:46 -07:00
|
|
|
return false;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<class _>
|
|
|
|
|
bool
|
2017-05-10 12:03:00 -07:00
|
|
|
utf8_checker_t<_>::
|
|
|
|
|
write(std::uint8_t const* in, std::size_t size)
|
2017-07-20 08:01:46 -07:00
|
|
|
{
|
2016-10-28 19:43:30 -04:00
|
|
|
auto const valid =
|
2017-06-18 14:57:32 -07:00
|
|
|
[](std::uint8_t const*& p)
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
if(p[0] < 128)
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-06-18 14:57:32 -07:00
|
|
|
++p;
|
2016-10-28 19:43:30 -04:00
|
|
|
return true;
|
|
|
|
|
}
|
2017-10-25 23:23:59 -07:00
|
|
|
if((p[0] & 0xe0) == 0xc0)
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-10-25 23:23:59 -07:00
|
|
|
if( (p[1] & 0xc0) != 0x80 ||
|
|
|
|
|
(p[0] & 0xfe) == 0xc0) // overlong
|
2016-11-14 17:21:42 -05:00
|
|
|
return false;
|
2017-06-18 14:57:32 -07:00
|
|
|
p += 2;
|
2016-10-28 19:43:30 -04:00
|
|
|
return true;
|
|
|
|
|
}
|
2017-08-13 20:46:01 -07:00
|
|
|
if((p[0] & 0xf0) == 0xe0)
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-10-25 23:23:59 -07:00
|
|
|
if( (p[1] & 0xc0) != 0x80
|
|
|
|
|
|| (p[2] & 0xc0) != 0x80
|
|
|
|
|
|| (p[0] == 0xe0 && (p[1] & 0xe0) == 0x80) // overlong
|
|
|
|
|
|| (p[0] == 0xed && (p[1] & 0xe0) == 0xa0) // surrogate
|
|
|
|
|
//|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
|
|
|
|
|
)
|
|
|
|
|
return false;
|
2017-06-18 14:57:32 -07:00
|
|
|
p += 3;
|
2016-10-28 19:43:30 -04:00
|
|
|
return true;
|
|
|
|
|
}
|
2017-08-13 20:46:01 -07:00
|
|
|
if((p[0] & 0xf8) == 0xf0)
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-10-25 23:23:59 -07:00
|
|
|
if( (p[1] & 0xc0) != 0x80
|
|
|
|
|
|| (p[2] & 0xc0) != 0x80
|
|
|
|
|
|| (p[3] & 0xc0) != 0x80
|
|
|
|
|
|| (p[0] == 0xf0 && (p[1] & 0xf0) == 0x80) // overlong
|
|
|
|
|
|| (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
|
|
|
|
|
)
|
|
|
|
|
return false;
|
2017-06-18 14:57:32 -07:00
|
|
|
p += 4;
|
2016-10-28 19:43:30 -04:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
};
|
2017-10-25 23:23:59 -07:00
|
|
|
auto const fail_fast =
|
2016-10-28 19:43:30 -04:00
|
|
|
[&]()
|
|
|
|
|
{
|
2017-10-25 23:23:59 -07:00
|
|
|
auto const n = p_ - cp_;
|
|
|
|
|
switch(n)
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-10-25 23:23:59 -07:00
|
|
|
default:
|
|
|
|
|
BOOST_ASSERT(false);
|
|
|
|
|
BOOST_BEAST_FALLTHROUGH;
|
|
|
|
|
case 1:
|
|
|
|
|
cp_[1] = 0x81;
|
|
|
|
|
BOOST_BEAST_FALLTHROUGH;
|
|
|
|
|
case 2:
|
|
|
|
|
cp_[2] = 0x81;
|
|
|
|
|
BOOST_BEAST_FALLTHROUGH;
|
|
|
|
|
case 3:
|
|
|
|
|
cp_[3] = 0x81;
|
|
|
|
|
BOOST_BEAST_FALLTHROUGH;
|
|
|
|
|
break;
|
2016-10-28 19:43:30 -04:00
|
|
|
}
|
2017-10-25 23:23:59 -07:00
|
|
|
std::uint8_t const* p = cp_;
|
|
|
|
|
return ! valid(p);
|
2016-10-28 19:43:30 -04:00
|
|
|
};
|
|
|
|
|
auto const needed =
|
2017-06-18 14:57:32 -07:00
|
|
|
[](std::uint8_t const v)
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
if(v < 128)
|
2016-10-28 19:43:30 -04:00
|
|
|
return 1;
|
2017-08-30 17:39:24 -07:00
|
|
|
if(v < 192)
|
2016-10-28 19:43:30 -04:00
|
|
|
return 0;
|
2017-08-13 20:46:01 -07:00
|
|
|
if(v < 224)
|
2016-10-28 19:43:30 -04:00
|
|
|
return 2;
|
2017-08-13 20:46:01 -07:00
|
|
|
if(v < 240)
|
2016-10-28 19:43:30 -04:00
|
|
|
return 3;
|
2017-08-30 17:39:24 -07:00
|
|
|
if(v < 248)
|
2016-10-28 19:43:30 -04:00
|
|
|
return 4;
|
|
|
|
|
return 0;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
auto const end = in + size;
|
2017-08-13 20:46:01 -07:00
|
|
|
|
|
|
|
|
// Finish up any incomplete code point
|
|
|
|
|
if(need_ > 0)
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
// Calculate what we have
|
2016-11-08 13:03:20 -05:00
|
|
|
auto n = (std::min)(size, need_);
|
2016-10-28 19:43:30 -04:00
|
|
|
size -= n;
|
|
|
|
|
need_ -= n;
|
2017-08-13 20:46:01 -07:00
|
|
|
|
|
|
|
|
// Add characters to the code point
|
2016-10-28 19:43:30 -04:00
|
|
|
while(n--)
|
|
|
|
|
*p_++ = *in++;
|
2017-08-13 20:46:01 -07:00
|
|
|
BOOST_ASSERT(p_ <= cp_ + 5);
|
|
|
|
|
|
|
|
|
|
// Still incomplete?
|
2016-10-28 19:43:30 -04:00
|
|
|
if(need_ > 0)
|
|
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
// Incomplete code point
|
2016-10-28 19:43:30 -04:00
|
|
|
BOOST_ASSERT(in == end);
|
2017-08-13 20:46:01 -07:00
|
|
|
|
|
|
|
|
// Do partial validation on the incomplete
|
|
|
|
|
// code point, this is called "Fail fast"
|
|
|
|
|
// in Autobahn|Testsuite parlance.
|
2017-10-25 23:23:59 -07:00
|
|
|
return ! fail_fast();
|
2016-10-28 19:43:30 -04:00
|
|
|
}
|
2017-08-13 20:46:01 -07:00
|
|
|
|
|
|
|
|
// Complete code point, validate it
|
|
|
|
|
std::uint8_t const* p = &cp_[0];
|
|
|
|
|
if(! valid(p))
|
2016-10-28 19:43:30 -04:00
|
|
|
return false;
|
2017-08-13 20:46:01 -07:00
|
|
|
p_ = cp_;
|
2016-10-28 19:43:30 -04:00
|
|
|
}
|
|
|
|
|
|
2017-06-14 17:50:48 -07:00
|
|
|
if(size <= sizeof(std::size_t))
|
|
|
|
|
goto slow;
|
|
|
|
|
|
2017-08-13 20:46:01 -07:00
|
|
|
// Align `in` to sizeof(std::size_t) boundary
|
2016-10-28 19:43:30 -04:00
|
|
|
{
|
2017-06-14 17:50:48 -07:00
|
|
|
auto const in0 = in;
|
|
|
|
|
auto last = reinterpret_cast<std::uint8_t const*>(
|
|
|
|
|
((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
|
|
|
|
|
sizeof(std::size_t)) * sizeof(std::size_t));
|
2017-08-13 20:46:01 -07:00
|
|
|
|
|
|
|
|
// Check one character at a time for low-ASCII
|
2017-06-14 17:50:48 -07:00
|
|
|
while(in < last)
|
|
|
|
|
{
|
|
|
|
|
if(*in & 0x80)
|
|
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
// Not low-ASCII so switch to slow loop
|
2017-06-14 17:50:48 -07:00
|
|
|
size = size - (in - in0);
|
|
|
|
|
goto slow;
|
|
|
|
|
}
|
|
|
|
|
++in;
|
|
|
|
|
}
|
|
|
|
|
size = size - (in - in0);
|
|
|
|
|
}
|
|
|
|
|
|
2017-08-13 20:46:01 -07:00
|
|
|
// Fast loop: Process 4 or 8 low-ASCII characters at a time
|
2017-06-14 17:50:48 -07:00
|
|
|
{
|
|
|
|
|
auto const in0 = in;
|
|
|
|
|
auto last = in + size - 7;
|
2016-10-28 19:43:30 -04:00
|
|
|
auto constexpr mask = static_cast<
|
2017-06-14 17:50:48 -07:00
|
|
|
std::size_t>(0x8080808080808080 & ~std::size_t{0});
|
|
|
|
|
while(in < last)
|
|
|
|
|
{
|
2017-07-10 08:09:47 -07:00
|
|
|
#if 0
|
|
|
|
|
std::size_t temp;
|
|
|
|
|
std::memcpy(&temp, in, sizeof(temp));
|
|
|
|
|
if((temp & mask) != 0)
|
|
|
|
|
#else
|
|
|
|
|
// Technically UB but works on all known platforms
|
2017-06-14 17:50:48 -07:00
|
|
|
if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
|
2017-07-10 08:09:47 -07:00
|
|
|
#endif
|
2017-06-14 17:50:48 -07:00
|
|
|
{
|
|
|
|
|
size = size - (in - in0);
|
|
|
|
|
goto slow;
|
|
|
|
|
}
|
2016-10-28 19:43:30 -04:00
|
|
|
in += sizeof(std::size_t);
|
2017-06-14 17:50:48 -07:00
|
|
|
}
|
2017-08-13 20:46:01 -07:00
|
|
|
// There's at least one more full code point left
|
2017-06-14 17:50:48 -07:00
|
|
|
last += 4;
|
|
|
|
|
while(in < last)
|
|
|
|
|
if(! valid(in))
|
|
|
|
|
return false;
|
|
|
|
|
goto tail;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
slow:
|
2017-08-13 20:46:01 -07:00
|
|
|
// Slow loop: Full validation on one code point at a time
|
2017-06-14 17:50:48 -07:00
|
|
|
{
|
|
|
|
|
auto last = in + size - 3;
|
|
|
|
|
while(in < last)
|
|
|
|
|
if(! valid(in))
|
|
|
|
|
return false;
|
2016-10-28 19:43:30 -04:00
|
|
|
}
|
|
|
|
|
|
2017-06-14 17:50:48 -07:00
|
|
|
tail:
|
2017-08-13 20:46:01 -07:00
|
|
|
// Handle the remaining bytes. The last
|
|
|
|
|
// characters could split a code point so
|
|
|
|
|
// we save the partial code point for later.
|
|
|
|
|
//
|
|
|
|
|
// On entry to the loop, `in` points to the
|
|
|
|
|
// beginning of a code point.
|
|
|
|
|
//
|
2016-10-28 19:43:30 -04:00
|
|
|
for(;;)
|
|
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
// Number of chars left
|
2016-10-28 19:43:30 -04:00
|
|
|
auto n = end - in;
|
|
|
|
|
if(! n)
|
|
|
|
|
break;
|
2017-08-13 20:46:01 -07:00
|
|
|
|
|
|
|
|
// Chars we need to finish this code point
|
2016-10-28 19:43:30 -04:00
|
|
|
auto const need = needed(*in);
|
2017-08-13 20:46:01 -07:00
|
|
|
if(need == 0)
|
2016-10-28 19:43:30 -04:00
|
|
|
return false;
|
|
|
|
|
if(need <= n)
|
|
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
// Check a whole code point
|
2016-10-28 19:43:30 -04:00
|
|
|
if(! valid(in))
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
2017-08-13 20:46:01 -07:00
|
|
|
// Calculate how many chars we need
|
|
|
|
|
// to finish this partial code point
|
2016-10-28 19:43:30 -04:00
|
|
|
need_ = need - n;
|
2017-08-13 20:46:01 -07:00
|
|
|
|
|
|
|
|
// Save the partial code point
|
2016-10-28 19:43:30 -04:00
|
|
|
while(n--)
|
|
|
|
|
*p_++ = *in++;
|
2017-08-13 20:46:01 -07:00
|
|
|
BOOST_ASSERT(in == end);
|
|
|
|
|
BOOST_ASSERT(p_ <= cp_ + 5);
|
|
|
|
|
|
|
|
|
|
// Do partial validation on the incomplete
|
|
|
|
|
// code point, this is called "Fail fast"
|
|
|
|
|
// in Autobahn|Testsuite parlance.
|
2017-10-25 23:23:59 -07:00
|
|
|
return ! fail_fast();
|
2016-10-28 19:43:30 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return true;
|
2017-07-20 08:01:46 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Convenience alias: the checker with its default template argument
using utf8_checker = utf8_checker_t<>;
|
|
|
|
|
|
|
|
|
|
template<class = void>
|
|
|
|
|
bool
|
|
|
|
|
check_utf8(char const* p, std::size_t n)
|
|
|
|
|
{
|
|
|
|
|
utf8_checker c;
|
2016-10-28 19:43:30 -04:00
|
|
|
if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
|
2017-07-20 08:01:46 -07:00
|
|
|
return false;
|
|
|
|
|
return c.finish();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // detail
|
|
|
|
|
} // websocket
|
|
|
|
|
} // beast
|
2017-07-20 13:40:34 -07:00
|
|
|
} // boost
|
2017-07-20 08:01:46 -07:00
|
|
|
|
|
|
|
|
#endif
|