From bc8cd9e148342d0086333a6bb728185ce92129e8 Mon Sep 17 00:00:00 2001 From: John Maddock Date: Wed, 28 Nov 2012 17:57:26 +0000 Subject: [PATCH] Add further error checking to UTF-8 decoding. Fixes #7744. [SVN r81614] --- .../boost/regex/pending/unicode_iterator.hpp | 24 +++++++++++++++++-- test/Jamfile.v2 | 4 ++-- test/unicode/unicode_iterator_test.cpp | 5 ++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/include/boost/regex/pending/unicode_iterator.hpp b/include/boost/regex/pending/unicode_iterator.hpp index e6399b50..3a7b68fb 100644 --- a/include/boost/regex/pending/unicode_iterator.hpp +++ b/include/boost/regex/pending/unicode_iterator.hpp @@ -520,9 +520,26 @@ public: } void increment() { + // We must not start with a continuation character: + if((static_cast(*m_position) & 0xC0) == 0x80) + invalid_sequence(); // skip high surrogate first if there is one: unsigned c = detail::utf8_byte_count(*m_position); - std::advance(m_position, c); + if(m_value == pending_read) + { + // Since we haven't read in a value, we need to validate the code points: + for(unsigned i = 0; i < c; ++i) + { + ++m_position; + // We must have a continuation byte: + if((i != c - 1) && ((static_cast(*m_position) & 0xC0) != 0x80)) + invalid_sequence(); + } + } + else + { + std::advance(m_position, c); + } m_value = pending_read; } void decrement() @@ -589,7 +606,7 @@ private: // we must not have a continuation character: if((m_value & 0xC0u) == 0x80u) invalid_sequence(); - // see how many extra byts we have: + // see how many extra bytes we have: unsigned extra = detail::utf8_trailing_byte_count(*m_position); // extract the extra bits, 6 from each extra byte: BaseIterator next(m_position); @@ -597,6 +614,9 @@ private: { ++next; m_value <<= 6; + // We must have a continuation byte: + if((static_cast(*next) & 0xC0) != 0x80) + invalid_sequence(); m_value += static_cast(*next) & 0x3Fu; } // we now need to remove a few of the leftmost bits, but how many depends diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index a7b689f7..0bf23c83 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -124,8 +124,8 @@ test-suite regex ../build//boost_regex ] - [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : TEST_UTF8 : unicode_iterator_test_utf8 ] - [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : TEST_UTF16 : unicode_iterator_test_utf16 ] + [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : release TEST_UTF8 : unicode_iterator_test_utf8 ] + [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : release TEST_UTF16 : unicode_iterator_test_utf16 ] [ run static_mutex/static_mutex_test.cpp ../../thread/build//boost_thread ../build//boost_regex ] diff --git a/test/unicode/unicode_iterator_test.cpp b/test/unicode/unicode_iterator_test.cpp index a6facb2c..d6156817 100644 --- a/test/unicode/unicode_iterator_test.cpp +++ b/test/unicode/unicode_iterator_test.cpp @@ -103,6 +103,11 @@ void spot_checks() BOOST_CHECK_THROW(boost::u16_to_u32_iterator(bad_seq2, bad_seq2, bad_seq2 + 5), std::out_of_range); BOOST_CHECK_THROW(boost::u16_to_u32_iterator(bad_seq2 + 1, bad_seq2 + 1, bad_seq2 + 6), std::out_of_range); BOOST_CHECK_THROW(boost::u16_to_u32_iterator(bad_seq2 + 1, bad_seq2, bad_seq2 + 6), std::out_of_range); + + boost::uint8_t bad_seq3[5] = { '.', '*', 0xe4, '.', '*' }; + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(bad_seq3, bad_seq3, bad_seq3 + 5), boost::u8_to_u32_iterator(bad_seq3 + 5, bad_seq3, bad_seq3 + 5)), std::out_of_range); + boost::uint8_t bad_seq4[5] = { '.', '*', 0xf6, '.', '*' }; + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(bad_seq4, bad_seq4, bad_seq4 + 5), boost::u8_to_u32_iterator(bad_seq4 + 5, bad_seq4, bad_seq4 + 5)), std::out_of_range); } void test(const std::vector< ::boost::uint32_t>& v)