From f1aa75af2c6a3a6beb965da4e7badcff867a5694 Mon Sep 17 00:00:00 2001 From: jzmaddock Date: Thu, 19 Dec 2013 10:45:50 +0000 Subject: [PATCH] Add checks for invalid UTF-8 sequences, see: https://svn.boost.org/trac/boost/ticket/9473 --- doc/history.qbk | 1 + .../background_information/history.html | 7 ++++ doc/html/index.html | 2 +- .../boost/regex/pending/unicode_iterator.hpp | 8 +++- test/unicode/unicode_iterator_test.cpp | 40 +++++++++++++++++++ 5 files changed, 56 insertions(+), 2 deletions(-) diff --git a/doc/history.qbk b/doc/history.qbk index b3818c8f..4f2e012d 100644 --- a/doc/history.qbk +++ b/doc/history.qbk @@ -26,6 +26,7 @@ this gets bumped up from v4 to v5. * Fixed bug in collation code that failed if the locale generated collation strings with embedded nul's, see [@https://svn.boost.org/trac/boost/ticket/9451 #9451]. * Apply patch for unusual thread usage (no statically initiallized mutexes), see [@https://svn.boost.org/trac/boost/ticket/9461 #9461]. +* Added better checks for invalid UTF-8 sequences, see [@https://svn.boost.org/trac/boost/ticket/9473 #9473]. [h4 Boost-1.54] diff --git a/doc/html/boost_regex/background_information/history.html b/doc/html/boost_regex/background_information/history.html index 986d5019..b40b73ef 100644 --- a/doc/html/boost_regex/background_information/history.html +++ b/doc/html/boost_regex/background_information/history.html @@ -59,6 +59,13 @@ Fixed bug in collation code that failed if the locale generated collation strings with embedded nul's, see #9451. +
  • + Apply patch for unusual thread usage (no statically initiallized mutexes), + see #9461. +
  • +
  • + Added better checks for invalid UTF-8 sequences, see #9473. +
  • diff --git a/doc/html/index.html b/doc/html/index.html index 1bcf8ef9..19acfe92 100644 --- a/doc/html/index.html +++ b/doc/html/index.html @@ -198,7 +198,7 @@

    - +

    Last revised: December 18, 2013 at 17:16:00 GMT

    Last revised: December 19, 2013 at 10:47:27 GMT


    diff --git a/include/boost/regex/pending/unicode_iterator.hpp b/include/boost/regex/pending/unicode_iterator.hpp index 89afdd9d..b84cfa07 100644 --- a/include/boost/regex/pending/unicode_iterator.hpp +++ b/include/boost/regex/pending/unicode_iterator.hpp @@ -629,9 +629,15 @@ private: 0x1FFFFFu, }; m_value &= masks[extra]; - // check the result: + // check the result is in range: if(m_value > static_cast(0x10FFFFu)) invalid_sequence(); + // The result must not be a surrogate: + if((m_value >= static_cast(0xD800)) && (m_value <= static_cast(0xDFFF))) + invalid_sequence(); + // We should not have had an invalidly encoded UTF8 sequence: + if((extra > 0) && (m_value <= static_cast(masks[extra - 1]))) + invalid_sequence(); } BaseIterator m_position; mutable U32Type m_value; diff --git a/test/unicode/unicode_iterator_test.cpp b/test/unicode/unicode_iterator_test.cpp index d6156817..206b748b 100644 --- a/test/unicode/unicode_iterator_test.cpp +++ b/test/unicode/unicode_iterator_test.cpp @@ -108,6 +108,46 @@ void spot_checks() BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(bad_seq3, bad_seq3, bad_seq3 + 5), boost::u8_to_u32_iterator(bad_seq3 + 5, bad_seq3, bad_seq3 + 5)), std::out_of_range); boost::uint8_t bad_seq4[5] = { '.', '*', 0xf6, '.', '*' }; BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(bad_seq4, bad_seq4, bad_seq4 + 5), boost::u8_to_u32_iterator(bad_seq4 + 5, bad_seq4, bad_seq4 + 5)), std::out_of_range); + + // Invalid sequences containing surrogate pairs: + const char* invalid_pseq = "\xed\xa0\x80"; // single lowest lead surrogate U+D800 + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xed\xb0\x80"; // single lowest trail surrogate U+DC00 + 
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xed\xaf\xbf"; // single highest lead surrogate U+DBFF + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xed\xbf\xbf"; // single highest trail surrogate U+DFFF + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + + // overlong encodings (created by left-padding with zero bits) + invalid_pseq = "\xc0\x80"; // illegal 2-byte encoding of 1-byte character U+0000 + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xe0\x80\x80"; // illegal 3-byte encoding of 1-byte character U+0000 + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xf0\x80\x80\x80"; // illegal 4-byte encoding of 1-byte character U+0000 + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), 
boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + + invalid_pseq = "\xc1\xbf"; // illegal 2-byte encoding of 1-byte character U+007F + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xe0\x81\xbf"; // illegal 3-byte encoding of 1-byte character U+007F + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xf0\x80\x81\xbf"; // illegal 4-byte encoding of 1-byte character U+007F + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + + invalid_pseq = "\xe0\x82\x80"; // illegal 3-byte encoding of 2-byte character U+0080 + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xf0\x80\x82\x80"; // illegal 4-byte encoding of 2-byte character U+0080 + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + + invalid_pseq = 
"\xe0\x9f\xbf"; // illegal 3-byte encoding of 2-byte character U+07FF + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xf0\x80\x9f\xbf"; // illegal 4-byte encoding of 2-byte character U+07FF + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + + invalid_pseq = "\xf0\x80\xa0\x80"; // illegal 4-byte encoding of 3-byte character U+0800 + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); + invalid_pseq = "\xf0\x8f\xbf\xbf"; // illegal 4-byte encoding of 3-byte character U+FFFF + BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range); } void test(const std::vector< ::boost::uint32_t>& v)