diff --git a/doc/history.qbk b/doc/history.qbk
index b3818c8f..4f2e012d 100644
--- a/doc/history.qbk
+++ b/doc/history.qbk
@@ -26,6 +26,7 @@ this gets bumped up from v4 to v5.
* Fixed bug in collation code that failed if the locale generated collation strings with embedded nul's,
see [@https://svn.boost.org/trac/boost/ticket/9451 #9451].
* Apply patch for unusual thread usage (no statically initiallized mutexes), see [@https://svn.boost.org/trac/boost/ticket/9461 #9461].
+* Added better checks for invalid UTF-8 sequences, see [@https://svn.boost.org/trac/boost/ticket/9473 #9473].
[h4 Boost-1.54]
diff --git a/doc/html/boost_regex/background_information/history.html b/doc/html/boost_regex/background_information/history.html
index 986d5019..b40b73ef 100644
--- a/doc/html/boost_regex/background_information/history.html
+++ b/doc/html/boost_regex/background_information/history.html
@@ -59,6 +59,13 @@
Fixed bug in collation code that failed if the locale generated collation
strings with embedded nul's, see #9451.
+
+ Apply patch for unusual thread usage (no statically initialized mutexes),
+ see #9461.
+
+
+ Added better checks for invalid UTF-8 sequences, see #9473.
+
diff --git a/doc/html/index.html b/doc/html/index.html
index 1bcf8ef9..19acfe92 100644
--- a/doc/html/index.html
+++ b/doc/html/index.html
@@ -198,7 +198,7 @@
-Last revised: December 18, 2013 at 17:16:00 GMT |
+Last revised: December 19, 2013 at 10:47:27 GMT |
|
diff --git a/include/boost/regex/pending/unicode_iterator.hpp b/include/boost/regex/pending/unicode_iterator.hpp
index 89afdd9d..b84cfa07 100644
--- a/include/boost/regex/pending/unicode_iterator.hpp
+++ b/include/boost/regex/pending/unicode_iterator.hpp
@@ -629,9 +629,15 @@ private:
0x1FFFFFu,
};
m_value &= masks[extra];
- // check the result:
+ // check the result is in range:
if(m_value > static_cast(0x10FFFFu))
invalid_sequence();
+ // The result must not be a surrogate:
+ if((m_value >= static_cast(0xD800)) && (m_value <= static_cast(0xDFFF)))
+ invalid_sequence();
+ // We should not have had an invalidly encoded UTF8 sequence:
+ if((extra > 0) && (m_value <= static_cast(masks[extra - 1])))
+ invalid_sequence();
}
BaseIterator m_position;
mutable U32Type m_value;
diff --git a/test/unicode/unicode_iterator_test.cpp b/test/unicode/unicode_iterator_test.cpp
index d6156817..206b748b 100644
--- a/test/unicode/unicode_iterator_test.cpp
+++ b/test/unicode/unicode_iterator_test.cpp
@@ -108,6 +108,46 @@ void spot_checks()
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(bad_seq3, bad_seq3, bad_seq3 + 5), boost::u8_to_u32_iterator(bad_seq3 + 5, bad_seq3, bad_seq3 + 5)), std::out_of_range);
boost::uint8_t bad_seq4[5] = { '.', '*', 0xf6, '.', '*' };
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(bad_seq4, bad_seq4, bad_seq4 + 5), boost::u8_to_u32_iterator(bad_seq4 + 5, bad_seq4, bad_seq4 + 5)), std::out_of_range);
+
+ // Invalid sequences containing surrogate pairs:
+ const char* invalid_pseq = "\xed\xa0\x80"; // single lowest lead surrogate U+D800
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xed\xb0\x80"; // single lowest trail surrogate U+DC00
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xed\xaf\xbf"; // single highest lead surrogate U+DBFF (previously a duplicate of the U+DC00 case above)
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xed\xbf\xbf"; // single highest trail surrogate U+DFFF
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+
+ // overlong encodings (created by left-padding with zero bits)
+ invalid_pseq = "\xc0\x80"; // illegal 2-byte encoding of 1-byte character U+0000
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xe0\x80\x80"; // illegal 3-byte encoding of 1-byte character U+0000
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xf0\x80\x80\x80"; // illegal 4-byte encoding of 1-byte character U+0000
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+
+ invalid_pseq = "\xc1\xbf"; // illegal 2-byte encoding of 1-byte character U+007F
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xe0\x81\xbf"; // illegal 3-byte encoding of 1-byte character U+007F
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xf0\x80\x81\xbf"; // illegal 4-byte encoding of 1-byte character U+007F
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+
+ invalid_pseq = "\xe0\x82\x80"; // illegal 3-byte encoding of 2-byte character U+0080
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xf0\x80\x82\x80"; // illegal 4-byte encoding of 2-byte character U+0080
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+
+ invalid_pseq = "\xe0\x9f\xbf"; // illegal 3-byte encoding of 2-byte character U+07FF
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xf0\x80\x9f\xbf"; // illegal 4-byte encoding of 2-byte character U+07FF
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+
+ invalid_pseq = "\xf0\x80\xa0\x80"; // illegal 4-byte encoding of 3-byte character U+0800
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
+ invalid_pseq = "\xf0\x8f\xbf\xbf"; // illegal 4-byte encoding of 3-byte character U+FFFF
+ BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
}
void test(const std::vector< ::boost::uint32_t>& v)