Add checks for invalid UTF-8 sequences, see: https://svn.boost.org/trac/boost/ticket/9473

This commit is contained in:
jzmaddock
2013-12-19 10:45:50 +00:00
parent 0983ff065e
commit f1aa75af2c
5 changed files with 56 additions and 2 deletions

View File

@ -26,6 +26,7 @@ this gets bumped up from v4 to v5.
* Fixed bug in collation code that failed if the locale generated collation strings with embedded nul's,
see [@https://svn.boost.org/trac/boost/ticket/9451 #9451].
* Apply patch for unusual thread usage (no statically initialized mutexes), see [@https://svn.boost.org/trac/boost/ticket/9461 #9461].
* Added better checks for invalid UTF-8 sequences, see [@https://svn.boost.org/trac/boost/ticket/9473 #9473].
[h4 Boost-1.54]

View File

@ -59,6 +59,13 @@
Fixed bug in collation code that failed if the locale generated collation
strings with embedded nul's, see <a href="https://svn.boost.org/trac/boost/ticket/9451" target="_top">#9451</a>.
</li>
<li class="listitem">
Apply patch for unusual thread usage (no statically initialized mutexes),
see <a href="https://svn.boost.org/trac/boost/ticket/9461" target="_top">#9461</a>.
</li>
<li class="listitem">
Added better checks for invalid UTF-8 sequences, see <a href="https://svn.boost.org/trac/boost/ticket/9473" target="_top">#9473</a>.
</li>
</ul></div>
<h5>
<a name="boost_regex.background_information.history.h1"></a>

View File

@ -198,7 +198,7 @@
</p>
</div>
<table xmlns:rev="http://www.cs.rpi.edu/~gregod/boost/tools/doc/revision" width="100%"><tr>
<td align="left"><p><small>Last revised: December 18, 2013 at 17:16:00 GMT</small></p></td>
<td align="left"><p><small>Last revised: December 19, 2013 at 10:47:27 GMT</small></p></td>
<td align="right"><div class="copyright-footer"></div></td>
</tr></table>
<hr>

View File

@ -629,9 +629,15 @@ private:
0x1FFFFFu,
};
m_value &= masks[extra];
// check the result:
// check the result is in range:
if(m_value > static_cast<U32Type>(0x10FFFFu))
invalid_sequence();
// The result must not be a surrogate:
if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
invalid_sequence();
// We should not have had an invalidly encoded UTF8 sequence:
if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
invalid_sequence();
}
BaseIterator m_position;
mutable U32Type m_value;

View File

@ -108,6 +108,46 @@ void spot_checks()
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq3, bad_seq3, bad_seq3 + 5), boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq3 + 5, bad_seq3, bad_seq3 + 5)), std::out_of_range);
boost::uint8_t bad_seq4[5] = { '.', '*', 0xf6, '.', '*' };
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq4, bad_seq4, bad_seq4 + 5), boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq4 + 5, bad_seq4, bad_seq4 + 5)), std::out_of_range);
// Invalid sequences containing surrogate pairs:
const char* invalid_pseq = "\xed\xa0\x80"; // single lowest lead surrogate U+D800
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xed\xb0\x80"; // single lowest trail surrogate U+DC00
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xed\xb0\x80"; // single lowest trail surrogate U+DC00
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xed\xbf\xbf"; // single highest trail surrogate U+DFFF
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
// overlong encodings (created by left-padding with zero bits)
invalid_pseq = "\xc0\x80"; // illegal 2-byte encoding of 1-byte character U+0000
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xe0\x80\x80"; // illegal 3-byte encoding of 1-byte character U+0000
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xf0\x80\x80\x80"; // illegal 4-byte encoding of 1-byte character U+0000
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xc1\xbf"; // illegal 2-byte encoding of 1-byte character U+007F
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xe0\x81\xbf"; // illegal 3-byte encoding of 1-byte character U+007F
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xf0\x80\x81\xbf"; // illegal 4-byte encoding of 1-byte character U+007F
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xe0\x82\x80"; // illegal 3-byte encoding of 2-byte character U+0080
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xf0\x80\x82\x80"; // illegal 4-byte encoding of 2-byte character U+0080
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xe0\x9f\xbf"; // illegal 3-byte encoding of 2-byte character U+07FF
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xf0\x80\x9f\xbf"; // illegal 4-byte encoding of 2-byte character U+07FF
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xf0\x80\xa0\x80"; // illegal 4-byte encoding of 3-byte character U+0800
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
invalid_pseq = "\xf0\x8f\xbf\xbf"; // illegal 4-byte encoding of 3-byte character U+FFFF
BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const char*>(invalid_pseq, invalid_pseq, invalid_pseq + std::strlen(invalid_pseq)), boost::u8_to_u32_iterator<const char*>(invalid_pseq + std::strlen(invalid_pseq), invalid_pseq, invalid_pseq + std::strlen(invalid_pseq))), std::out_of_range);
}
void test(const std::vector< ::boost::uint32_t>& v)