From bc8cd9e148342d0086333a6bb728185ce92129e8 Mon Sep 17 00:00:00 2001
From: John Maddock <john@johnmaddock.co.uk>
Date: Wed, 28 Nov 2012 17:57:26 +0000
Subject: [PATCH] Add further error checking to UTF-8 decoding. Fixes #7744.

[SVN r81614]
---
 .../boost/regex/pending/unicode_iterator.hpp  | 24 +++++++++++++++++--
 test/Jamfile.v2                               |  4 ++--
 test/unicode/unicode_iterator_test.cpp        |  5 ++++
 3 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/include/boost/regex/pending/unicode_iterator.hpp b/include/boost/regex/pending/unicode_iterator.hpp
index e6399b50..3a7b68fb 100644
--- a/include/boost/regex/pending/unicode_iterator.hpp
+++ b/include/boost/regex/pending/unicode_iterator.hpp
@@ -520,9 +520,26 @@ public:
    }
    void increment()
    {
+      // The lead byte must not itself be a continuation byte (10xxxxxx):
+      if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
+         invalid_sequence();
       // skip high surrogate first if there is one:
       unsigned c = detail::utf8_byte_count(*m_position);
-      std::advance(m_position, c);
+      if(m_value == pending_read)
+      {
+         // No value has been decoded at this position, so validate the sequence while stepping over it:
+         for(unsigned i = 0; i < c; ++i)
+         {
+            ++m_position;
+            // Every byte before the one just past the sequence must be a continuation byte (10xxxxxx):
+            if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
+               invalid_sequence();
+         }
+      }
+      else
+      {
+         std::advance(m_position, c);
+      }
       m_value = pending_read;
    }
    void decrement()
@@ -589,7 +606,7 @@ private:
       // we must not have a continuation character:
       if((m_value & 0xC0u) == 0x80u)
          invalid_sequence();
-      // see how many extra byts we have:
+      // see how many extra bytes we have:
       unsigned extra = detail::utf8_trailing_byte_count(*m_position);
       // extract the extra bits, 6 from each extra byte:
       BaseIterator next(m_position);
@@ -597,6 +614,9 @@ private:
       {
          ++next;
          m_value <<= 6;
+         // We must have a continuation byte:
+         if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
+            invalid_sequence();
          m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
       }
       // we now need to remove a few of the leftmost bits, but how many depends
diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index a7b689f7..0bf23c83 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -124,8 +124,8 @@ test-suite regex
             ../build//boost_regex
       ]
       
-      [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : <define>TEST_UTF8 : unicode_iterator_test_utf8 ]
-      [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : <define>TEST_UTF16 : unicode_iterator_test_utf16 ]
+      [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : release <define>TEST_UTF8 : unicode_iterator_test_utf8 ]
+      [ run unicode/unicode_iterator_test.cpp ../build//boost_regex : : : release <define>TEST_UTF16 : unicode_iterator_test_utf16 ]
       [ run static_mutex/static_mutex_test.cpp
             ../../thread/build//boost_thread ../build//boost_regex
       ]
diff --git a/test/unicode/unicode_iterator_test.cpp b/test/unicode/unicode_iterator_test.cpp
index a6facb2c..d6156817 100644
--- a/test/unicode/unicode_iterator_test.cpp
+++ b/test/unicode/unicode_iterator_test.cpp
@@ -103,6 +103,11 @@ void spot_checks()
    BOOST_CHECK_THROW(boost::u16_to_u32_iterator<const boost::uint16_t*>(bad_seq2, bad_seq2, bad_seq2 + 5), std::out_of_range);
    BOOST_CHECK_THROW(boost::u16_to_u32_iterator<const boost::uint16_t*>(bad_seq2 + 1, bad_seq2 + 1, bad_seq2 + 6), std::out_of_range);
    BOOST_CHECK_THROW(boost::u16_to_u32_iterator<const boost::uint16_t*>(bad_seq2 + 1, bad_seq2, bad_seq2 + 6), std::out_of_range);
+
+   boost::uint8_t bad_seq3[5] = { '.', '*', 0xe4, '.', '*' };
+   BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq3, bad_seq3, bad_seq3 + 5), boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq3 + 5, bad_seq3, bad_seq3 + 5)), std::out_of_range);
+   boost::uint8_t bad_seq4[5] = { '.', '*', 0xf6, '.', '*' };
+   BOOST_CHECK_THROW(iterate_over(boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq4, bad_seq4, bad_seq4 + 5), boost::u8_to_u32_iterator<const boost::uint8_t*>(bad_seq4 + 5, bad_seq4, bad_seq4 + 5)), std::out_of_range);
 }
 
 void test(const std::vector< ::boost::uint32_t>& v)