From 1b5dd57cf6335a2a53357dbba5258b6d0066e5fe Mon Sep 17 00:00:00 2001 From: John Maddock Date: Wed, 18 Oct 2006 12:56:45 +0000 Subject: [PATCH] Update heuristics used to determine max state count, following a bug report from SAP. Updated tests to match. [SVN r35656] --- include/boost/regex/v4/perl_matcher.hpp | 4 +- .../boost/regex/v4/perl_matcher_common.hpp | 68 ++++++++++++++++--- test/pathology/bad_expression_test.cpp | 6 ++ 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/include/boost/regex/v4/perl_matcher.hpp b/include/boost/regex/v4/perl_matcher.hpp index 4f85fba7..99b3a9a8 100644 --- a/include/boost/regex/v4/perl_matcher.hpp +++ b/include/boost/regex/v4/perl_matcher.hpp @@ -427,9 +427,9 @@ private: // matching flags in use: match_flag_type m_match_flags; // how many states we have examined so far: - difference_type state_count; + boost::uintmax_t state_count; // max number of states to examine before giving up: - difference_type max_state_count; + boost::uintmax_t max_state_count; // whether we should ignore case or not: bool icase; // set to true when (position == last), indicates that we may have a partial match: diff --git a/include/boost/regex/v4/perl_matcher_common.hpp b/include/boost/regex/v4/perl_matcher_common.hpp index d43422b0..1c222b87 100644 --- a/include/boost/regex/v4/perl_matcher_common.hpp +++ b/include/boost/regex/v4/perl_matcher_common.hpp @@ -77,15 +77,67 @@ void perl_matcher::construct_init(const basic_r template void perl_matcher::estimate_max_state_count(std::random_access_iterator_tag*) { - static const difference_type k = 100000; - difference_type dist = boost::re_detail::distance(base, last); - traits_size_type states = static_cast(re.size()); + // + // How many states should we allow our machine to visit before giving up? + // This is a heuristic: it takes the greater of O(N^2) and O(NS^2) + // where N is the length of the string, and S is the number of states + // in the machine. 
It's tempting to up this to O(N^2S) or even O(N^2S^2) + // but these take unreasonable amounts of time to bale out in pathological + // cases. + // + // Calculate NS^2 first: + // + static const boost::uintmax_t k = 100000; + boost::uintmax_t dist = boost::re_detail::distance(base, last); + if(dist == 0) + dist = 1; + boost::uintmax_t states = re.size(); + if(states == 0) + states = 1; states *= states; - difference_type lim = ((std::numeric_limits::max)() - k) / states; - if(dist >= lim) - max_state_count = (std::numeric_limits::max)(); - else - max_state_count = k + states * dist; + if((std::numeric_limits::max)() / dist < states) + { + max_state_count = (std::numeric_limits::max)() - 2; + return; + } + states *= dist; + if((std::numeric_limits::max)() - k < states) + { + max_state_count = (std::numeric_limits::max)() - 2; + return; + } + states += k; + + max_state_count = states; + + // + // Now calculate N^2: + // + states = dist; + if((std::numeric_limits::max)() / dist < states) + { + max_state_count = (std::numeric_limits::max)() - 2; + return; + } + states *= dist; + if((std::numeric_limits::max)() - k < states) + { + max_state_count = (std::numeric_limits::max)() - 2; + return; + } + states += k; + // + // N^2 can be a very large number indeed, to prevent things getting out + // of control, cap the max states: + // + if(states > BOOST_REGEX_MAX_STATE_COUNT) + states = BOOST_REGEX_MAX_STATE_COUNT; + // + // If (the possibly capped) N^2 is larger than our first estimate, + // use this instead: + // + if(states > max_state_count) + max_state_count = states; } template diff --git a/test/pathology/bad_expression_test.cpp b/test/pathology/bad_expression_test.cpp index 05cc5b12..308df422 100644 --- a/test/pathology/bad_expression_test.cpp +++ b/test/pathology/bad_expression_test.cpp @@ -44,6 +44,12 @@ int test_main( int , char* [] ) BOOST_CHECK_THROW(boost::regex_search(bad_text, what, e2), std::runtime_error); BOOST_CHECK(boost::regex_search(good_text, what, 
e2)); + bad_text.assign((std::string::size_type)500000, 'a'); + e2.assign("aaa*@"); + BOOST_CHECK_THROW(0 == boost::regex_search(bad_text, what, e2), std::runtime_error); + good_text.assign((std::string::size_type)5000, 'a'); + BOOST_CHECK(0 == boost::regex_search(good_text, what, e2)); + return 0; }