From 4bb4d392e4e5bdbf454ba1ae86f79d9b5ca3bc75 Mon Sep 17 00:00:00 2001 From: jzmaddock Date: Sun, 19 Jan 2020 11:28:36 +0000 Subject: [PATCH] Remove limit on the number of backrefs possible. Changes named sub-expressions to use a different hashing scheme: the high order bit is now always set, so clashes between hashes and indexes don't happen until 2^30 or 2^62 sub-expressions in 32 and 64 bit code respectively. Changes bitmask of seen sub-expressions to use dynamic storage for sub-expression indexes above 64. Adds tests for the above. Fixes https://github.com/boostorg/regex/issues/75. --- include/boost/regex/v4/basic_regex.hpp | 7 +-- .../boost/regex/v4/basic_regex_creator.hpp | 10 ++-- include/boost/regex/v4/basic_regex_parser.hpp | 11 ++-- include/boost/regex/v4/indexed_bit_flag.hpp | 54 +++++++++++++++++++ .../boost/regex/v4/perl_matcher_common.hpp | 6 +-- test/regress/main.cpp | 8 +-- test/regress/test_backrefs.cpp | 5 ++ 7 files changed, 81 insertions(+), 20 deletions(-) create mode 100644 include/boost/regex/v4/indexed_bit_flag.hpp diff --git a/include/boost/regex/v4/basic_regex.hpp b/include/boost/regex/v4/basic_regex.hpp index b3bb1fe0..4e166afc 100644 --- a/include/boost/regex/v4/basic_regex.hpp +++ b/include/boost/regex/v4/basic_regex.hpp @@ -70,13 +70,14 @@ void bubble_down_one(I first, I last) } } +static const int hash_value_mask = 1 << (std::numeric_limits::digits - 1); + template inline int hash_value_from_capture_name(Iterator i, Iterator j) { std::size_t r = boost::hash_range(i, j); - r %= ((std::numeric_limits::max)() - 10001); - r += 10000; - return static_cast(r); + r %= ((std::numeric_limits::max)()); + return static_cast(r) | hash_value_mask; } class named_subexpressions diff --git a/include/boost/regex/v4/basic_regex_creator.hpp b/include/boost/regex/v4/basic_regex_creator.hpp index 7c006527..fe32533a 100644 --- a/include/boost/regex/v4/basic_regex_creator.hpp +++ b/include/boost/regex/v4/basic_regex_creator.hpp @@ -20,6 +20,8 @@ #ifndef 
BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP #define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP +#include + #ifdef BOOST_MSVC #pragma warning(push) #pragma warning(disable: 4103) @@ -239,7 +241,7 @@ protected: bool m_icase; // true for case insensitive matches unsigned m_repeater_id; // the state_id of the next repeater bool m_has_backrefs; // true if there are actually any backrefs - unsigned m_backrefs; // bitmask of permitted backrefs + indexed_bit_flag m_backrefs; // bitmask of permitted backrefs boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for; bool m_has_recursions; // set when we have recursive expresisons to fixup std::vector m_recursion_checks; // notes which recursions we've followed while analysing this expression @@ -267,7 +269,7 @@ private: template basic_regex_creator::basic_regex_creator(regex_data* data) - : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0), m_has_recursions(false) + : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_has_recursions(false) { m_pdata->m_data.clear(); m_pdata->m_status = ::boost::regex_constants::error_ok; @@ -763,7 +765,7 @@ void basic_regex_creator::fixup_recursions(re_syntax_base* state) if(idx < 0) { idx = -idx-1; - if(idx >= 10000) + if(idx >= hash_value_mask) { idx = m_pdata->get_id(idx); if(idx <= 0) @@ -795,7 +797,7 @@ void basic_regex_creator::fixup_recursions(re_syntax_base* state) bool ok = false; re_syntax_base* p = base; std::ptrdiff_t idx = static_cast(state)->alt.i; - if(idx > 10000) + if(idx >= hash_value_mask) { // // There may be more than one capture group with this hash, just do what Perl diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index 85b43eaf..13ff181b 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -545,8 +545,8 @@ bool 
basic_regex_parser::parse_open_paren() // // allow backrefs to this mark: // - if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT)) - this->m_backrefs |= 1u << (markid - 1); + if(markid > 0) + this->m_backrefs.set(markid); return true; } @@ -912,7 +912,7 @@ escape_type_class_jump: } if(negative) i = 1 + m_mark_count - i; - if(((i > 0) && (i < std::numeric_limits::digits) && (i - 1 < static_cast(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_pdata->get_id(i)-1 < static_cast(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1))))) + if(((i > 0) && (this->m_backrefs.test(i)) || ((i >= hash_value_mask) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs.test(this->m_pdata->get_id(i)))))) { m_position = pc; re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); @@ -1944,7 +1944,7 @@ bool basic_regex_parser::parse_backref() charT c = unescape_character(); this->append_literal(c); } - else if((i > 0) && (this->m_backrefs & (1u << (i-1)))) + else if((i > 0) && (this->m_backrefs.test(i))) { m_position = pc; re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); @@ -2718,8 +2718,7 @@ option_group_jump: // // allow backrefs to this mark: // - if(markid < (int)(sizeof(unsigned) * CHAR_BIT)) - this->m_backrefs |= 1u << (markid - 1); + this->m_backrefs.set(markid); } return true; } diff --git a/include/boost/regex/v4/indexed_bit_flag.hpp b/include/boost/regex/v4/indexed_bit_flag.hpp new file mode 100644 index 00000000..c9d32c59 --- /dev/null +++ b/include/boost/regex/v4/indexed_bit_flag.hpp @@ -0,0 +1,54 @@ +/* + * + * Copyright (c) 2020 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. 
(See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE indexed_bit_flag.hpp + * VERSION see + * DESCRIPTION: Declares class indexed_bit_flag. + */ + +#include +#include + +#ifndef BOOST_REGEX_V4_INDEXED_BIT_FLAG_HPP +#define BOOST_REGEX_V4_INDEXED_BIT_FLAG_HPP + +namespace boost{ +namespace BOOST_REGEX_DETAIL_NS{ + +class indexed_bit_flag +{ + boost::uint64_t low_mask; + std::set mask_set; +public: + indexed_bit_flag() : low_mask(0) {} + void set(std::size_t i) + { + if (i < std::numeric_limits::digits - 1) + low_mask |= static_cast(1u) << i; + else + mask_set.insert(i); + } + bool test(std::size_t i) + { + if (i < std::numeric_limits::digits - 1) + return low_mask & static_cast(1u) << i ? true : false; + else + return mask_set.find(i) != mask_set.end(); + } +}; + +} // namespace BOOST_REGEX_DETAIL_NS +} // namespace boost + + +#endif diff --git a/include/boost/regex/v4/perl_matcher_common.hpp b/include/boost/regex/v4/perl_matcher_common.hpp index 3c654e58..d8439130 100644 --- a/include/boost/regex/v4/perl_matcher_common.hpp +++ b/include/boost/regex/v4/perl_matcher_common.hpp @@ -609,7 +609,7 @@ bool perl_matcher::match_backref() // or PCRE. // int index = static_cast(pstate)->index; - if(index >= 10000) + if(index >= hash_value_mask) { named_subexpressions::range_type r = re.get_data().equal_range(index); BOOST_ASSERT(r.first != r.second); @@ -758,7 +758,7 @@ inline bool perl_matcher::match_assert_backref( { // Have we matched subexpression "index"? // Check if index is a hash value: - if(index >= 10000) + if(index >= hash_value_mask) { named_subexpressions::range_type r = re.get_data().equal_range(index); while(r.first != r.second) @@ -782,7 +782,7 @@ inline bool perl_matcher::match_assert_backref( // Have we recursed into subexpression "index"? 
// If index == 0 then check for any recursion at all, otherwise for recursion to -index-1. int idx = -(index+1); - if(idx >= 10000) + if(idx >= hash_value_mask) { named_subexpressions::range_type r = re.get_data().equal_range(idx); int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx; diff --git a/test/regress/main.cpp b/test/regress/main.cpp index e3e3dd7f..87ad9ff4 100644 --- a/test/regress/main.cpp +++ b/test/regress/main.cpp @@ -139,10 +139,10 @@ int cpp_main(int /*argc*/, char * /*argv*/[]) int* get_array_data() { - static boost::thread_specific_ptr > tp; + static boost::thread_specific_ptr > tp; if(tp.get() == 0) - tp.reset(new boost::array); + tp.reset(new boost::array); return tp.get()->data(); } @@ -160,9 +160,9 @@ const int* make_array(int first, ...) #ifdef TEST_THREADS int* data = get_array_data(); #else - static int data[200]; + static int data[800]; #endif - std::fill_n(data, 200, -2); + std::fill_n(data, 800, -2); va_list ap; va_start(ap, first); // diff --git a/test/regress/test_backrefs.cpp b/test/regress/test_backrefs.cpp index 58f4dedb..be9f54ca 100644 --- a/test/regress/test_backrefs.cpp +++ b/test/regress/test_backrefs.cpp @@ -103,5 +103,10 @@ void test_backrefs() TEST_REGEX_SEARCH("a(?'foo'(?'bar'(?'bb'(?'aa'b*))))c\\g{foo}d", perl, "abbcbbbd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("^(?'foo'.)\\g{foo}", perl, "abc", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a(?'foo'[bc])\\g{foo}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); + + // Bug cases from https://github.com/boostorg/regex/issues/75 + TEST_REGEX_SEARCH("(?:(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)\\g{-1}|WORKING)", perl, "WORKING", match_default, make_array(0, 7, -2, -2)); + 
TEST_REGEX_SEARCH("(?:(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)\\g{-1}|WORKING)", perl, "WORKING", match_default, make_array(0, 7, -2, -2)); + }