Remove limit on the number of backrefs possible.

Changes named sub-expressions to use a different hashing scheme: the high order bit is now always set, so that clashes between hashes and indexes don't happen until 2^30 or 2^62 sub-expressions in 32 and 64 bit code respectively.
Changes bitmask of seen sub-expressions to use dynamic storage for sub-expression indexes above 64.
Adds tests for the above.
Fixes https://github.com/boostorg/regex/issues/75.
This commit is contained in:
jzmaddock
2020-01-19 11:28:36 +00:00
parent b5d60694cc
commit 4bb4d392e4
7 changed files with 81 additions and 20 deletions

View File

@ -70,13 +70,14 @@ void bubble_down_one(I first, I last)
} }
} }
static const int hash_value_mask = 1 << (std::numeric_limits<int>::digits - 1);
template <class Iterator> template <class Iterator>
inline int hash_value_from_capture_name(Iterator i, Iterator j) inline int hash_value_from_capture_name(Iterator i, Iterator j)
{ {
std::size_t r = boost::hash_range(i, j); std::size_t r = boost::hash_range(i, j);
r %= ((std::numeric_limits<int>::max)() - 10001); r %= ((std::numeric_limits<int>::max)());
r += 10000; return static_cast<int>(r) | hash_value_mask;
return static_cast<int>(r);
} }
class named_subexpressions class named_subexpressions

View File

@ -20,6 +20,8 @@
#ifndef BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP #ifndef BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
#define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP #define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
#include <boost/regex/v4/indexed_bit_flag.hpp>
#ifdef BOOST_MSVC #ifdef BOOST_MSVC
#pragma warning(push) #pragma warning(push)
#pragma warning(disable: 4103) #pragma warning(disable: 4103)
@ -239,7 +241,7 @@ protected:
bool m_icase; // true for case insensitive matches bool m_icase; // true for case insensitive matches
unsigned m_repeater_id; // the state_id of the next repeater unsigned m_repeater_id; // the state_id of the next repeater
bool m_has_backrefs; // true if there are actually any backrefs bool m_has_backrefs; // true if there are actually any backrefs
unsigned m_backrefs; // bitmask of permitted backrefs indexed_bit_flag m_backrefs; // bitmask of permitted backrefs
boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for; boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for;
bool m_has_recursions; // set when we have recursive expresisons to fixup bool m_has_recursions; // set when we have recursive expresisons to fixup
std::vector<unsigned char> m_recursion_checks; // notes which recursions we've followed while analysing this expression std::vector<unsigned char> m_recursion_checks; // notes which recursions we've followed while analysing this expression
@ -267,7 +269,7 @@ private:
template <class charT, class traits> template <class charT, class traits>
basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data) basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data)
: m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0), m_has_recursions(false) : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_has_recursions(false)
{ {
m_pdata->m_data.clear(); m_pdata->m_data.clear();
m_pdata->m_status = ::boost::regex_constants::error_ok; m_pdata->m_status = ::boost::regex_constants::error_ok;
@ -763,7 +765,7 @@ void basic_regex_creator<charT, traits>::fixup_recursions(re_syntax_base* state)
if(idx < 0) if(idx < 0)
{ {
idx = -idx-1; idx = -idx-1;
if(idx >= 10000) if(idx >= hash_value_mask)
{ {
idx = m_pdata->get_id(idx); idx = m_pdata->get_id(idx);
if(idx <= 0) if(idx <= 0)
@ -795,7 +797,7 @@ void basic_regex_creator<charT, traits>::fixup_recursions(re_syntax_base* state)
bool ok = false; bool ok = false;
re_syntax_base* p = base; re_syntax_base* p = base;
std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i; std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
if(idx > 10000) if(idx >= hash_value_mask)
{ {
// //
// There may be more than one capture group with this hash, just do what Perl // There may be more than one capture group with this hash, just do what Perl

View File

@ -545,8 +545,8 @@ bool basic_regex_parser<charT, traits>::parse_open_paren()
// //
// allow backrefs to this mark: // allow backrefs to this mark:
// //
if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT)) if(markid > 0)
this->m_backrefs |= 1u << (markid - 1); this->m_backrefs.set(markid);
return true; return true;
} }
@ -912,7 +912,7 @@ escape_type_class_jump:
} }
if(negative) if(negative)
i = 1 + m_mark_count - i; i = 1 + m_mark_count - i;
if(((i > 0) && (i < std::numeric_limits<unsigned>::digits) && (i - 1 < static_cast<boost::intmax_t>(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_pdata->get_id(i)-1 < static_cast<boost::intmax_t>(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1))))) if(((i > 0) && (this->m_backrefs.test(i)) || ((i >= hash_value_mask) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs.test(this->m_pdata->get_id(i))))))
{ {
m_position = pc; m_position = pc;
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace))); re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
@ -1944,7 +1944,7 @@ bool basic_regex_parser<charT, traits>::parse_backref()
charT c = unescape_character(); charT c = unescape_character();
this->append_literal(c); this->append_literal(c);
} }
else if((i > 0) && (this->m_backrefs & (1u << (i-1)))) else if((i > 0) && (this->m_backrefs.test(i)))
{ {
m_position = pc; m_position = pc;
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace))); re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
@ -2718,8 +2718,7 @@ option_group_jump:
// //
// allow backrefs to this mark: // allow backrefs to this mark:
// //
if(markid < (int)(sizeof(unsigned) * CHAR_BIT)) this->m_backrefs.set(markid);
this->m_backrefs |= 1u << (markid - 1);
} }
return true; return true;
} }

View File

@ -0,0 +1,54 @@
/*
*
* Copyright (c) 2020
* John Maddock
*
* Use, modification and distribution are subject to the
* Boost Software License, Version 1.0. (See accompanying file
* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
*
*/
/*
* LOCATION: see http://www.boost.org for most recent version.
* FILE indexed_bit_flag.hpp
* VERSION see <boost/version.hpp>
* DESCRIPTION: Declares class indexed_bit_flag.
*/
#include <boost/regex/config.hpp>
#include <set>
#ifndef BOOST_REGEX_V4_INDEXED_BIT_FLAG_HPP
#define BOOST_REGEX_V4_INDEXED_BIT_FLAG_HPP
namespace boost{
namespace BOOST_REGEX_DETAIL_NS{
class indexed_bit_flag
{
boost::uint64_t low_mask;
std::set<std::size_t> mask_set;
public:
indexed_bit_flag() : low_mask(0) {}
void set(std::size_t i)
{
if (i < std::numeric_limits<boost::uint64_t>::digits - 1)
low_mask |= static_cast<boost::uint64_t>(1u) << i;
else
mask_set.insert(i);
}
bool test(std::size_t i)
{
if (i < std::numeric_limits<boost::uint64_t>::digits - 1)
return low_mask & static_cast<boost::uint64_t>(1u) << i ? true : false;
else
return mask_set.find(i) != mask_set.end();
}
};
} // namespace BOOST_REGEX_DETAIL_NS
} // namespace boost
#endif

View File

@ -609,7 +609,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_backref()
// or PCRE. // or PCRE.
// //
int index = static_cast<const re_brace*>(pstate)->index; int index = static_cast<const re_brace*>(pstate)->index;
if(index >= 10000) if(index >= hash_value_mask)
{ {
named_subexpressions::range_type r = re.get_data().equal_range(index); named_subexpressions::range_type r = re.get_data().equal_range(index);
BOOST_ASSERT(r.first != r.second); BOOST_ASSERT(r.first != r.second);
@ -758,7 +758,7 @@ inline bool perl_matcher<BidiIterator, Allocator, traits>::match_assert_backref(
{ {
// Have we matched subexpression "index"? // Have we matched subexpression "index"?
// Check if index is a hash value: // Check if index is a hash value:
if(index >= 10000) if(index >= hash_value_mask)
{ {
named_subexpressions::range_type r = re.get_data().equal_range(index); named_subexpressions::range_type r = re.get_data().equal_range(index);
while(r.first != r.second) while(r.first != r.second)
@ -782,7 +782,7 @@ inline bool perl_matcher<BidiIterator, Allocator, traits>::match_assert_backref(
// Have we recursed into subexpression "index"? // Have we recursed into subexpression "index"?
// If index == 0 then check for any recursion at all, otherwise for recursion to -index-1. // If index == 0 then check for any recursion at all, otherwise for recursion to -index-1.
int idx = -(index+1); int idx = -(index+1);
if(idx >= 10000) if(idx >= hash_value_mask)
{ {
named_subexpressions::range_type r = re.get_data().equal_range(idx); named_subexpressions::range_type r = re.get_data().equal_range(idx);
int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx; int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx;

View File

@ -139,10 +139,10 @@ int cpp_main(int /*argc*/, char * /*argv*/[])
int* get_array_data() int* get_array_data()
{ {
static boost::thread_specific_ptr<boost::array<int, 200> > tp; static boost::thread_specific_ptr<boost::array<int, 800> > tp;
if(tp.get() == 0) if(tp.get() == 0)
tp.reset(new boost::array<int, 200>); tp.reset(new boost::array<int, 800>);
return tp.get()->data(); return tp.get()->data();
} }
@ -160,9 +160,9 @@ const int* make_array(int first, ...)
#ifdef TEST_THREADS #ifdef TEST_THREADS
int* data = get_array_data(); int* data = get_array_data();
#else #else
static int data[200]; static int data[800];
#endif #endif
std::fill_n(data, 200, -2); std::fill_n(data, 800, -2);
va_list ap; va_list ap;
va_start(ap, first); va_start(ap, first);
// //

View File

@ -103,5 +103,10 @@ void test_backrefs()
TEST_REGEX_SEARCH("a(?'foo'(?'bar'(?'bb'(?'aa'b*))))c\\g{foo}d", perl, "abbcbbbd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a(?'foo'(?'bar'(?'bb'(?'aa'b*))))c\\g{foo}d", perl, "abbcbbbd", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("^(?'foo'.)\\g{foo}", perl, "abc", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("^(?'foo'.)\\g{foo}", perl, "abc", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("a(?'foo'[bc])\\g{foo}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); TEST_REGEX_SEARCH("a(?'foo'[bc])\\g{foo}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2));
// Bug cases from https://github.com/boostorg/regex/issues/75
TEST_REGEX_SEARCH("(?:(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)\\g{-1}|WORKING)", perl, "WORKING", match_default, make_array(0, 7, -2, -2));
TEST_REGEX_SEARCH("(?:(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)\\g{-1}|WORKING)", perl, "WORKING", match_default, make_array(0, 7, -2, -2));
} }