forked from boostorg/regex
Remove limit on the number of backrefs possible.
Changes named sub-expressions to use a different hashing scheme: the high order bit is now always set, so clashes between hashes and indexes don't happen until 2^30 or 2^62 sub-expressions in 32 and 64 bit code respectively. Changes the bitmask of seen sub-expressions to use dynamic storage for sub-expression indexes above 64. Adds tests for the above. Fixes https://github.com/boostorg/regex/issues/75.
This commit is contained in:
@ -70,13 +70,14 @@ void bubble_down_one(I first, I last)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const int hash_value_mask = 1 << (std::numeric_limits<int>::digits - 1);
|
||||||
|
|
||||||
template <class Iterator>
|
template <class Iterator>
|
||||||
inline int hash_value_from_capture_name(Iterator i, Iterator j)
|
inline int hash_value_from_capture_name(Iterator i, Iterator j)
|
||||||
{
|
{
|
||||||
std::size_t r = boost::hash_range(i, j);
|
std::size_t r = boost::hash_range(i, j);
|
||||||
r %= ((std::numeric_limits<int>::max)() - 10001);
|
r %= ((std::numeric_limits<int>::max)());
|
||||||
r += 10000;
|
return static_cast<int>(r) | hash_value_mask;
|
||||||
return static_cast<int>(r);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class named_subexpressions
|
class named_subexpressions
|
||||||
|
@ -20,6 +20,8 @@
|
|||||||
#ifndef BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
|
#ifndef BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
|
||||||
#define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
|
#define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
|
||||||
|
|
||||||
|
#include <boost/regex/v4/indexed_bit_flag.hpp>
|
||||||
|
|
||||||
#ifdef BOOST_MSVC
|
#ifdef BOOST_MSVC
|
||||||
#pragma warning(push)
|
#pragma warning(push)
|
||||||
#pragma warning(disable: 4103)
|
#pragma warning(disable: 4103)
|
||||||
@ -239,7 +241,7 @@ protected:
|
|||||||
bool m_icase; // true for case insensitive matches
|
bool m_icase; // true for case insensitive matches
|
||||||
unsigned m_repeater_id; // the state_id of the next repeater
|
unsigned m_repeater_id; // the state_id of the next repeater
|
||||||
bool m_has_backrefs; // true if there are actually any backrefs
|
bool m_has_backrefs; // true if there are actually any backrefs
|
||||||
unsigned m_backrefs; // bitmask of permitted backrefs
|
indexed_bit_flag m_backrefs; // bitmask of permitted backrefs
|
||||||
boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for;
|
boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for;
|
||||||
bool m_has_recursions; // set when we have recursive expresisons to fixup
|
bool m_has_recursions; // set when we have recursive expresisons to fixup
|
||||||
std::vector<unsigned char> m_recursion_checks; // notes which recursions we've followed while analysing this expression
|
std::vector<unsigned char> m_recursion_checks; // notes which recursions we've followed while analysing this expression
|
||||||
@ -267,7 +269,7 @@ private:
|
|||||||
|
|
||||||
template <class charT, class traits>
|
template <class charT, class traits>
|
||||||
basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data)
|
basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data)
|
||||||
: m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0), m_has_recursions(false)
|
: m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_has_recursions(false)
|
||||||
{
|
{
|
||||||
m_pdata->m_data.clear();
|
m_pdata->m_data.clear();
|
||||||
m_pdata->m_status = ::boost::regex_constants::error_ok;
|
m_pdata->m_status = ::boost::regex_constants::error_ok;
|
||||||
@ -763,7 +765,7 @@ void basic_regex_creator<charT, traits>::fixup_recursions(re_syntax_base* state)
|
|||||||
if(idx < 0)
|
if(idx < 0)
|
||||||
{
|
{
|
||||||
idx = -idx-1;
|
idx = -idx-1;
|
||||||
if(idx >= 10000)
|
if(idx >= hash_value_mask)
|
||||||
{
|
{
|
||||||
idx = m_pdata->get_id(idx);
|
idx = m_pdata->get_id(idx);
|
||||||
if(idx <= 0)
|
if(idx <= 0)
|
||||||
@ -795,7 +797,7 @@ void basic_regex_creator<charT, traits>::fixup_recursions(re_syntax_base* state)
|
|||||||
bool ok = false;
|
bool ok = false;
|
||||||
re_syntax_base* p = base;
|
re_syntax_base* p = base;
|
||||||
std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
|
std::ptrdiff_t idx = static_cast<re_jump*>(state)->alt.i;
|
||||||
if(idx > 10000)
|
if(idx >= hash_value_mask)
|
||||||
{
|
{
|
||||||
//
|
//
|
||||||
// There may be more than one capture group with this hash, just do what Perl
|
// There may be more than one capture group with this hash, just do what Perl
|
||||||
|
@ -545,8 +545,8 @@ bool basic_regex_parser<charT, traits>::parse_open_paren()
|
|||||||
//
|
//
|
||||||
// allow backrefs to this mark:
|
// allow backrefs to this mark:
|
||||||
//
|
//
|
||||||
if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
|
if(markid > 0)
|
||||||
this->m_backrefs |= 1u << (markid - 1);
|
this->m_backrefs.set(markid);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -912,7 +912,7 @@ escape_type_class_jump:
|
|||||||
}
|
}
|
||||||
if(negative)
|
if(negative)
|
||||||
i = 1 + m_mark_count - i;
|
i = 1 + m_mark_count - i;
|
||||||
if(((i > 0) && (i < std::numeric_limits<unsigned>::digits) && (i - 1 < static_cast<boost::intmax_t>(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_pdata->get_id(i)-1 < static_cast<boost::intmax_t>(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1)))))
|
if(((i > 0) && (this->m_backrefs.test(i)) || ((i >= hash_value_mask) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs.test(this->m_pdata->get_id(i))))))
|
||||||
{
|
{
|
||||||
m_position = pc;
|
m_position = pc;
|
||||||
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
|
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
|
||||||
@ -1944,7 +1944,7 @@ bool basic_regex_parser<charT, traits>::parse_backref()
|
|||||||
charT c = unescape_character();
|
charT c = unescape_character();
|
||||||
this->append_literal(c);
|
this->append_literal(c);
|
||||||
}
|
}
|
||||||
else if((i > 0) && (this->m_backrefs & (1u << (i-1))))
|
else if((i > 0) && (this->m_backrefs.test(i)))
|
||||||
{
|
{
|
||||||
m_position = pc;
|
m_position = pc;
|
||||||
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
|
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
|
||||||
@ -2718,8 +2718,7 @@ option_group_jump:
|
|||||||
//
|
//
|
||||||
// allow backrefs to this mark:
|
// allow backrefs to this mark:
|
||||||
//
|
//
|
||||||
if(markid < (int)(sizeof(unsigned) * CHAR_BIT))
|
this->m_backrefs.set(markid);
|
||||||
this->m_backrefs |= 1u << (markid - 1);
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
54
include/boost/regex/v4/indexed_bit_flag.hpp
Normal file
54
include/boost/regex/v4/indexed_bit_flag.hpp
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
/*
|
||||||
|
*
|
||||||
|
* Copyright (c) 2020
|
||||||
|
* John Maddock
|
||||||
|
*
|
||||||
|
* Use, modification and distribution are subject to the
|
||||||
|
* Boost Software License, Version 1.0. (See accompanying file
|
||||||
|
* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* LOCATION: see http://www.boost.org for most recent version.
|
||||||
|
* FILE indexed_bit_flag.hpp
|
||||||
|
* VERSION see <boost/version.hpp>
|
||||||
|
* DESCRIPTION: Declares class indexed_bit_flag.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <boost/regex/config.hpp>
|
||||||
|
#include <set>
|
||||||
|
|
||||||
|
#ifndef BOOST_REGEX_V4_INDEXED_BIT_FLAG_HPP
|
||||||
|
#define BOOST_REGEX_V4_INDEXED_BIT_FLAG_HPP
|
||||||
|
|
||||||
|
namespace boost{
|
||||||
|
namespace BOOST_REGEX_DETAIL_NS{
|
||||||
|
|
||||||
|
class indexed_bit_flag
|
||||||
|
{
|
||||||
|
boost::uint64_t low_mask;
|
||||||
|
std::set<std::size_t> mask_set;
|
||||||
|
public:
|
||||||
|
indexed_bit_flag() : low_mask(0) {}
|
||||||
|
void set(std::size_t i)
|
||||||
|
{
|
||||||
|
if (i < std::numeric_limits<boost::uint64_t>::digits - 1)
|
||||||
|
low_mask |= static_cast<boost::uint64_t>(1u) << i;
|
||||||
|
else
|
||||||
|
mask_set.insert(i);
|
||||||
|
}
|
||||||
|
bool test(std::size_t i)
|
||||||
|
{
|
||||||
|
if (i < std::numeric_limits<boost::uint64_t>::digits - 1)
|
||||||
|
return low_mask & static_cast<boost::uint64_t>(1u) << i ? true : false;
|
||||||
|
else
|
||||||
|
return mask_set.find(i) != mask_set.end();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace BOOST_REGEX_DETAIL_NS
|
||||||
|
} // namespace boost
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
@ -609,7 +609,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_backref()
|
|||||||
// or PCRE.
|
// or PCRE.
|
||||||
//
|
//
|
||||||
int index = static_cast<const re_brace*>(pstate)->index;
|
int index = static_cast<const re_brace*>(pstate)->index;
|
||||||
if(index >= 10000)
|
if(index >= hash_value_mask)
|
||||||
{
|
{
|
||||||
named_subexpressions::range_type r = re.get_data().equal_range(index);
|
named_subexpressions::range_type r = re.get_data().equal_range(index);
|
||||||
BOOST_ASSERT(r.first != r.second);
|
BOOST_ASSERT(r.first != r.second);
|
||||||
@ -758,7 +758,7 @@ inline bool perl_matcher<BidiIterator, Allocator, traits>::match_assert_backref(
|
|||||||
{
|
{
|
||||||
// Have we matched subexpression "index"?
|
// Have we matched subexpression "index"?
|
||||||
// Check if index is a hash value:
|
// Check if index is a hash value:
|
||||||
if(index >= 10000)
|
if(index >= hash_value_mask)
|
||||||
{
|
{
|
||||||
named_subexpressions::range_type r = re.get_data().equal_range(index);
|
named_subexpressions::range_type r = re.get_data().equal_range(index);
|
||||||
while(r.first != r.second)
|
while(r.first != r.second)
|
||||||
@ -782,7 +782,7 @@ inline bool perl_matcher<BidiIterator, Allocator, traits>::match_assert_backref(
|
|||||||
// Have we recursed into subexpression "index"?
|
// Have we recursed into subexpression "index"?
|
||||||
// If index == 0 then check for any recursion at all, otherwise for recursion to -index-1.
|
// If index == 0 then check for any recursion at all, otherwise for recursion to -index-1.
|
||||||
int idx = -(index+1);
|
int idx = -(index+1);
|
||||||
if(idx >= 10000)
|
if(idx >= hash_value_mask)
|
||||||
{
|
{
|
||||||
named_subexpressions::range_type r = re.get_data().equal_range(idx);
|
named_subexpressions::range_type r = re.get_data().equal_range(idx);
|
||||||
int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx;
|
int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx;
|
||||||
|
@ -139,10 +139,10 @@ int cpp_main(int /*argc*/, char * /*argv*/[])
|
|||||||
|
|
||||||
int* get_array_data()
|
int* get_array_data()
|
||||||
{
|
{
|
||||||
static boost::thread_specific_ptr<boost::array<int, 200> > tp;
|
static boost::thread_specific_ptr<boost::array<int, 800> > tp;
|
||||||
|
|
||||||
if(tp.get() == 0)
|
if(tp.get() == 0)
|
||||||
tp.reset(new boost::array<int, 200>);
|
tp.reset(new boost::array<int, 800>);
|
||||||
|
|
||||||
return tp.get()->data();
|
return tp.get()->data();
|
||||||
}
|
}
|
||||||
@ -160,9 +160,9 @@ const int* make_array(int first, ...)
|
|||||||
#ifdef TEST_THREADS
|
#ifdef TEST_THREADS
|
||||||
int* data = get_array_data();
|
int* data = get_array_data();
|
||||||
#else
|
#else
|
||||||
static int data[200];
|
static int data[800];
|
||||||
#endif
|
#endif
|
||||||
std::fill_n(data, 200, -2);
|
std::fill_n(data, 800, -2);
|
||||||
va_list ap;
|
va_list ap;
|
||||||
va_start(ap, first);
|
va_start(ap, first);
|
||||||
//
|
//
|
||||||
|
@ -103,5 +103,10 @@ void test_backrefs()
|
|||||||
TEST_REGEX_SEARCH("a(?'foo'(?'bar'(?'bb'(?'aa'b*))))c\\g{foo}d", perl, "abbcbbbd", match_default, make_array(-2, -2));
|
TEST_REGEX_SEARCH("a(?'foo'(?'bar'(?'bb'(?'aa'b*))))c\\g{foo}d", perl, "abbcbbbd", match_default, make_array(-2, -2));
|
||||||
TEST_REGEX_SEARCH("^(?'foo'.)\\g{foo}", perl, "abc", match_default, make_array(-2, -2));
|
TEST_REGEX_SEARCH("^(?'foo'.)\\g{foo}", perl, "abc", match_default, make_array(-2, -2));
|
||||||
TEST_REGEX_SEARCH("a(?'foo'[bc])\\g{foo}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2));
|
TEST_REGEX_SEARCH("a(?'foo'[bc])\\g{foo}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2));
|
||||||
|
|
||||||
|
// Bug cases from https://github.com/boostorg/regex/issues/75
|
||||||
|
TEST_REGEX_SEARCH("(?:(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)\\g{-1}|WORKING)", perl, "WORKING", match_default, make_array(0, 7, -2, -2));
|
||||||
|
TEST_REGEX_SEARCH("(?:(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)(z)\\g{-1}|WORKING)", perl, "WORKING", match_default, make_array(0, 7, -2, -2));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user