Add COMMIT support plus lots of tests.

This commit is contained in:
jzmaddock
2015-09-29 17:40:43 +01:00
parent 2580fb035f
commit c281c9cc40
10 changed files with 308 additions and 19 deletions

View File

@ -164,9 +164,9 @@ struct regex_data : public named_subexpressions
regex_data(const ::boost::shared_ptr<
::boost::regex_traits_wrapper<traits> >& t)
: m_ptraits(t), m_expression(0), m_expression_len(0) {}
: m_ptraits(t), m_expression(0), m_expression_len(0), m_disable_match_any(false) {}
regex_data()
: m_ptraits(new ::boost::regex_traits_wrapper<traits>()), m_expression(0), m_expression_len(0) {}
: m_ptraits(new ::boost::regex_traits_wrapper<traits>()), m_expression(0), m_expression_len(0), m_disable_match_any(false) {}
::boost::shared_ptr<
::boost::regex_traits_wrapper<traits>
@ -186,6 +186,7 @@ struct regex_data : public named_subexpressions
std::pair<
std::size_t, std::size_t> > m_subs; // Position of sub-expressions within the *string*.
bool m_has_recursions; // whether we have recursive expressions;
bool m_disable_match_any; // when set we need to disable the match_any flag as it causes different/buggy behaviour.
};
//
// class basic_regex_implementation

View File

@ -1149,6 +1149,7 @@ void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state,
set_all_masks(l_map, mask);
return;
}
case syntax_element_accept:
case syntax_element_match:
{
// must be null, any character can match:
@ -1335,6 +1336,11 @@ void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state,
state = state->next.p;
break;
case syntax_element_commit:
set_all_masks(l_map, mask);
// Continue scanning so we can figure out whether we can be null:
state = state->next.p;
break;
case syntax_element_startmark:
// need to handle independent subs as a special case:
if(static_cast<re_brace*>(state)->index == -3)

View File

@ -2740,6 +2740,31 @@ bool basic_regex_parser<charT, traits>::parse_perl_verb()
return true;
}
break;
case 'C':
if(++m_position == m_end)
{
// Rewind to start of (* sequence:
--m_position;
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
fail(regex_constants::error_perl_extension, m_position - m_base);
return false;
}
if(match_verb("OMMIT"))
{
if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
{
// Rewind to start of (* sequence:
--m_position;
while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
fail(regex_constants::error_perl_extension, m_position - m_base);
return false;
}
++m_position;
this->append_state(syntax_element_commit);
this->m_pdata->m_disable_match_any = true;
return true;
}
break;
}
return false;
}

View File

@ -445,6 +445,8 @@ private:
bool match_recursion();
bool match_fail();
bool match_accept();
bool match_commit();
bool skip_until_paren(int index, bool match = true);
// find procs stored in s_find_vtable:
bool find_restart_any();
@ -527,6 +529,7 @@ private:
bool unwind_non_greedy_repeat(bool);
bool unwind_recursion(bool);
bool unwind_recursion_pop(bool);
bool unwind_commit(bool);
void destroy_single_repeat();
void push_matched_paren(int index, const sub_match<BidiIterator>& sub);
void push_recursion_stopper();

View File

@ -85,6 +85,9 @@ void perl_matcher<BidiIterator, Allocator, traits>::construct_init(const basic_r
m_word_mask = re.get_data().m_word_mask;
// find bitmask to use for matching '.':
match_any_mask = static_cast<unsigned char>((f & match_not_dot_newline) ? BOOST_REGEX_DETAIL_NS::test_not_newline : BOOST_REGEX_DETAIL_NS::test_newline);
// Disable match_any if requested in the state machine:
if(e.get_data().m_disable_match_any)
m_match_flags &= ~regex_constants::match_any;
}
template <class BidiIterator, class Allocator, class traits>
@ -800,21 +803,6 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_fail()
return false;
}
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::match_accept()
{
// Almost the same as match_match, but we need to close any half-open capturing groups:
for(unsigned i = 1; i < m_result.size(); ++i)
{
if((m_result[i].matched == false) && (m_result[i].first != last))
{
m_result.set_second(position, i);
}
}
return match_match();
}
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::find_restart_any()
{

View File

@ -141,7 +141,7 @@ struct saved_recursion : public saved_state
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::match_all_states()
{
static matcher_proc_type const s_match_vtable[32] =
static matcher_proc_type const s_match_vtable[33] =
{
(&perl_matcher<BidiIterator, Allocator, traits>::match_startmark),
&perl_matcher<BidiIterator, Allocator, traits>::match_endmark,
@ -179,6 +179,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_all_states()
&perl_matcher<BidiIterator, Allocator, traits>::match_recursion,
&perl_matcher<BidiIterator, Allocator, traits>::match_fail,
&perl_matcher<BidiIterator, Allocator, traits>::match_accept,
&perl_matcher<BidiIterator, Allocator, traits>::match_commit,
};
push_recursion_stopper();
@ -1006,6 +1007,116 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_match()
return true;
}
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::match_commit()
{
// Ideally we would just junk all the states that are on the stack,
// however we might not unwind correctly in that case, so for now,
// just mark that we don't backtrack into whatever is left (or rather
// we'll unwind it unconditionally without pausing to try other matches).
saved_state* pmp = m_backup_state;
--pmp;
if(pmp < m_stack_base)
{
extend_stack();
pmp = m_backup_state;
--pmp;
}
(void) new (pmp)saved_state(16);
m_backup_state = pmp;
pstate = pstate->next.p;
// If we don't find a match we don't want to search further either:
restart = last;
return true;
}
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::skip_until_paren(int index, bool match)
{
while(pstate)
{
if(pstate->type == syntax_element_endmark)
{
if(static_cast<const re_brace*>(pstate)->index == index)
{
if(match)
return this->match_endmark();
pstate = pstate->next.p;
return true;
}
else
{
// Unenclosed closing ), occurs when (*ACCEPT) is inside some other
// parenthesis which may or may not have other side effects associated with it.
match_endmark();
if(!pstate)
{
unwind(true);
}
}
continue;
}
else if(pstate->type == syntax_element_match)
return true;
else if(pstate->type == syntax_element_startmark)
{
int index = static_cast<const re_brace*>(pstate)->index;
pstate = pstate->next.p;
skip_until_paren(index, false);
continue;
}
pstate = pstate->next.p;
}
return true;
}
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::match_accept()
{
#if 0
// Almost the same as match_match, but we need to close any half-open capturing groups:
for(unsigned i = 1; i < m_result.size(); ++i)
{
if((m_result[i].matched == false) && (m_result[i].first != last))
{
m_result.set_second(position, i);
}
}
if(!recursion_stack.empty())
{
// Skip forward to the end of this recursion:
while(pstate)
{
if(pstate->type == syntax_element_endmark)
if(static_cast<const re_brace*>(pstate)->index == recursion_stack.back().idx)
break;
pstate = pstate->next.p;
}
return true;
/*
int index = recursion_stack.back().idx;
pstate = recursion_stack.back().preturn_address;
*m_presult = recursion_stack.back().results;
push_recursion(index, recursion_stack.back().preturn_address, &recursion_stack.back().results);
recursion_stack.pop_back();
push_repeater_count(-(2 + index), &next_count);
return true;
*/
}
else
return match_match();
#endif
if(!recursion_stack.empty())
{
skip_until_paren(recursion_stack.back().idx);
}
else
{
skip_until_paren(INT_MAX);
}
return true;
}
/****************************************************************************
Unwind and associated proceedures follow, these perform what normal stack
@ -1034,6 +1145,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::unwind(bool have_match)
&perl_matcher<BidiIterator, Allocator, traits>::unwind_non_greedy_repeat,
&perl_matcher<BidiIterator, Allocator, traits>::unwind_recursion,
&perl_matcher<BidiIterator, Allocator, traits>::unwind_recursion_pop,
&perl_matcher<BidiIterator, Allocator, traits>::unwind_commit,
};
m_recursive_result = have_match;
@ -1583,6 +1695,15 @@ void perl_matcher<BidiIterator, Allocator, traits>::push_recursion_pop()
(void) new (pmp)saved_state(15);
m_backup_state = pmp;
}
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::unwind_commit(bool b)
{
boost::BOOST_REGEX_DETAIL_NS::inplace_destroy(m_backup_state++);
while(unwind(b)) {}
return false;
}
/*
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::unwind_parenthesis_pop(bool r)

View File

@ -60,7 +60,7 @@ public:
template <class BidiIterator, class Allocator, class traits>
bool perl_matcher<BidiIterator, Allocator, traits>::match_all_states()
{
static matcher_proc_type const s_match_vtable[32] =
static matcher_proc_type const s_match_vtable[33] =
{
(&perl_matcher<BidiIterator, Allocator, traits>::match_startmark),
&perl_matcher<BidiIterator, Allocator, traits>::match_endmark,
@ -98,6 +98,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_all_states()
&perl_matcher<BidiIterator, Allocator, traits>::match_recursion,
&perl_matcher<BidiIterator, Allocator, traits>::match_fail,
&perl_matcher<BidiIterator, Allocator, traits>::match_accept,
&perl_matcher<BidiIterator, Allocator, traits>::match_commit,
};
if(state_count > max_state_count)

View File

@ -124,6 +124,7 @@ enum syntax_element_type
// Verbs:
syntax_element_fail = syntax_element_recurse + 1,
syntax_element_accept = syntax_element_fail + 1,
syntax_element_commit = syntax_element_accept + 1,
};
#ifdef BOOST_REGEX_DEBUG

View File

@ -49,6 +49,7 @@ int error_count = 0;
void run_tests()
{
#if 0
RUN_TESTS(basic_tests);
RUN_TESTS(test_simple_repeats);
RUN_TESTS(test_alt);
@ -82,6 +83,7 @@ void run_tests()
RUN_TESTS(test_pocessive_repeats);
RUN_TESTS(test_mark_resets);
RUN_TESTS(test_recursion);
#endif
RUN_TESTS(test_verbs);
}
@ -160,6 +162,7 @@ const int* make_array(int first, ...)
#else
static int data[200];
#endif
std::fill_n(data, 200, -2);
va_list ap;
va_start(ap, first);
//

View File

@ -941,4 +941,144 @@ void test_verbs()
TEST_REGEX_SEARCH("a+(*FAIL)b", perl, "aaaab", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "AB", match_default, make_array(0, 2, 0, 2, 1, 2, -1, -1, -2, -2));
TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "ACDE", match_default, make_array(0, 4, 0, 3, 1, 2, 3, 4, -2, -2));
TEST_REGEX_SEARCH("^a+(*FAIL)", perl, "aaaaaa", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("a+b?c+(*FAIL)", perl, "aaabccc", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("a+b?(*COMMIT)c+(*FAIL)", perl, "aaabccc", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "AB", match_default, make_array(0, 2, 0, 2, 1, 2, -1, -1, -2, -2));
TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "ABX", match_default, make_array(0, 2, 0, 2, 1, 2, -1, -1, -2, -2));
TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "AADE", match_default, make_array(0, 4, 0, 3, 1, 2, 3, 4, -2, -2));
TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "ACDE", match_default, make_array(0, 4, 0, 3, 1, 2, 3, 4, -2, -2));
TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "AD", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "AAD", match_default, make_array(0, 2, 1, 2, -2, -2));
TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "ACD", match_default, make_array(0, 3, 1, 2, -2, -2));
TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "BAD", match_default, make_array(0, 2, 1, 2, -2, -2));
TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "BCD", match_default, make_array(0, 3, 1, 2, -2, -2));
TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "BAX", match_default, make_array(0, 2, 1, 2, -2, -2));
TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "ACX", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "ABC", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("^(?=a(*ACCEPT)b)", perl, "ac", match_default, make_array(0, 0, -2, -2));
TEST_REGEX_SEARCH("A(*COMMIT)(B|D)", perl, "ACABX", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("(*COMMIT)(A|P)(B|P)(C|P)", perl, "ABCDEFG", match_default, make_array(0, 3, 0, 1, 1, 2, 2, 3, -2, -2));
TEST_REGEX_SEARCH("(*COMMIT)(A|P)(B|P)(C|P)", perl, "DEFGABC", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("(\\w+)(?>b(*COMMIT))\\w{2}", perl, "abbb", match_default, make_array(0, 4, 0, 1, -2, -2));
TEST_REGEX_SEARCH("(\\w+)b(*COMMIT)\\w{2}", perl, "abbb", match_default, make_array(-2, -2));
//
#if 0
/a+b?(*PRUNE)c+(*FAIL)/
aaabccc
/a+b?(*SKIP)c+(*FAIL)/
aaabcccaaabccc
/^(?:aaa(*THEN)\w{6}|bbb(*THEN)\w{5}|ccc(*THEN)\w{4}|\w{3})/
aaaxxxxxx
aaa++++++
bbbxxxxx
bbb+++++
cccxxxx
ccc++++
dddddddd
/^(aaa(*THEN)\w{6}|bbb(*THEN)\w{5}|ccc(*THEN)\w{4}|\w{3})/
aaaxxxxxx
aaa++++++
bbbxxxxx
bbb+++++
cccxxxx
ccc++++
dddddddd
/a+b?(*THEN)c+(*FAIL)/
aaabccc
/^(?=a(*SKIP)b|ac)/
** Failers
ac
/^(?=a(*PRUNE)b)/
ab
** Failers
ac
~~~~~
# Check the use of names for failure
/^(A(*PRUNE:A)B|C(*PRUNE:B)D)/mark
** Failers
AC
CB
/(*MARK:A)(*SKIP:B)(C|X)/mark
C
D
/^(A(*THEN:A)B|C(*THEN:B)D)/mark
** Failers
CB
/^(?:A(*THEN:A)B|C(*THEN:B)D)/mark
CB
/^(?>A(*THEN:A)B|C(*THEN:B)D)/mark
CB
# This should succeed, as the skip causes bump to offset 1 (the mark). Note
# that we have to have something complicated such as (B|Z) at the end because,
# for Perl, a simple character somehow causes an unwanted optimization to mess
# with the handling of backtracking verbs.
/A(*MARK:A)A+(*SKIP:A)(B|Z) | AC/x,mark
AAAC
# Test skipping over a non-matching mark.
/A(*MARK:A)A+(*MARK:B)(*SKIP:A)(B|Z) | AC/x,mark
AAAC
# Check shorthand for MARK.
/A(*:A)A+(*SKIP:A)(B|Z) | AC/x,mark
AAAC
/(*:A)A+(*SKIP:A)(B|Z)/mark
AAAC
# This should succeed, as a non-existent skip name disables the skip.
/A(*MARK:A)A+(*SKIP:B)(B|Z) | AC/x,mark
AAAC
/A(*MARK:A)A+(*SKIP:B)(B|Z) | AC(*:B)/x,mark
AAAC
# COMMIT should override THEN.
/(?>(*COMMIT)(?>yes|no)(*THEN)(*F))?/
yes
/(?>(*COMMIT)(yes|no)(*THEN)(*F))?/
yes
/b?(*SKIP)c/
bc
abc
/(*SKIP)bc/
a
/(*SKIP)b/
a
#endif
}