From c281c9cc40276acfc0d9113f3d616480ddf345f3 Mon Sep 17 00:00:00 2001 From: jzmaddock Date: Tue, 29 Sep 2015 17:40:43 +0100 Subject: [PATCH] Add COMMIT support plus lots of tests. --- include/boost/regex/v4/basic_regex.hpp | 5 +- .../boost/regex/v4/basic_regex_creator.hpp | 6 + include/boost/regex/v4/basic_regex_parser.hpp | 25 ++++ include/boost/regex/v4/perl_matcher.hpp | 3 + .../boost/regex/v4/perl_matcher_common.hpp | 18 +-- .../regex/v4/perl_matcher_non_recursive.hpp | 123 ++++++++++++++- .../boost/regex/v4/perl_matcher_recursive.hpp | 3 +- include/boost/regex/v4/states.hpp | 1 + test/regress/main.cpp | 3 + test/regress/test_perl_ex.cpp | 140 ++++++++++++++++++ 10 files changed, 308 insertions(+), 19 deletions(-) diff --git a/include/boost/regex/v4/basic_regex.hpp b/include/boost/regex/v4/basic_regex.hpp index 68c3124e..ac91af29 100644 --- a/include/boost/regex/v4/basic_regex.hpp +++ b/include/boost/regex/v4/basic_regex.hpp @@ -164,9 +164,9 @@ struct regex_data : public named_subexpressions regex_data(const ::boost::shared_ptr< ::boost::regex_traits_wrapper >& t) - : m_ptraits(t), m_expression(0), m_expression_len(0) {} + : m_ptraits(t), m_expression(0), m_expression_len(0), m_disable_match_any(false) {} regex_data() - : m_ptraits(new ::boost::regex_traits_wrapper()), m_expression(0), m_expression_len(0) {} + : m_ptraits(new ::boost::regex_traits_wrapper()), m_expression(0), m_expression_len(0), m_disable_match_any(false) {} ::boost::shared_ptr< ::boost::regex_traits_wrapper @@ -186,6 +186,7 @@ struct regex_data : public named_subexpressions std::pair< std::size_t, std::size_t> > m_subs; // Position of sub-expressions within the *string*. bool m_has_recursions; // whether we have recursive expressions; + bool m_disable_match_any; // when set we need to disable the match_any flag as it causes different/buggy behaviour. }; // // class basic_regex_implementation diff --git a/include/boost/regex/v4/basic_regex_creator.hpp b/include/boost/regex/v4/basic_regex_creator.hpp index b514edcf..51704a84 100644 --- a/include/boost/regex/v4/basic_regex_creator.hpp +++ b/include/boost/regex/v4/basic_regex_creator.hpp @@ -1149,6 +1149,7 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, set_all_masks(l_map, mask); return; } + case syntax_element_accept: case syntax_element_match: { // must be null, any character can match: @@ -1335,6 +1336,11 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, state = state->next.p; break; + case syntax_element_commit: + set_all_masks(l_map, mask); + // Continue scanning so we can figure out whether we can be null: + state = state->next.p; + break; case syntax_element_startmark: // need to handle independent subs as a special case: if(static_cast(state)->index == -3) diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index 8252991c..7a2e9745 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -2740,6 +2740,31 @@ bool basic_regex_parser::parse_perl_verb() return true; } break; + case 'C': + if(++m_position == m_end) + { + // Rewind to start of (* sequence: + --m_position; + while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; + fail(regex_constants::error_perl_extension, m_position - m_base); + return false; + } + if(match_verb("OMMIT")) + { + if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + { + // Rewind to start of (* sequence: + --m_position; + while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position; + fail(regex_constants::error_perl_extension, m_position - m_base); + return false; + } + ++m_position; + this->append_state(syntax_element_commit); + this->m_pdata->m_disable_match_any = true; + return true; + } + break; } return false; } diff --git a/include/boost/regex/v4/perl_matcher.hpp b/include/boost/regex/v4/perl_matcher.hpp index f165da7e..3fe85506 100644 --- a/include/boost/regex/v4/perl_matcher.hpp +++ b/include/boost/regex/v4/perl_matcher.hpp @@ -445,6 +445,8 @@ private: bool match_recursion(); bool match_fail(); bool match_accept(); + bool match_commit(); + bool skip_until_paren(int index, bool match = true); // find procs stored in s_find_vtable: bool find_restart_any(); @@ -527,6 +529,7 @@ private: bool unwind_non_greedy_repeat(bool); bool unwind_recursion(bool); bool unwind_recursion_pop(bool); + bool unwind_commit(bool); void destroy_single_repeat(); void push_matched_paren(int index, const sub_match& sub); void push_recursion_stopper(); diff --git a/include/boost/regex/v4/perl_matcher_common.hpp b/include/boost/regex/v4/perl_matcher_common.hpp index e4282ca9..4544d202 100644 --- a/include/boost/regex/v4/perl_matcher_common.hpp +++ b/include/boost/regex/v4/perl_matcher_common.hpp @@ -85,6 +85,9 @@ void perl_matcher::construct_init(const basic_r m_word_mask = re.get_data().m_word_mask; // find bitmask to use for matching '.': match_any_mask = static_cast((f & match_not_dot_newline) ? BOOST_REGEX_DETAIL_NS::test_not_newline : BOOST_REGEX_DETAIL_NS::test_newline); + // Disable match_any if requested in the state machine: + if(e.get_data().m_disable_match_any) + m_match_flags &= ~regex_constants::match_any; } template @@ -800,21 +803,6 @@ bool perl_matcher::match_fail() return false; } -template -bool perl_matcher::match_accept() -{ - // Almost the same as match_match, but we need to close any half-open capturing groups: - for(unsigned i = 1; i < m_result.size(); ++i) - { - if((m_result[i].matched == false) && (m_result[i].first != last)) - { - m_result.set_second(position, i); - } - } - return match_match(); -} - - template bool perl_matcher::find_restart_any() { diff --git a/include/boost/regex/v4/perl_matcher_non_recursive.hpp b/include/boost/regex/v4/perl_matcher_non_recursive.hpp index 51bf8461..b0681d0a 100644 --- a/include/boost/regex/v4/perl_matcher_non_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_non_recursive.hpp @@ -141,7 +141,7 @@ struct saved_recursion : public saved_state template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[32] = + static matcher_proc_type const s_match_vtable[33] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -179,6 +179,7 @@ bool perl_matcher::match_all_states() &perl_matcher::match_recursion, &perl_matcher::match_fail, &perl_matcher::match_accept, + &perl_matcher::match_commit, }; push_recursion_stopper(); @@ -1006,6 +1007,116 @@ bool perl_matcher::match_match() return true; } +template +bool perl_matcher::match_commit() +{ + // Ideally we would just junk all the states that are on the stack, + // however we might not unwind correctly in that case, so for now, + // just mark that we don't backtrack into whatever is left (or rather + // we'll unwind it unconditionally without pausing to try other matches). + saved_state* pmp = m_backup_state; + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = m_backup_state; + --pmp; + } + (void) new (pmp)saved_state(16); + m_backup_state = pmp; + pstate = pstate->next.p; + // If we don't find a match we don't want to search further either: + restart = last; + return true; +} + +template +bool perl_matcher::skip_until_paren(int index, bool match) +{ + while(pstate) + { + if(pstate->type == syntax_element_endmark) + { + if(static_cast(pstate)->index == index) + { + if(match) + return this->match_endmark(); + pstate = pstate->next.p; + return true; + } + else + { + // Unenclosed closing ), occurs when (*ACCEPT) is inside some other + // parenthesis which may or may not have other side effects associated with it. + match_endmark(); + if(!pstate) + { + unwind(true); + } + } + continue; + } + else if(pstate->type == syntax_element_match) + return true; + else if(pstate->type == syntax_element_startmark) + { + int index = static_cast(pstate)->index; + pstate = pstate->next.p; + skip_until_paren(index, false); + continue; + } + pstate = pstate->next.p; + } + return true; +} + +template +bool perl_matcher::match_accept() +{ +#if 0 + // Almost the same as match_match, but we need to close any half-open capturing groups: + for(unsigned i = 1; i < m_result.size(); ++i) + { + if((m_result[i].matched == false) && (m_result[i].first != last)) + { + m_result.set_second(position, i); + } + } + if(!recursion_stack.empty()) + { + // Skip forward to the end of this recursion: + while(pstate) + { + if(pstate->type == syntax_element_endmark) + if(static_cast(pstate)->index == recursion_stack.back().idx) + break; + pstate = pstate->next.p; + } + return true; + /* + int index = recursion_stack.back().idx; + pstate = recursion_stack.back().preturn_address; + *m_presult = recursion_stack.back().results; + push_recursion(index, recursion_stack.back().preturn_address, &recursion_stack.back().results); + recursion_stack.pop_back(); + push_repeater_count(-(2 + index), &next_count); + return true; + */ + } + else + return match_match(); +#endif + if(!recursion_stack.empty()) + { + skip_until_paren(recursion_stack.back().idx); + } + else + { + skip_until_paren(INT_MAX); + } + return true; +} + /**************************************************************************** Unwind and associated proceedures follow, these perform what normal stack @@ -1034,6 +1145,7 @@ bool perl_matcher::unwind(bool have_match) &perl_matcher::unwind_non_greedy_repeat, &perl_matcher::unwind_recursion, &perl_matcher::unwind_recursion_pop, + &perl_matcher::unwind_commit, }; m_recursive_result = have_match; @@ -1583,6 +1695,15 @@ void perl_matcher::push_recursion_pop() (void) new (pmp)saved_state(15); m_backup_state = pmp; } + +template +bool perl_matcher::unwind_commit(bool b) +{ + boost::BOOST_REGEX_DETAIL_NS::inplace_destroy(m_backup_state++); + while(unwind(b)) {} + return false; +} + /* template bool perl_matcher::unwind_parenthesis_pop(bool r) diff --git a/include/boost/regex/v4/perl_matcher_recursive.hpp b/include/boost/regex/v4/perl_matcher_recursive.hpp index 741f596e..03fa9deb 100644 --- a/include/boost/regex/v4/perl_matcher_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_recursive.hpp @@ -60,7 +60,7 @@ public: template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[32] = + static matcher_proc_type const s_match_vtable[33] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -98,6 +98,7 @@ bool perl_matcher::match_all_states() &perl_matcher::match_recursion, &perl_matcher::match_fail, &perl_matcher::match_accept, + &perl_matcher::match_commit, }; if(state_count > max_state_count) diff --git a/include/boost/regex/v4/states.hpp b/include/boost/regex/v4/states.hpp index e6c03d07..aba08180 100644 --- a/include/boost/regex/v4/states.hpp +++ b/include/boost/regex/v4/states.hpp @@ -124,6 +124,7 @@ enum syntax_element_type // Verbs: syntax_element_fail = syntax_element_recurse + 1, syntax_element_accept = syntax_element_fail + 1, + syntax_element_commit = syntax_element_accept + 1, }; #ifdef BOOST_REGEX_DEBUG diff --git a/test/regress/main.cpp b/test/regress/main.cpp index a97c48a0..a469f0a2 100644 --- a/test/regress/main.cpp +++ b/test/regress/main.cpp @@ -49,6 +49,7 @@ int error_count = 0; void run_tests() { +#if 0 RUN_TESTS(basic_tests); RUN_TESTS(test_simple_repeats); RUN_TESTS(test_alt); @@ -82,6 +83,7 @@ void run_tests() RUN_TESTS(test_pocessive_repeats); RUN_TESTS(test_mark_resets); RUN_TESTS(test_recursion); +#endif RUN_TESTS(test_verbs); } @@ -160,6 +162,7 @@ const int* make_array(int first, ...) #else static int data[200]; #endif + std::fill_n(data, 200, -2); va_list ap; va_start(ap, first); // diff --git a/test/regress/test_perl_ex.cpp b/test/regress/test_perl_ex.cpp index 314d297c..2cb9a1c6 100644 --- a/test/regress/test_perl_ex.cpp +++ b/test/regress/test_perl_ex.cpp @@ -941,4 +941,144 @@ void test_verbs() TEST_REGEX_SEARCH("a+(*FAIL)b", perl, "aaaab", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "AB", match_default, make_array(0, 2, 0, 2, 1, 2, -1, -1, -2, -2)); TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "ACDE", match_default, make_array(0, 4, 0, 3, 1, 2, 3, 4, -2, -2)); + + TEST_REGEX_SEARCH("^a+(*FAIL)", perl, "aaaaaa", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a+b?c+(*FAIL)", perl, "aaabccc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a+b?(*COMMIT)c+(*FAIL)", perl, "aaabccc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "AB", match_default, make_array(0, 2, 0, 2, 1, 2, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "ABX", match_default, make_array(0, 2, 0, 2, 1, 2, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "AADE", match_default, make_array(0, 4, 0, 3, 1, 2, 3, 4, -2, -2)); + TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "ACDE", match_default, make_array(0, 4, 0, 3, 1, 2, 3, 4, -2, -2)); + TEST_REGEX_SEARCH("(A(A|B(*ACCEPT)|C)D)(E)", perl, "AD", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "AAD", match_default, make_array(0, 2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "ACD", match_default, make_array(0, 3, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "BAD", match_default, make_array(0, 2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "BCD", match_default, make_array(0, 3, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "BAX", match_default, make_array(0, 2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "ACX", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?:(?1)|B)(A(*ACCEPT)XX|C)D", perl, "ABC", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("^(?=a(*ACCEPT)b)", perl, "ac", match_default, make_array(0, 0, -2, -2)); + TEST_REGEX_SEARCH("A(*COMMIT)(B|D)", perl, "ACABX", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("(*COMMIT)(A|P)(B|P)(C|P)", perl, "ABCDEFG", match_default, make_array(0, 3, 0, 1, 1, 2, 2, 3, -2, -2)); + TEST_REGEX_SEARCH("(*COMMIT)(A|P)(B|P)(C|P)", perl, "DEFGABC", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("(\\w+)(?>b(*COMMIT))\\w{2}", perl, "abbb", match_default, make_array(0, 4, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("(\\w+)b(*COMMIT)\\w{2}", perl, "abbb", match_default, make_array(-2, -2)); + +// + + +#if 0 +/a+b?(*PRUNE)c+(*FAIL)/ + aaabccc + +/a+b?(*SKIP)c+(*FAIL)/ + aaabcccaaabccc + +/^(?:aaa(*THEN)\w{6}|bbb(*THEN)\w{5}|ccc(*THEN)\w{4}|\w{3})/ + aaaxxxxxx + aaa++++++ + bbbxxxxx + bbb+++++ + cccxxxx + ccc++++ + dddddddd + +/^(aaa(*THEN)\w{6}|bbb(*THEN)\w{5}|ccc(*THEN)\w{4}|\w{3})/ + aaaxxxxxx + aaa++++++ + bbbxxxxx + bbb+++++ + cccxxxx + ccc++++ + dddddddd + +/a+b?(*THEN)c+(*FAIL)/ + aaabccc + +/^(?=a(*SKIP)b|ac)/ + ** Failers + ac + +/^(?=a(*PRUNE)b)/ + ab + ** Failers + ac + +~~~~~ + +# Check the use of names for failure + +/^(A(*PRUNE:A)B|C(*PRUNE:B)D)/mark + ** Failers + AC + CB + +/(*MARK:A)(*SKIP:B)(C|X)/mark + C + D + +/^(A(*THEN:A)B|C(*THEN:B)D)/mark + ** Failers + CB + +/^(?:A(*THEN:A)B|C(*THEN:B)D)/mark + CB + +/^(?>A(*THEN:A)B|C(*THEN:B)D)/mark + CB + +# This should succeed, as the skip causes bump to offset 1 (the mark). Note +# that we have to have something complicated such as (B|Z) at the end because, +# for Perl, a simple character somehow causes an unwanted optimization to mess +# with the handling of backtracking verbs. + +/A(*MARK:A)A+(*SKIP:A)(B|Z) | AC/x,mark + AAAC + +# Test skipping over a non-matching mark. + +/A(*MARK:A)A+(*MARK:B)(*SKIP:A)(B|Z) | AC/x,mark + AAAC + +# Check shorthand for MARK. + +/A(*:A)A+(*SKIP:A)(B|Z) | AC/x,mark + AAAC + +/(*:A)A+(*SKIP:A)(B|Z)/mark + AAAC + +# This should succeed, as a non-existent skip name disables the skip. + +/A(*MARK:A)A+(*SKIP:B)(B|Z) | AC/x,mark + AAAC + +/A(*MARK:A)A+(*SKIP:B)(B|Z) | AC(*:B)/x,mark + AAAC + +# COMMIT should override THEN. + +/(?>(*COMMIT)(?>yes|no)(*THEN)(*F))?/ + yes + +/(?>(*COMMIT)(yes|no)(*THEN)(*F))?/ + yes + +/b?(*SKIP)c/ + bc + abc + +/(*SKIP)bc/ + a + +/(*SKIP)b/ + a + +#endif + + }