diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index 403facc2..24a47fd2 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -1127,6 +1127,7 @@ bool basic_regex_parser::parse_perl_extension() std::ptrdiff_t last_alt_point = m_alt_insert_point; this->m_pdata->m_data.align(); m_alt_insert_point = this->m_pdata->m_data.size(); + std::ptrdiff_t expected_alt_point = m_alt_insert_point; // // select the actual extension used: // @@ -1191,6 +1192,48 @@ bool basic_regex_parser::parse_perl_extension() this->m_pdata->m_data.align(); m_alt_insert_point = this->m_pdata->m_data.size(); break; + case regex_constants::syntax_open_mark: + { + // a conditional expression; index -4 marks this group as a conditional: + pb->index = markid = -4; + if(++m_position == m_end) + fail(REG_BADRPT, m_position - m_base); + int v = this->m_traits.toi(m_position, m_end, 10); + if(v > 0) + { + re_brace* br = static_cast(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); + br->index = v; + if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) + fail(REG_BADRPT, m_position - m_base); + if(++m_position == m_end) + fail(REG_BADRPT, m_position - m_base); + } + else + { + // no positive sub-expression number was read, so verify that we have a lookahead or lookbehind assert, then step m_position back by the increments made in this case: + if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question) + fail(REG_BADRPT, m_position - m_base); + if(++m_position == m_end) + fail(REG_BADRPT, m_position - m_base); + if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word) + { + if(++m_position == m_end) + fail(REG_BADRPT, m_position - m_base); + if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal) + && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not)) + fail(REG_BADRPT, m_position - m_base); + m_position -= 3; + } + else + { + if((this->m_traits.syntax_type(*m_position) != 
regex_constants::syntax_equal) + && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not)) + fail(REG_BADRPT, m_position - m_base); + m_position -= 2; + } + } + break; + } default: fail(REG_BADRPT, m_position - m_base); } @@ -1221,6 +1264,19 @@ bool basic_regex_parser::parse_perl_extension() } } // + // verify that, if this is a conditional expression, we do have + // an alternative at expected_alt_point; if not, insert one there: + // + if(markid == -4) + { + re_syntax_base* b = this->getaddress(expected_alt_point); + if(b->type != syntax_element_alt) + { + re_alt* alt = static_cast(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt))); + alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt); + } + } + // // append closing parenthesis state: // pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); diff --git a/include/boost/regex/v4/perl_matcher.hpp b/include/boost/regex/v4/perl_matcher.hpp index 41455a93..0627e787 100644 --- a/include/boost/regex/v4/perl_matcher.hpp +++ b/include/boost/regex/v4/perl_matcher.hpp @@ -334,6 +334,7 @@ private: bool match_dot_repeat_fast(); bool match_dot_repeat_slow(); bool match_backstep(); + bool match_assert_backref(); bool backtrack_till_match(unsigned count); // find procs stored in s_find_vtable: diff --git a/include/boost/regex/v4/perl_matcher_common.hpp b/include/boost/regex/v4/perl_matcher_common.hpp index 3e7f8eea..89198f02 100644 --- a/include/boost/regex/v4/perl_matcher_common.hpp +++ b/include/boost/regex/v4/perl_matcher_common.hpp @@ -81,11 +81,11 @@ void perl_matcher::estimate_max_state_count(std difference_type dist = boost::re_detail::distance(base, last); traits_size_type states = static_cast(re.size()); states *= states; - difference_type lim = (std::numeric_limits::max)() - 1000 - states; + difference_type lim = (std::numeric_limits::max)() - 100000 - states; if(dist > (difference_type)(lim / states)) max_state_count = lim; else - max_state_count = 1000 + states * 
dist; + max_state_count = 100000 + states * dist; } template void perl_matcher::estimate_max_state_count(void*) @@ -294,7 +294,7 @@ bool perl_matcher::match_endmark() if((m_match_flags & match_nosubs) == 0) m_presult->set_second(position, index); } - else if(index < 0) + else if((index < 0) && (index != -4)) { // matched forward lookahead: pstate = 0; @@ -670,6 +670,15 @@ bool perl_matcher::match_backstep() return true; } +template +bool perl_matcher::match_assert_backref() +{ + // return true if marked sub-expression N has been matched; N is read from the current state before stepping past it: + bool result = (*m_presult)[static_cast<const re_brace*>(pstate)->index].matched; + pstate = pstate->next.p; + return result; +} + template bool perl_matcher::find_restart_any() { diff --git a/include/boost/regex/v4/perl_matcher_non_recursive.hpp b/include/boost/regex/v4/perl_matcher_non_recursive.hpp index 4f6d45dc..e170db8d 100644 --- a/include/boost/regex/v4/perl_matcher_non_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_non_recursive.hpp @@ -113,7 +113,7 @@ struct saved_single_repeat : public saved_state template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[27] = + static matcher_proc_type const s_match_vtable[28] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -142,6 +142,7 @@ bool perl_matcher::match_all_states() &perl_matcher::match_set_repeat, &perl_matcher::match_long_set_repeat, &perl_matcher::match_backstep, + &perl_matcher::match_assert_backref, }; push_recursion_stopper(); @@ -344,6 +345,37 @@ bool perl_matcher::match_startmark() #endif return r; } + case -4: + { + // conditional expression: the next state is the alt whose alt.p is taken when the condition fails: + const re_alt* alt = static_cast(pstate->next.p); + BOOST_ASSERT(alt->type == syntax_element_alt); + pstate = alt->next.p; + if(pstate->type == syntax_element_assert_backref) + { + if(!match_assert_backref()) + pstate = alt->alt.p; + break; + } + else + { + // zero width assertion used as the condition (index -2 means negated), have to match this recursively: + BOOST_ASSERT(pstate->type == syntax_element_startmark); + bool negated = 
static_cast(pstate)->index == -2; + BidiIterator saved_position = position; + const re_syntax_base* next_pstate = static_cast(pstate->next.p)->alt.p->next.p; + pstate = pstate->next.p->next.p; + bool r = match_all_states(); + position = saved_position; + if(negated) + r = !r; + if(r) + pstate = next_pstate; + else + pstate = alt->alt.p; + break; + } + } default: { assert(index > 0); @@ -929,8 +961,8 @@ bool perl_matcher::unwind_greedy_single_repeat( const re_repeat* rep = pmp->rep; std::size_t count = pmp->count; - assert(rep->next.p); - assert(rep->alt.p); + assert(rep->next.p != 0); + assert(rep->alt.p != 0); count -= rep->min; @@ -979,8 +1011,8 @@ bool perl_matcher::unwind_slow_dot_repeat(bool const re_repeat* rep = pmp->rep; std::size_t count = pmp->count; assert(rep->type == syntax_element_dot_rep); - assert(rep->next.p); - assert(rep->alt.p); + assert(rep->next.p != 0); + assert(rep->alt.p != 0); assert(rep->next.p->type == syntax_element_wild); assert(count < rep->max); @@ -1097,8 +1129,8 @@ bool perl_matcher::unwind_char_repeat(bool r) position = pmp->last_position; assert(rep->type == syntax_element_char_rep); - assert(rep->next.p); - assert(rep->alt.p); + assert(rep->next.p != 0); + assert(rep->alt.p != 0); assert(rep->next.p->type == syntax_element_literal); assert(count < rep->max); @@ -1161,8 +1193,8 @@ bool perl_matcher::unwind_short_set_repeat(bool position = pmp->last_position; assert(rep->type == syntax_element_short_set_rep); - assert(rep->next.p); - assert(rep->alt.p); + assert(rep->next.p != 0); + assert(rep->alt.p != 0); assert(rep->next.p->type == syntax_element_set); assert(count < rep->max); @@ -1226,8 +1258,8 @@ bool perl_matcher::unwind_long_set_repeat(bool position = pmp->last_position; assert(rep->type == syntax_element_long_set_rep); - assert(rep->next.p); - assert(rep->alt.p); + assert(rep->next.p != 0); + assert(rep->alt.p != 0); assert(rep->next.p->type == syntax_element_long_set); assert(position != last); assert(count < rep->max); 
diff --git a/include/boost/regex/v4/perl_matcher_recursive.hpp b/include/boost/regex/v4/perl_matcher_recursive.hpp index 4ce6a75b..03a653b6 100644 --- a/include/boost/regex/v4/perl_matcher_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_recursive.hpp @@ -48,7 +48,7 @@ public: template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[27] = + static matcher_proc_type const s_match_vtable[28] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -77,6 +77,7 @@ bool perl_matcher::match_all_states() &perl_matcher::match_set_repeat, &perl_matcher::match_long_set_repeat, &perl_matcher::match_backstep, + &perl_matcher::match_assert_backref, }; if(state_count > max_state_count) @@ -157,6 +158,37 @@ bool perl_matcher::match_startmark() #endif break; } + case -4: + { + // conditional expression: the next state is the alt whose alt.p is taken when the condition fails: + const re_alt* alt = static_cast(pstate->next.p); + BOOST_ASSERT(alt->type == syntax_element_alt); + pstate = alt->next.p; + if(pstate->type == syntax_element_assert_backref) + { + if(!match_assert_backref()) + pstate = alt->alt.p; + break; + } + else + { + // zero width assertion used as the condition (index -2 means negated), have to match this recursively: + BOOST_ASSERT(pstate->type == syntax_element_startmark); + bool negated = static_cast(pstate)->index == -2; + BidiIterator saved_position = position; + const re_syntax_base* next_pstate = static_cast(pstate->next.p)->alt.p->next.p; + pstate = pstate->next.p->next.p; + bool r = match_all_states(); + position = saved_position; + if(negated) + r = !r; + if(r) + pstate = next_pstate; + else + pstate = alt->alt.p; + break; + } + } default: { assert(index > 0); diff --git a/include/boost/regex/v4/states.hpp b/include/boost/regex/v4/states.hpp index 92842a7e..f6ae6647 100644 --- a/include/boost/regex/v4/states.hpp +++ b/include/boost/regex/v4/states.hpp @@ -108,7 +108,9 @@ enum syntax_element_type syntax_element_short_set_rep = syntax_element_char_rep + 1, syntax_element_long_set_rep = syntax_element_short_set_rep + 
1, // a backstep for lookbehind repeats: - syntax_element_backstep = syntax_element_long_set_rep + 1 + syntax_element_backstep = syntax_element_long_set_rep + 1, + // an assertion that a marked sub-expression was matched (used as the condition of a conditional expression): + syntax_element_assert_backref = syntax_element_backstep +1 }; #ifdef BOOST_REGEX_DEBUG diff --git a/test/regress/basic_tests.cpp b/test/regress/basic_tests.cpp index 46dca15c..37aec3a4 100644 --- a/test/regress/basic_tests.cpp +++ b/test/regress/basic_tests.cpp @@ -784,6 +784,8 @@ void test_tricky_cases2() TEST_REGEX_SEARCH("([a-c]+)\\1", perl, "abcbc", match_default, make_array(1, 5, 1, 3, -2, -2)); TEST_REGEX_SEARCH(".+abc", perl, "xxxxxxxxyyyyyyyyab", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("(.+)\\1", perl, "abcdxxxyyyxxxyyy", match_default, make_array(4, 16, 4, 10, -2, -2)); + // this should not throw: + TEST_REGEX_SEARCH("[_]+$", perl, "___________________________________________x", match_default, make_array(-2, -2)); // // the strings in the next test case are too long for most compilers to cope with, @@ -1315,5 +1317,55 @@ void test_nosubs() TEST_REGEX_SEARCH("a(bbb+|bb+|b)bb", perl|nosubs, "abbb", match_default, make_array(0, 4, -2, -2)); TEST_REGEX_SEARCH("(.*).*", perl|nosubs, "abcdef", match_default, make_array(0, 6, -2, 6, 6, -2, -2)); TEST_REGEX_SEARCH("(a*)*", perl|nosubs, "bc", match_default, make_array(0, 0, -2, 1, 1, -2, 2, 2, -2, -2)); - +} + +void test_conditionals() +{ + using namespace boost::regex_constants; + TEST_REGEX_SEARCH("(?:(a)|b)(?(1)A|B)", perl, "aA", match_default, make_array(0, 2, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("(?:(a)|b)(?(1)A|B)", perl, "bB", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("(?:(a)|b)(?(1)A|B)", perl, "aB", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?:(a)|b)(?(1)A|B)", perl, "bA", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("^(a)?(?(1)a|b)+$", perl, "aa", match_default, make_array(0, 2, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("^(a)?(?(1)a|b)+$", perl, 
"b", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("^(a)?(?(1)a|b)+$", perl, "bb", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("^(a)?(?(1)a|b)+$", perl, "ab", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("^(?(?=abc)\\w{3}:|\\d\\d)$", perl, "abc:", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("^(?(?=abc)\\w{3}:|\\d\\d)$", perl, "12", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("^(?(?=abc)\\w{3}:|\\d\\d)$", perl, "123", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?(?=abc)\\w{3}:|\\d\\d)$", perl, "xyz", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("^(?(?!abc)\\d\\d|\\w{3}:)$", perl, "abc:", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("^(?(?!abc)\\d\\d|\\w{3}:)$", perl, "12", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("^(?(?!abc)\\d\\d|\\w{3}:)$", perl, "123", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?(?!abc)\\d\\d|\\w{3}:)$", perl, "xyz", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("(?(?<=foo)bar|cat)", perl, "foobar", match_default, make_array(3, 6, -2, -2)); + TEST_REGEX_SEARCH("(?(?<=foo)bar|cat)", perl, "cat", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?(?<=foo)bar|cat)", perl, "fcat", match_default, make_array(1, 4, -2, -2)); + TEST_REGEX_SEARCH("(?(?<=foo)bar|cat)", perl, "focat", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("(?(?<=foo)bar|cat)", perl, "foocat", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("(?(?