From 38b58f2007ed2cdea9e9dea2e3b3c304b2617234 Mon Sep 17 00:00:00 2001 From: John Maddock Date: Fri, 19 Mar 2004 12:58:49 +0000 Subject: [PATCH] Alternatives now work. [SVN r22525] --- .../boost/regex/v4/basic_regex_creator.hpp | 3 +- include/boost/regex/v4/basic_regex_parser.hpp | 68 ++++++++++++++++++- include/boost/regex/v4/regbase.hpp | 3 + include/boost/regex/v4/states.hpp | 3 +- test/regress/basic_tests.cpp | 47 +++++++------ 5 files changed, 98 insertions(+), 26 deletions(-) diff --git a/include/boost/regex/v4/basic_regex_creator.hpp b/include/boost/regex/v4/basic_regex_creator.hpp index 62bfbed1..d7c2aa4a 100644 --- a/include/boost/regex/v4/basic_regex_creator.hpp +++ b/include/boost/regex/v4/basic_regex_creator.hpp @@ -348,14 +348,13 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, state = static_cast(state)->alt.p; break;; case syntax_element_alt: - assert(0); case syntax_element_rep: case syntax_element_dot_rep: case syntax_element_char_rep: case syntax_element_short_set_rep: case syntax_element_long_set_rep: { - re_alt* rep = static_cast(state); + re_alt* rep = static_cast(state); if(rep->_map[0] & mask_init) { if(map) diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index 65da6e08..f604f243 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -44,6 +44,7 @@ public: bool parse_match_any(); bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits::max)()); bool parse_repeat_range(bool isbasic); + bool parse_alt(); private: typedef bool (basic_regex_parser::*parser_proc_type)(); @@ -54,6 +55,7 @@ private: unsigned m_mark_count; // how many sub-expressions we have std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted). unsigned m_repeater_id; // the id of the next repeater + std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative basic_regex_parser& operator=(const basic_regex_parser&); basic_regex_parser(const basic_regex_parser&); @@ -61,7 +63,7 @@ private: template basic_regex_parser::basic_regex_parser(regex_data* data) - : basic_regex_creator(data), m_mark_count(0), m_paren_start(0), m_repeater_id(0) + : basic_regex_creator(data), m_mark_count(0), m_paren_start(0), m_repeater_id(0), m_alt_insert_point(0) { } @@ -201,6 +203,8 @@ bool basic_regex_parser::parse_extended() BOOST_ASSERT(0); result = false; break; + case regex_constants::syntax_or: + return parse_alt(); default: result = parse_literal(); break; @@ -230,6 +234,10 @@ bool basic_regex_parser::parse_open_paren() pb->index = markid; ++m_position; std::ptrdiff_t last_paren_start = this->getoffset(pb); + // back up insertion point for alternations, and set new point: + std::ptrdiff_t last_alt_point = m_alt_insert_point; + this->m_pdata->m_data.align(); + m_alt_insert_point = this->m_pdata->m_data.size(); // // now recursively add more states, this will terminate when we get to a // matching ')' : @@ -248,6 +256,10 @@ bool basic_regex_parser::parse_open_paren() pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); pb->index = markid; this->m_paren_start = last_paren_start; + // + // restore the alternate insertion point: + // + this->m_alt_insert_point = last_alt_point; return true; } @@ -280,12 +292,22 @@ bool basic_regex_parser::parse_basic_escape() else return parse_literal(); case regex_constants::syntax_open_brace: + if(this->m_pdata->m_flags & regbase::no_intervals) + return parse_literal(); ++m_position; return parse_repeat_range(true); case regex_constants::syntax_close_brace: + if(this->m_pdata->m_flags & regbase::no_intervals) + return parse_literal(); fail(REG_EBRACE, this->m_position - this->m_base); result = false; break; + case regex_constants::syntax_or: + if(this->m_pdata->m_flags & regbase::bk_vbar) + return parse_alt(); + else + result = parse_literal(); + break; default: result = parse_literal(); break; @@ -482,6 +504,50 @@ bool basic_regex_parser::parse_repeat_range(bool isbasic) return parse_repeat(min, max); } +template +bool basic_regex_parser::parse_alt() +{ + // + // error check: if there have been no previous states, + // or if the last state was a '(' then error: + // + if((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark)) + fail(REG_EMPTY, this->m_position - this->m_base); + ++m_position; + // + // we need to append a trailing jump, then insert the alternative: + // + re_syntax_base* pj = this->append_state(re_detail::syntax_element_jump, sizeof(re_jump)); + std::ptrdiff_t jump_offset = this->getoffset(pj); + re_alt* palt = static_cast(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size)); + jump_offset += re_alt_size; + this->m_pdata->m_data.align(); + palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt); + // + // update m_alt_insert_point so that the next alternate gets + // inserted at the start of the second of the two we've just created: + // + this->m_alt_insert_point = this->m_pdata->m_data.size(); + // + // recursively add states: + // + bool result = this->parse_all(); + // + // if we didn't actually add any trailing states then that's an error: + // + if(this->m_alt_insert_point == this->m_pdata->m_data.size()) + fail(REG_EMPTY, this->m_position - this->m_base); + // + // fix up the jump we added to point to the end of the states + // that we're just added: + // + this->m_pdata->m_data.align(); + re_jump* jmp = static_cast(this->getaddress(jump_offset)); + jmp->alt.i = this->m_pdata->m_data.size() - jump_offset; + + return result; +} + } // namespace re_detail } // namespace boost diff --git a/include/boost/regex/v4/regbase.hpp b/include/boost/regex/v4/regbase.hpp index 62435c88..a6f282f6 100644 --- a/include/boost/regex/v4/regbase.hpp +++ b/include/boost/regex/v4/regbase.hpp @@ -59,6 +59,7 @@ public: no_char_classes = 1 << 8, // [[:CLASS:]] not allowed no_intervals = 1 << 9, // {x,y} not allowed bk_plus_qm = 1 << 10, // uses \+ and \? + bk_vbar = 1 << 11, // use \| for alternatives // // options common to all groups: @@ -120,6 +121,8 @@ namespace regex_constants{ nosubs = ::boost::regbase::nosubs, optimize = ::boost::regbase::optimize, bk_plus_qm = ::boost::regbase::bk_plus_qm, + bk_vbar = ::boost::regbase::bk_vbar, + no_intervals = ::boost::regbase::no_intervals, basic = ::boost::regbase::basic, extended = ::boost::regbase::extended, diff --git a/include/boost/regex/v4/states.hpp b/include/boost/regex/v4/states.hpp index 31c86cf4..8fffe657 100644 --- a/include/boost/regex/v4/states.hpp +++ b/include/boost/regex/v4/states.hpp @@ -215,7 +215,8 @@ We provide this so we know how manybytes to insert when constructing the machine enum re_jump_size_type { re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask), - re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask) + re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask), + re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask) }; /*** proc re_is_set_member ********************************************* diff --git a/test/regress/basic_tests.cpp b/test/regress/basic_tests.cpp index b4eab37b..89f82f07 100644 --- a/test/regress/basic_tests.cpp +++ b/test/regress/basic_tests.cpp @@ -180,29 +180,32 @@ void basic_tests() TEST_INVALID_REGEX("a\\{1b\\}", basic); TEST_INVALID_REGEX("a\\{1,b\\}", basic); TEST_INVALID_REGEX("a\\{1,2v\\}", basic); -#if 0 -; now test the alternation operator | -- match_default normal REG_EXTENDED -a|b a 0 1 -a|b b 0 1 -a(b|c) ab 0 2 1 2 -a(b|c) ac 0 2 1 2 -a(b|c) ad -1 -1 -1 -1 -|c ! -c| ! -(|) ! -(a|) ! -(|a) ! -a\| a| 0 2 -- match_default normal limited_ops -a| a| 0 2 -a\| a| 0 2 -| | 0 1 -- match_default normal bk_vbar REG_NO_POSIX_TEST -a| a| 0 2 -a\|b a 0 1 -a\|b b 0 1 + // now test the alternation operator | + TEST_REGEX_SEARCH("a|b", perl, "a", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("a|b", perl, "b", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("a|b|c", perl, "c", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("a|(b)|.", perl, "b", match_default, make_array(0, 1, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("(a)|b|.", perl, "a", match_default, make_array(0, 1, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("a(b|c)", perl, "ab", match_default, make_array(0, 2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("a(b|c)", perl, "ac", match_default, make_array(0, 2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("a(b|c)", perl, "ad", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(a|b|c)", perl, "c", match_default, make_array(0, 1, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("(a|(b)|.)", perl, "b", match_default, make_array(0, 1, 0, 1, 0, 1, -2, -2)); + TEST_INVALID_REGEX("|c", perl); + TEST_INVALID_REGEX("c|", perl); + TEST_INVALID_REGEX("(|)", perl); + TEST_INVALID_REGEX("(a|)", perl); + TEST_INVALID_REGEX("(|a)", perl); + TEST_REGEX_SEARCH("a\\|", perl, "a|", match_default, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("a|", basic, "a|", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("a\\|", basic, "a|", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("|", basic, "|", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("a|", basic|bk_vbar, "a|", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("a\\|b", basic|bk_vbar, "a", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("a\\|b", basic|bk_vbar, "b", match_default, make_array(0, 1, -2, -2)); +#if 0 ; now test the set operator [] - match_default normal REG_EXTENDED