diff --git a/build/Jamfile.v2 b/build/Jamfile.v2 index a8359dda..312632f8 100644 --- a/build/Jamfile.v2 +++ b/build/Jamfile.v2 @@ -102,11 +102,13 @@ SOURCES = lib boost_regex : ../src/$(SOURCES) icu_options : shared:BOOST_REGEX_DYN_LINK=1 - gcc-mw:static - gcc-mingw:static + #gcc-mw:static + #gcc-mingw:static gcc-cygwin:static ; boost-install boost_regex ; + + diff --git a/example/Jamfile.v2 b/example/Jamfile.v2 index 82098387..dd958c7d 100644 --- a/example/Jamfile.v2 +++ b/example/Jamfile.v2 @@ -14,8 +14,8 @@ project gcc:all gcc:-Wextra U_USING_ICU_NAMESPACE=0 - gcc-mw:static - gcc-mingw:static + #gcc-mw:static + #gcc-mingw:static gcc-cygwin:static ; @@ -78,3 +78,4 @@ test-suite regex-examples : + diff --git a/include/boost/regex/v4/basic_regex.hpp b/include/boost/regex/v4/basic_regex.hpp index f807fce9..04c7bb37 100644 --- a/include/boost/regex/v4/basic_regex.hpp +++ b/include/boost/regex/v4/basic_regex.hpp @@ -53,7 +53,7 @@ void bubble_down_one(I first, I last) if(first != last) { I next = last - 1; - while((next != first) && !(*(next-1) < *next)) + while((next != first) && (*next < *(next-1))) { (next-1)->swap(*next); --next; @@ -61,70 +61,59 @@ void bubble_down_one(I first, I last) } } -// -// Class named_subexpressions -// Contains information about named subexpressions within the regex. -// -template -class named_subexpressions_base -{ -public: - virtual int get_id(const charT* i, const charT* j)const = 0; - virtual int get_id(std::size_t h)const = 0; -#ifdef __GNUC__ - // warning supression: - virtual ~named_subexpressions_base(){} -#endif -}; - template -inline std::size_t hash_value_from_capture_name(Iterator i, Iterator j) +inline int hash_value_from_capture_name(Iterator i, Iterator j) { std::size_t r = boost::hash_range(i, j); r %= ((std::numeric_limits::max)() - 10001); r += 10000; - return r; + return static_cast(r); } -template -class named_subexpressions : public named_subexpressions_base +class named_subexpressions { +public: struct name { + template name(const charT* i, const charT* j, int idx) - : /*n(i, j), */ index(idx) + : index(idx) { hash = hash_value_from_capture_name(i, j); } - name(std::size_t h, int idx) + name(int h, int idx) : index(idx), hash(h) { } - //std::vector n; int index; - std::size_t hash; + int hash; bool operator < (const name& other)const { - return hash < other.hash; //std::lexicographical_compare(n.begin(), n.end(), other.n.begin(), other.n.end()); + return hash < other.hash; } bool operator == (const name& other)const { - return hash == other.hash; //n == other.n; + return hash == other.hash; } void swap(name& other) { - //n.swap(other.n); std::swap(index, other.index); std::swap(hash, other.hash); } }; -public: + + typedef std::vector::const_iterator const_iterator; + typedef std::pair range_type; + named_subexpressions(){} + + template void set_name(const charT* i, const charT* j, int index) { m_sub_names.push_back(name(i, j, index)); bubble_down_one(m_sub_names.begin(), m_sub_names.end()); } + template int get_id(const charT* i, const charT* j)const { name t(i, j, 0); @@ -135,72 +124,37 @@ public: } return -1; } - int get_id(std::size_t h)const + template + range_type equal_range(const charT* i, const charT* j)const + { + name t(i, j, 0); + return std::equal_range(m_sub_names.begin(), m_sub_names.end(), t); + } + int get_id(int h)const { name t(h, 0); - typename std::vector::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t); + std::vector::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t); if((pos != m_sub_names.end()) && (*pos == t)) { return pos->index; } return -1; } + range_type equal_range(int h)const + { + name t(h, 0); + return std::equal_range(m_sub_names.begin(), m_sub_names.end(), t); + } private: std::vector m_sub_names; }; -template -class named_subexpressions_converter : public named_subexpressions_base -{ - boost::shared_ptr > m_converter; -public: - named_subexpressions_converter(boost::shared_ptr > s) - : m_converter(s) {} - int get_id(const charT* i, const charT* j)const - { - if(i == j) - return -1; - std::vector v; - while(i != j) - { - v.push_back(*i); - ++i; - } - return m_converter->get_id(&v[0], &v[0] + v.size()); - } - int get_id(std::size_t h)const - { - return m_converter->get_id(h); - } -}; - -template -inline boost::shared_ptr > convert_to_named_subs_imp( - boost::shared_ptr > s, - boost::integral_constant const&) -{ - return s; -} -template -inline boost::shared_ptr > convert_to_named_subs_imp( - boost::shared_ptr > s, - boost::integral_constant const&) -{ - return boost::shared_ptr >(new named_subexpressions_converter(s)); -} -template -inline boost::shared_ptr > convert_to_named_subs( - boost::shared_ptr > s) -{ - typedef typename boost::is_same::type tag_type; - return convert_to_named_subs_imp(s, tag_type()); -} // // class regex_data: // represents the data we wish to expose to the matching algorithms. // template -struct regex_data : public named_subexpressions +struct regex_data : public named_subexpressions { typedef regex_constants::syntax_option_type flag_type; typedef std::size_t size_type; @@ -672,7 +626,7 @@ public: BOOST_ASSERT(0 != m_pimpl.get()); return m_pimpl->get_data(); } - boost::shared_ptr > get_named_subs()const + boost::shared_ptr get_named_subs()const { return m_pimpl; } diff --git a/include/boost/regex/v4/basic_regex_creator.hpp b/include/boost/regex/v4/basic_regex_creator.hpp index f3bc0061..efa9f7dd 100644 --- a/include/boost/regex/v4/basic_regex_creator.hpp +++ b/include/boost/regex/v4/basic_regex_creator.hpp @@ -806,7 +806,13 @@ void basic_regex_creator::fixup_recursions(re_syntax_base* state) re_syntax_base* p = base; std::ptrdiff_t idx = static_cast(state)->alt.i; if(idx > 10000) - idx = m_pdata->get_id(idx); + { + // + // There may be more than one capture group with this hash, just do what Perl + // does and recurse to the leftmost: + // + idx = m_pdata->get_id(static_cast(idx)); + } while(p) { if((p->type == syntax_element_startmark) && (static_cast(p)->index == idx)) diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index 6779383e..4dacfc6f 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -820,7 +820,11 @@ escape_type_class_jump: return false; } // maybe have \g{ddd} - if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace) + regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position); + regex_constants::syntax_type syn_end = 0; + if((syn == regex_constants::syntax_open_brace) + || (syn == regex_constants::escape_type_left_word) + || (syn == regex_constants::escape_type_end_buffer)) { if(++m_position == m_end) { @@ -828,6 +832,18 @@ escape_type_class_jump: return false; } have_brace = true; + switch(syn) + { + case regex_constants::syntax_open_brace: + syn_end = regex_constants::syntax_close_brace; + break; + case regex_constants::escape_type_left_word: + syn_end = regex_constants::escape_type_right_word; + break; + default: + syn_end = regex_constants::escape_type_end_buffer; + break; + } } negative = (*m_position == static_cast('-')); if((negative) && (++m_position == m_end)) @@ -837,18 +853,20 @@ escape_type_class_jump: } const charT* pc = m_position; int i = this->m_traits.toi(pc, m_end, 10); - if(i < 0) + if((i < 0) && syn_end) { - // Check for a named capture: + // Check for a named capture, get the leftmost one if there is more than one: const charT* base = m_position; - while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) + while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end)) + { ++m_position; - i = this->m_pdata->get_id(base, m_position); + } + i = hash_value_from_capture_name(base, m_position); pc = m_position; } if(negative) i = 1 + m_mark_count - i; - if((i > 0) && (this->m_backrefs & (1u << (i-1)))) + if(((i > 0) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1))))) { m_position = pc; re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); @@ -863,7 +881,7 @@ escape_type_class_jump: m_position = pc; if(have_brace) { - if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) + if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end)) { fail(regex_constants::error_escape, m_position - m_base, incomplete_message); return false; diff --git a/include/boost/regex/v4/match_results.hpp b/include/boost/regex/v4/match_results.hpp index 8f11bbbe..ca9898f4 100644 --- a/include/boost/regex/v4/match_results.hpp +++ b/include/boost/regex/v4/match_results.hpp @@ -38,7 +38,6 @@ namespace boost{ namespace re_detail{ -template class named_subexpressions; } @@ -69,7 +68,7 @@ public: typedef typename re_detail::regex_iterator_traits< BidiIterator>::value_type char_type; typedef std::basic_string string_type; - typedef re_detail::named_subexpressions_base named_sub_type; + typedef re_detail::named_subexpressions named_sub_type; // construct/copy/destroy: explicit match_results(const Allocator& a = Allocator()) @@ -225,10 +224,15 @@ public: // const_reference named_subexpression(const char_type* i, const char_type* j) const { + // + // Scan for the leftmost *matched* subexpression with the specified named: + // if(m_is_singular) raise_logic_error(); - int index = m_named_subs->get_id(i, j); - return index > 0 ? (*this)[index] : m_null; + re_detail::named_subexpressions::range_type r = m_named_subs->equal_range(i, j); + while((r.first != r.second) && ((*this)[r.first->index].matched == false)) + ++r.first; + return r.first != r.second ? (*this)[r.first->index] : m_null; } template const_reference named_subexpression(const charT* i, const charT* j) const @@ -243,10 +247,20 @@ public: } int named_subexpression_index(const char_type* i, const char_type* j) const { + // + // Scan for the leftmost *matched* subexpression with the specified named. + // If none found then return the leftmost expression with that name, + // otherwise an invalid index: + // if(m_is_singular) raise_logic_error(); - int index = m_named_subs->get_id(i, j); - return index > 0 ? index : -20; + re_detail::named_subexpressions::range_type s, r; + s = r = m_named_subs->equal_range(i, j); + while((r.first != r.second) && ((*this)[r.first->index].matched == false)) + ++r.first; + if(r.first == r.second) + r = s; + return r.first != r.second ? r.first->index : -20; } template int named_subexpression_index(const charT* i, const charT* j) const diff --git a/include/boost/regex/v4/perl_matcher_common.hpp b/include/boost/regex/v4/perl_matcher_common.hpp index fe224210..b8c4e963 100644 --- a/include/boost/regex/v4/perl_matcher_common.hpp +++ b/include/boost/regex/v4/perl_matcher_common.hpp @@ -200,7 +200,7 @@ bool perl_matcher::match_imp() m_match_flags |= regex_constants::match_all; m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), search_base, last); m_presult->set_base(base); - m_presult->set_named_subs(re_detail::convert_to_named_subs::char_type>(this->re.get_named_subs())); + m_presult->set_named_subs(this->re.get_named_subs()); if(m_match_flags & match_posix) m_result = *m_presult; verify_options(re.flags(), m_match_flags); @@ -262,7 +262,7 @@ bool perl_matcher::find_imp() pstate = re.get_first_state(); m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), base, last); m_presult->set_base(base); - m_presult->set_named_subs(re_detail::convert_to_named_subs::char_type>(this->re.get_named_subs())); + m_presult->set_named_subs(this->re.get_named_subs()); m_match_flags |= regex_constants::match_init; } else @@ -588,8 +588,23 @@ bool perl_matcher::match_backref() // in the match, this is in line with ECMAScript, but not Perl // or PCRE. // - BidiIterator i = (*m_presult)[static_cast(pstate)->index].first; - BidiIterator j = (*m_presult)[static_cast(pstate)->index].second; + int index = static_cast(pstate)->index; + if(index >= 10000) + { + named_subexpressions::range_type r = re.get_data().equal_range(index); + BOOST_ASSERT(r.first != r.second); + do + { + index = r.first->index; + ++r.first; + }while((r.first != r.second) && ((*m_presult)[index].matched != true)); + } + + if((m_match_flags & match_perl) && !(*m_presult)[index].matched) + return false; + + BidiIterator i = (*m_presult)[index].first; + BidiIterator j = (*m_presult)[index].second; while(i != j) { if((position == last) || (traits_inst.translate(*position, icase) != traits_inst.translate(*i, icase))) @@ -713,7 +728,7 @@ inline bool perl_matcher::match_assert_backref( { // return true if marked sub-expression N has been matched: int index = static_cast(pstate)->index; - bool result; + bool result = false; if(index == 9999) { // Magic value for a (DEFINE) block: @@ -721,11 +736,25 @@ inline bool perl_matcher::match_assert_backref( } else if(index > 0) { + // Have we matched subexpression "index"? // Check if index is a hash value: if(index >= 10000) - index = re.get_data().get_id(index); - // Have we matched subexpression "index"? - result = (*m_presult)[index].matched; + { + named_subexpressions::range_type r = re.get_data().equal_range(index); + while(r.first != r.second) + { + if((*m_presult)[r.first->index].matched) + { + result = true; + break; + } + ++r.first; + } + } + else + { + result = (*m_presult)[index].matched; + } pstate = pstate->next.p; } else @@ -734,8 +763,20 @@ inline bool perl_matcher::match_assert_backref( // If index == 0 then check for any recursion at all, otherwise for recursion to -index-1. int idx = -index-1; if(idx >= 10000) - idx = re.get_data().get_id(idx); - result = !recursion_stack.empty() && ((recursion_stack.back().idx == idx) || (index == 0)); + { + named_subexpressions::range_type r = re.get_data().equal_range(idx); + int stack_index = recursion_stack.empty() ? -1 : recursion_stack.back().idx; + while(r.first != r.second) + { + result |= (stack_index == r.first->index); + if(result)break; + ++r.first; + } + } + else + { + result = !recursion_stack.empty() && ((recursion_stack.back().idx == idx) || (index == 0)); + } pstate = pstate->next.p; } return result; diff --git a/include/boost/regex/v4/sub_match.hpp b/include/boost/regex/v4/sub_match.hpp index 1c79e39a..34a86840 100644 --- a/include/boost/regex/v4/sub_match.hpp +++ b/include/boost/regex/v4/sub_match.hpp @@ -56,7 +56,7 @@ struct sub_match : public std::pair template operator std::basic_string ()const { - return std::basic_string(this->first, this->second); + return matched ? std::basic_string(this->first, this->second) : std::basic_string(); } #else operator std::basic_string ()const @@ -66,19 +66,22 @@ struct sub_match : public std::pair #endif difference_type BOOST_REGEX_CALL length()const { - difference_type n = ::boost::re_detail::distance((BidiIterator)this->first, (BidiIterator)this->second); + difference_type n = matched ? ::boost::re_detail::distance((BidiIterator)this->first, (BidiIterator)this->second) : 0; return n; } std::basic_string str()const { std::basic_string result; - std::size_t len = ::boost::re_detail::distance((BidiIterator)this->first, (BidiIterator)this->second); - result.reserve(len); - BidiIterator i = this->first; - while(i != this->second) + if(matched) { - result.append(1, *i); - ++i; + std::size_t len = ::boost::re_detail::distance((BidiIterator)this->first, (BidiIterator)this->second); + result.reserve(len); + BidiIterator i = this->first; + while(i != this->second) + { + result.append(1, *i); + ++i; + } } return result; } diff --git a/src/regex_traits_defaults.cpp b/src/regex_traits_defaults.cpp index 31b79184..5f06149c 100644 --- a/src/regex_traits_defaults.cpp +++ b/src/regex_traits_defaults.cpp @@ -100,7 +100,7 @@ BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants "p", "P", "N", - "g", + "gk", "K", "R", }; diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 18af0d7d..ce198b56 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -17,8 +17,8 @@ project gcc:-Wextra gcc:-Wshadow U_USING_ICU_NAMESPACE=0 - gcc-mw:static - gcc-mingw:static + #gcc-mw:static + #gcc-mingw:static gcc-cygwin:static ; @@ -177,3 +177,5 @@ test-suite regex ; build-project ../example ; + + diff --git a/test/regress/test_perl_ex.cpp b/test/regress/test_perl_ex.cpp index 05abf22d..f96be261 100644 --- a/test/regress/test_perl_ex.cpp +++ b/test/regress/test_perl_ex.cpp @@ -907,5 +907,16 @@ void test_recursion() // Bugs: TEST_REGEX_SEARCH("namespace\\s+(\\w+)\\s+(\\{(?:[^{}]*(?:(?2)[^{}]*)*)?\\})", perl, "namespace one { namespace two { int foo(); } }", match_default, make_array(0, 46, 10, 13, 14, 46, -2, -2)); TEST_REGEX_SEARCH("namespace\\s+(\\w+)\\s+(\\{(?:[^{}]*(?:(?2)[^{}]*)*)?\\})", perl, "namespace one { namespace two { int foo(){} } { {{{ } } } } {}}", match_default, make_array(0, 64, 10, 13, 14, 64, -2, -2)); + + // Recursion to a named sub with a name that is used multiple times: + TEST_REGEX_SEARCH("(?:(?a+)|(?b+))\\.(?&A)", perl, "aaaa.aa", match_default, make_array(0, 7, 0, 4, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("(?:(?a+)|(?b+))\\.(?&A)", perl, "bbbb.aa", match_default, make_array(0, 7, -1, -1, 0, 4, -2, -2)); + TEST_REGEX_SEARCH("(?:(?a+)|(?b+))\\.(?&A)", perl, "bbbb.bb", match_default, make_array(-2, -2)); + // Back reference to a named sub with a name that is used multiple times: + TEST_REGEX_SEARCH("(?:(?a+)|(?b+))\\.\\k", perl, "aaaa.aaaa", match_default, make_array(0, 9, 0, 4, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("(?:(?a+)|(?b+))\\.\\k", perl, "bbbb.aaaa", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?:(?a+)|(?b+))\\.\\k", perl, "aaaa.bbbb", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?:(?a+)|(?b+))\\.\\k", perl, "bbbb.bbbb", match_default, make_array(0, 9, -1, -1, 0, 4, -2, -2)); + TEST_REGEX_SEARCH("(?:(?a+)|(?b+)|c+)\\.\\k", perl, "cccc.cccc", match_default, make_array(-2, -2)); } diff --git a/test/regress/test_replace.cpp b/test/regress/test_replace.cpp index 06c137bf..2d8724eb 100644 --- a/test/regress/test_replace.cpp +++ b/test/regress/test_replace.cpp @@ -184,5 +184,11 @@ void test_replace() TEST_REGEX_REPLACE("(?a+)(?b+)", perl, " ...aabb,,", match_default|format_no_copy, "$2", "bb"); TEST_REGEX_REPLACE("(?a+)(?b+)", perl, " ...aabb,,", match_default|format_no_copy, "d$+{one}c", "daac"); TEST_REGEX_REPLACE("(?a+)(?b+)", perl, " ...aabb,,", match_default|format_no_copy, "c$+{two}d", "cbbd"); + + TEST_REGEX_REPLACE("(?a)(?b)(c)", perl, " ...abc,,", match_default, "$1.$2.$3.$+{one}", " ...a.b.c.a,,"); + TEST_REGEX_REPLACE("(?:(?a)|(?b))", perl, " ...a,,", match_default, "$1.$2.$+{one}", " ...a..a,,"); + TEST_REGEX_REPLACE("(?:(?a)|(?b))", perl, " ...b,,", match_default, "$1.$2.$+{one}", " ....b.b,,"); + TEST_REGEX_REPLACE("(?:(?a)(?b))", perl, " ...ab,,", match_default, "$1.$2.$+{one}", " ...a.b.a,,"); + } diff --git a/test/regress/test_tricky_cases.cpp b/test/regress/test_tricky_cases.cpp index e025dc43..033c3dea 100644 --- a/test/regress/test_tricky_cases.cpp +++ b/test/regress/test_tricky_cases.cpp @@ -49,7 +49,7 @@ void test_tricky_cases() TEST_REGEX_SEARCH("(a)d|(b)c", perl, "abc", match_default, make_array(1, 3, -1, -1, 1, 2, -2, -2)); TEST_REGEX_SEARCH("_+((www)|(ftp)|(mailto)):_*", perl, "_wwwnocolon _mailto:", match_default, make_array(12, 20, 13, 19, -1, -1, -1, -1, 13, 19, -2, -2)); // subtleties of matching - TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", match_default, make_array(0, 3, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a(b?c)+d", perl, "accd", match_default, make_array(0, 4, 2, 3, -2, -2)); TEST_REGEX_SEARCH("(wee|week)(knights|night)", perl, "weeknights", match_default, make_array(0, 10, 0, 3, 3, 10, -2, -2)); TEST_REGEX_SEARCH(".*", perl, "abc", match_default, make_array(0, 3, -2, 3, 3, -2, -2));