From 74347b95af2d2d970a67649bdf3d344587da4565 Mon Sep 17 00:00:00 2001 From: jzmaddock Date: Fri, 8 Oct 2021 14:51:01 +0100 Subject: [PATCH] Allow back references to refer to a capture that hasn't happened yet ("forward reference"). Fixes https://github.com/boostorg/regex/issues/133. --- doc/syntax_perl.qbk | 6 ++- .../boost/regex/v5/basic_regex_creator.hpp | 1 - include/boost/regex/v5/basic_regex_parser.hpp | 27 +++++----- include/boost/regex/v5/indexed_bit_flag.hpp | 54 ------------------- test/regress/test_backrefs.cpp | 14 +++-- 5 files changed, 29 insertions(+), 73 deletions(-) delete mode 100644 include/boost/regex/v5/indexed_bit_flag.hpp diff --git a/doc/syntax_perl.qbk b/doc/syntax_perl.qbk index 233db3cf..6d062f46 100644 --- a/doc/syntax_perl.qbk +++ b/doc/syntax_perl.qbk @@ -195,7 +195,11 @@ You can also use the \g escape for the same function, for example: [[[^\g{one}]][Match whatever matched the sub-expression named "one"]] ] -Finally the \k escape can be used to refer to named subexpressions, for example [^\k] will match +Note that a back reference can also be a forward-reference to a sub-expression that has not yet +been seen - this only really makes sense within a repeat, so for example `(\2two|(one))+` will +match "oneonetwo". + +Finally the \k escape can be used to refer to named subexpressions, for example [^\k] will match whatever matched the subexpression named "two". 
[h4 Alternation] diff --git a/include/boost/regex/v5/basic_regex_creator.hpp b/include/boost/regex/v5/basic_regex_creator.hpp index ddd2f257..1dc7f046 100644 --- a/include/boost/regex/v5/basic_regex_creator.hpp +++ b/include/boost/regex/v5/basic_regex_creator.hpp @@ -232,7 +232,6 @@ protected: bool m_icase; // true for case insensitive matches unsigned m_repeater_id; // the state_id of the next repeater bool m_has_backrefs; // true if there are actually any backrefs - indexed_bit_flag m_backrefs; // bitmask of permitted backrefs std::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for; bool m_has_recursions; // set when we have recursive expressions to fixup std::vector m_recursion_checks; // notes which recursions we've followed while analysing this expression diff --git a/include/boost/regex/v5/basic_regex_parser.hpp b/include/boost/regex/v5/basic_regex_parser.hpp index 11d581d9..ec9279c0 100644 --- a/include/boost/regex/v5/basic_regex_parser.hpp +++ b/include/boost/regex/v5/basic_regex_parser.hpp @@ -98,6 +98,7 @@ private: std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative bool m_has_case_change; // true if somewhere in the current block the case has changed unsigned m_recursion_count; // How many times we've called parse_all. + unsigned m_max_backref; // Largest index of any backref. #if defined(BOOST_REGEX_MSVC) && defined(_M_IX86) // This is an ugly warning suppression workaround (for warnings *inside* std::vector // that can not otherwise be suppressed)... 
@@ -114,7 +115,7 @@ private: template basic_regex_parser::basic_regex_parser(regex_data* data) : basic_regex_creator(data), m_parser_proc(), m_base(0), m_end(0), m_position(0), - m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false), m_recursion_count(0) + m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false), m_recursion_count(0), m_max_backref(0) { } @@ -184,6 +185,13 @@ void basic_regex_parser::parse(const charT* p1, const charT* p2, return; // fill in our sub-expression count: this->m_pdata->m_mark_count = 1u + (std::size_t)m_mark_count; + // + // Check we don't have backreferences to sub-expressions which don't exist: + // + if (m_max_backref > m_mark_count) + { + fail(regex_constants::error_backref, std::distance(m_base, m_position), "Found a backreference to a non-existant sub-expression."); + } this->finalize(p1, p2); } @@ -529,11 +537,6 @@ bool basic_regex_parser::parse_open_paren() // restore the alternate insertion point: // this->m_alt_insert_point = last_alt_point; - // - // allow backrefs to this mark: - // - if(markid > 0) - this->m_backrefs.set(markid); return true; } @@ -899,12 +902,14 @@ escape_type_class_jump: } if(negative) i = 1 + (static_cast(m_mark_count) - i); - if(((i < hash_value_mask) && (i > 0) && (this->m_backrefs.test((std::size_t)i))) || ((i >= hash_value_mask) && (this->m_pdata->get_id((int)i) > 0) && (this->m_backrefs.test(this->m_pdata->get_id((int)i))))) + if(((i < hash_value_mask) && (i > 0)) || ((i >= hash_value_mask) && (this->m_pdata->get_id((int)i) > 0))) { m_position = pc; re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); pb->index = (int)i; pb->icase = this->flags() & regbase::icase; + if ((i > m_max_backref) && (i < hash_value_mask)) + m_max_backref = i; } else { @@ -1934,12 +1939,14 @@ bool basic_regex_parser::parse_backref() charT c = unescape_character(); 
this->append_literal(c); } - else if((i > 0) && (this->m_backrefs.test((std::size_t)i))) + else if((i > 0)) { m_position = pc; re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); pb->index = (int)i; pb->icase = this->flags() & regbase::icase; + if(i > m_max_backref) + m_max_backref = i; } else { @@ -2695,10 +2702,6 @@ option_group_jump: { if(this->flags() & regbase::save_subexpression_location) this->m_pdata->m_subs.at((std::size_t)markid - 1).second = std::distance(m_base, m_position) - 1; - // - // allow backrefs to this mark: - // - this->m_backrefs.set(markid); } return true; } diff --git a/include/boost/regex/v5/indexed_bit_flag.hpp b/include/boost/regex/v5/indexed_bit_flag.hpp deleted file mode 100644 index b61e5cad..00000000 --- a/include/boost/regex/v5/indexed_bit_flag.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * - * Copyright (c) 2020 - * John Maddock - * - * Use, modification and distribution are subject to the - * Boost Software License, Version 1.0. (See accompanying file - * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - * - */ - - /* - * LOCATION: see http://www.boost.org for most recent version. - * FILE basic_regex_parser.cpp - * VERSION see - * DESCRIPTION: Declares template class basic_regex_parser. - */ - -#include -#include - -#ifndef BOOST_REGEX_V5_INDEXED_BIT_FLAG_HPP -#define BOOST_REGEX_V5_INDEXED_BIT_FLAG_HPP - -namespace boost{ -namespace BOOST_REGEX_DETAIL_NS{ - -class indexed_bit_flag -{ - std::uint64_t low_mask; - std::set mask_set; -public: - indexed_bit_flag() : low_mask(0) {} - void set(std::size_t i) - { - if (i < std::numeric_limits::digits - 1) - low_mask |= static_cast(1u) << i; - else - mask_set.insert(i); - } - bool test(std::size_t i) - { - if (i < std::numeric_limits::digits - 1) - return low_mask & static_cast(1u) << i ? 
true : false; - else - return mask_set.find(i) != mask_set.end(); - } -}; - -} // namespace BOOST_REGEX_DETAIL_NS -} // namespace boost - - -#endif diff --git a/test/regress/test_backrefs.cpp b/test/regress/test_backrefs.cpp index be9f54ca..20a77838 100644 --- a/test/regress/test_backrefs.cpp +++ b/test/regress/test_backrefs.cpp @@ -19,13 +19,15 @@ void test_backrefs() { using namespace boost::regex_constants; TEST_INVALID_REGEX("a(b)\\2c", perl); - TEST_INVALID_REGEX("a(b\\1)c", perl); + //TEST_INVALID_REGEX("a(b\\1)c", perl); TEST_REGEX_SEARCH("a(b*)c\\1d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); TEST_REGEX_SEARCH("a(b*)c\\1d", perl, "abbcbd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a(b*)c\\1d", perl, "abbcbbbd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("^(.)\\1", perl, "abc", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a([bc])\\1d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); TEST_REGEX_SEARCH("a\\([bc]\\)\\1d", basic, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); + TEST_REGEX_SEARCH("(\\2two|(one))+", perl, "oneonetwo", match_default, make_array(0, 9, 3, 9, 0, 3, -2, -2)); + TEST_INVALID_REGEX("(\\3two|(one))+", perl); // strictly speaking this is at best ambiguous, at worst wrong, this is what most // re implimentations will match though. 
TEST_REGEX_SEARCH("a(([bc])\\2)*d", perl, "abbccd", match_default, make_array(0, 6, 3, 5, 3, 4, -2, -2)); @@ -59,7 +61,7 @@ void test_backrefs() // Now test the \g version: // TEST_INVALID_REGEX("a(b)\\g2c", perl); - TEST_INVALID_REGEX("a(b\\g1)c", perl); + //TEST_INVALID_REGEX("a(b\\g1)c", perl); TEST_INVALID_REGEX("a(b\\g0)c", perl); TEST_REGEX_SEARCH("a(b*)c\\g1d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); TEST_REGEX_SEARCH("a(b*)c\\g1d", perl, "abbcbd", match_default, make_array(-2, -2)); @@ -67,8 +69,10 @@ void test_backrefs() TEST_REGEX_SEARCH("^(.)\\g1", perl, "abc", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a([bc])\\g1d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); TEST_INVALID_REGEX("a(b)\\g{2}c", perl); - TEST_INVALID_REGEX("a(b\\g{1})c", perl); + //TEST_INVALID_REGEX("a(b\\g{1})c", perl); TEST_INVALID_REGEX("a(b\\g{0})c", perl); + TEST_REGEX_SEARCH("(\\g{2}two|(one))+", perl, "oneonetwo", match_default, make_array(0, 9, 3, 9, 0, 3, -2, -2)); + TEST_INVALID_REGEX("(\\g{3}two|(one))+", perl); TEST_REGEX_SEARCH("a(b*)c\\g{1}d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); TEST_REGEX_SEARCH("a(b*)c\\g{1}d", perl, "abbcbd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a(b*)c\\g{1}d", perl, "abbcbbbd", match_default, make_array(-2, -2)); @@ -76,7 +80,7 @@ void test_backrefs() TEST_REGEX_SEARCH("a([bc])\\g{1}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); // And again but with negative indexes: TEST_INVALID_REGEX("a(b)\\g-2c", perl); - TEST_INVALID_REGEX("a(b\\g-1)c", perl); + //TEST_INVALID_REGEX("a(b\\g-1)c", perl); TEST_INVALID_REGEX("a(b\\g-0)c", perl); TEST_REGEX_SEARCH("a(b*)c\\g-1d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); TEST_REGEX_SEARCH("a(b*)c\\g-1d", perl, "abbcbd", match_default, make_array(-2, -2)); @@ -84,7 +88,7 @@ void test_backrefs() TEST_REGEX_SEARCH("^(.)\\g1", perl, "abc", match_default, make_array(-2, 
-2)); TEST_REGEX_SEARCH("a([bc])\\g1d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); TEST_INVALID_REGEX("a(b)\\g{-2}c", perl); - TEST_INVALID_REGEX("a(b\\g{-1})c", perl); + //TEST_INVALID_REGEX("a(b\\g{-1})c", perl); TEST_REGEX_SEARCH("a(b*)c\\g{-1}d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); TEST_REGEX_SEARCH("a(b*)c\\g{-1}d", perl, "abbcbd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a(b*)c\\g{-1}d", perl, "abbcbbbd", match_default, make_array(-2, -2));