From 5a6bc29d7ccd68af63fcf2ad0bc6c2554fd889ce Mon Sep 17 00:00:00 2001 From: John Maddock Date: Fri, 17 Jul 2009 10:23:50 +0000 Subject: [PATCH] Added initial support for recursive expressions. Updated docs and tests accordingly. [SVN r54994] --- .../background_information/examples.html | 6 +- .../background_information/history.html | 12 +- .../background_information/locale.html | 8 +- .../background_information/standards.html | 10 +- doc/html/boost_regex/captures.html | 6 +- .../format/boost_format_syntax.html | 8 +- doc/html/boost_regex/install.html | 16 +- doc/html/boost_regex/ref/bad_expression.html | 4 +- doc/html/boost_regex/ref/basic_regex.html | 18 +- .../ref/concepts/traits_concept.html | 4 +- .../deprecated_interfaces/regex_format.html | 2 +- doc/html/boost_regex/ref/error_type.html | 4 +- doc/html/boost_regex/ref/match_flag_type.html | 2 +- doc/html/boost_regex/ref/match_results.html | 4 +- .../ref/non_std_strings/icu/unicode_algo.html | 6 +- .../ref/non_std_strings/icu/unicode_iter.html | 4 +- .../non_std_strings/mfc_strings/mfc_algo.html | 10 +- .../non_std_strings/mfc_strings/mfc_iter.html | 4 +- doc/html/boost_regex/ref/posix.html | 8 +- doc/html/boost_regex/ref/regex_iterator.html | 4 +- doc/html/boost_regex/ref/regex_match.html | 4 +- doc/html/boost_regex/ref/regex_replace.html | 4 +- doc/html/boost_regex/ref/regex_search.html | 4 +- .../boost_regex/ref/regex_token_iterator.html | 4 +- doc/html/boost_regex/ref/regex_traits.html | 2 +- doc/html/boost_regex/ref/sub_match.html | 8 +- .../boost_regex/syntax/basic_extended.html | 66 ++--- doc/html/boost_regex/syntax/basic_syntax.html | 44 ++-- doc/html/boost_regex/syntax/perl_syntax.html | 147 +++++++---- doc/html/boost_regex/unicode.html | 4 +- doc/html/index.html | 4 +- doc/syntax_perl.qbk | 27 ++- include/boost/regex/v4/basic_regex.hpp | 1 + .../boost/regex/v4/basic_regex_creator.hpp | 78 +++++- include/boost/regex/v4/basic_regex_parser.hpp | 113 ++++++++- include/boost/regex/v4/perl_matcher.hpp | 26 +- .../boost/regex/v4/perl_matcher_common.hpp | 65 ++--- .../regex/v4/perl_matcher_non_recursive.hpp | 228 +++++++++++++++++- .../boost/regex/v4/perl_matcher_recursive.hpp | 125 +++++++++- include/boost/regex/v4/states.hpp | 5 +- test/regress/main.cpp | 1 + test/regress/test.hpp | 1 + test/regress/test_non_greedy_repeats.cpp | 1 - test/regress/test_perl_ex.cpp | 175 +++++++++++++- 44 files changed, 1013 insertions(+), 264 deletions(-) diff --git a/doc/html/boost_regex/background_information/examples.html b/doc/html/boost_regex/background_information/examples.html index 24db4e92..dd6786c5 100644 --- a/doc/html/boost_regex/background_information/examples.html +++ b/doc/html/boost_regex/background_information/examples.html @@ -28,7 +28,7 @@ Example Programs
- + Test Programs
@@ -107,7 +107,7 @@ Files: captures_test.cpp.

- + Example programs
@@ -133,7 +133,7 @@ Files: regex_timer.cpp.

- + Code snippets
diff --git a/doc/html/boost_regex/background_information/history.html b/doc/html/boost_regex/background_information/history.html index ed503d51..f04aecc4 100644 --- a/doc/html/boost_regex/background_information/history.html +++ b/doc/html/boost_regex/background_information/history.html @@ -26,7 +26,7 @@ History
- + Boost 1.38
@@ -53,7 +53,7 @@
- + Boost 1.34
@@ -76,7 +76,7 @@
- + Boost 1.33.1
@@ -146,7 +146,7 @@
- + Boost 1.33.0
@@ -201,7 +201,7 @@
- + Boost 1.32.1
@@ -209,7 +209,7 @@ Fixed bug in partial matches of bounded repeats of '.'.
- + Boost 1.31.0
diff --git a/doc/html/boost_regex/background_information/locale.html b/doc/html/boost_regex/background_information/locale.html index 5c9a1bdd..d6af3988 100644 --- a/doc/html/boost_regex/background_information/locale.html +++ b/doc/html/boost_regex/background_information/locale.html @@ -58,7 +58,7 @@ There are three separate localization mechanisms supported by Boost.Regex:

- + Win32 localization model.
@@ -90,7 +90,7 @@ are treated as "unknown" graphic characters.

- + C localization model.
@@ -114,7 +114,7 @@ libraries including version 1 of this library.

- + C++ localization model.
@@ -151,7 +151,7 @@ in your code. The best way to ensure this is to add the #define to <boost/regex/user.hpp>.

- + Providing a message catalogue
diff --git a/doc/html/boost_regex/background_information/standards.html b/doc/html/boost_regex/background_information/standards.html index 95cb80a2..115e5ce3 100644 --- a/doc/html/boost_regex/background_information/standards.html +++ b/doc/html/boost_regex/background_information/standards.html @@ -28,7 +28,7 @@ Conformance
- + C++

@@ -36,7 +36,7 @@ Report on C++ Library Extensions.

- + ECMAScript / JavaScript
@@ -49,7 +49,7 @@ rather than a Unicode escape sequence; use \x{DDDD} for Unicode escape sequences.

- + Perl

@@ -62,7 +62,7 @@ (??{code}) Not implementable in a compiled strongly typed language.

- + POSIX

@@ -82,7 +82,7 @@ a custom traits class.

- + Unicode

diff --git a/doc/html/boost_regex/captures.html b/doc/html/boost_regex/captures.html index 613c1b0a..1b9c755b 100644 --- a/doc/html/boost_regex/captures.html +++ b/doc/html/boost_regex/captures.html @@ -35,7 +35,7 @@ accessed.

- + Marked sub-expressions

@@ -218,7 +218,7 @@ output stream.

- + Unmatched Sub-Expressions

@@ -231,7 +231,7 @@ you can determine which sub-expressions matched by accessing the sub_match::matched data member.

- + Repeated Captures

diff --git a/doc/html/boost_regex/format/boost_format_syntax.html b/doc/html/boost_regex/format/boost_format_syntax.html index a98f3691..a927f24b 100644 --- a/doc/html/boost_regex/format/boost_format_syntax.html +++ b/doc/html/boost_regex/format/boost_format_syntax.html @@ -32,7 +32,7 @@ '$', '\', '(', ')', '?', and ':'.

- + Grouping

@@ -40,7 +40,7 @@ you want a to output literal parenthesis.

- + Conditionals

@@ -79,7 +79,7 @@ ?{NAME}true-expression:false-expression

- + Placeholder Sequences
@@ -319,7 +319,7 @@ as a literal.

- + Escape Sequences
diff --git a/doc/html/boost_regex/install.html b/doc/html/boost_regex/install.html index dbb05cd2..ac178cc9 100644 --- a/doc/html/boost_regex/install.html +++ b/doc/html/boost_regex/install.html @@ -49,7 +49,7 @@ file before you can use it, instructions for specific platforms are as follows:

- + Building with bjam

@@ -58,7 +58,7 @@ started guide for more information.

- + Building With Unicode and ICU Support
@@ -96,11 +96,11 @@ ICU you are using is binary compatible with the toolset you use to build Boost.

- + Building via makefiles
- + Borland C++ Builder:
- + GCC(2.95 and later)

@@ -302,7 +302,7 @@ see the config library documentation.

- + Sun Workshop 6.1

@@ -347,7 +347,7 @@ will build v9 variants of the regex library named libboost_regex_v9.a etc.

- + Makefiles for Other compilers
diff --git a/doc/html/boost_regex/ref/bad_expression.html b/doc/html/boost_regex/ref/bad_expression.html index a5fe7ddf..6ab5beae 100644 --- a/doc/html/boost_regex/ref/bad_expression.html +++ b/doc/html/boost_regex/ref/bad_expression.html @@ -27,7 +27,7 @@ bad_expression
- + Synopsis
#include <boost/pattern_except.hpp>
@@ -54,7 +54,7 @@
 } // namespace boost
 
- + Description
regex_error(const std::string& s, regex_constants::error_type err, std::ptrdiff_t pos);
diff --git a/doc/html/boost_regex/ref/basic_regex.html b/doc/html/boost_regex/ref/basic_regex.html
index dabf17a1..eb7cb84c 100644
--- a/doc/html/boost_regex/ref/basic_regex.html
+++ b/doc/html/boost_regex/ref/basic_regex.html
@@ -27,7 +27,7 @@
  basic_regex
 
 
- + Synopsis
#include <boost/regex.hpp>
@@ -244,7 +244,7 @@
 } // namespace boost
 
- + Description

@@ -327,7 +327,7 @@ basic_regex.

-

Table 1. basic_regex default construction postconditions

+

Table 1. basic_regex default construction postconditions

@@ -407,7 +407,7 @@ flags specified in f.

-

Table 2. Postconditions for basic_regex construction

+

Table 2. Postconditions for basic_regex construction

@@ -512,7 +512,7 @@ specified in f.

-

Table 3. Postconditions for basic_regex construction

+

Table 3. Postconditions for basic_regex construction

@@ -616,7 +616,7 @@ according the option flags specified in f.

-

Table 4. Postconditions for basic_regex construction

+

Table 4. Postconditions for basic_regex construction

@@ -727,7 +727,7 @@ flags specified in f.

-

Table 5. Postconditions for basic_regex construction

+

Table 5. Postconditions for basic_regex construction

@@ -829,7 +829,7 @@ flags specified in f.

-

Table 6. Postconditions for basic_regex construction

+

Table 6. Postconditions for basic_regex construction

@@ -1043,7 +1043,7 @@ in f.

-

Table 7. Postconditions for basic_regex::assign

+

Table 7. Postconditions for basic_regex::assign

diff --git a/doc/html/boost_regex/ref/concepts/traits_concept.html b/doc/html/boost_regex/ref/concepts/traits_concept.html index c517de45..1dcd410c 100644 --- a/doc/html/boost_regex/ref/concepts/traits_concept.html +++ b/doc/html/boost_regex/ref/concepts/traits_concept.html @@ -34,7 +34,7 @@ Boost-specific enhanced interface.

- + Minimal requirements.
@@ -381,7 +381,7 @@
- + Additional Optional Requirements
diff --git a/doc/html/boost_regex/ref/deprecated_interfaces/regex_format.html b/doc/html/boost_regex/ref/deprecated_interfaces/regex_format.html index 7c3b91e6..fab2dea1 100644 --- a/doc/html/boost_regex/ref/deprecated_interfaces/regex_format.html +++ b/doc/html/boost_regex/ref/deprecated_interfaces/regex_format.html @@ -34,7 +34,7 @@ previous version of Boost.Regex and will not be further updated:

- + Algorithm regex_format
diff --git a/doc/html/boost_regex/ref/error_type.html b/doc/html/boost_regex/ref/error_type.html index 260f3c72..26ce70d1 100644 --- a/doc/html/boost_regex/ref/error_type.html +++ b/doc/html/boost_regex/ref/error_type.html @@ -27,7 +27,7 @@ error_type
- + Synopsis

@@ -57,7 +57,7 @@ } // namespace boost

- + Description

diff --git a/doc/html/boost_regex/ref/match_flag_type.html b/doc/html/boost_regex/ref/match_flag_type.html index a7152b4d..941bdcb9 100644 --- a/doc/html/boost_regex/ref/match_flag_type.html +++ b/doc/html/boost_regex/ref/match_flag_type.html @@ -69,7 +69,7 @@ } // namespace boost

- + Description

diff --git a/doc/html/boost_regex/ref/match_results.html b/doc/html/boost_regex/ref/match_results.html index 8d065dae..2d1c346c 100644 --- a/doc/html/boost_regex/ref/match_results.html +++ b/doc/html/boost_regex/ref/match_results.html @@ -27,7 +27,7 @@ match_results

- + Synopsis
#include <boost/regex.hpp>
@@ -166,7 +166,7 @@
          match_results<BidirectionalIterator, Allocator>& m2);
 
- + Description

diff --git a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html index 0a7705fc..8a5e4325 100644 --- a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html +++ b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html @@ -43,7 +43,7 @@ on to the "real" algorithm.

- + u32regex_match

@@ -89,7 +89,7 @@ }

- + u32regex_search

@@ -128,7 +128,7 @@ }

- + u32regex_replace

diff --git a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_iter.html b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_iter.html index b179c8e4..f1fd4fed 100644 --- a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_iter.html +++ b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_iter.html @@ -28,7 +28,7 @@ Unicode Aware Regex Iterators

- + u32regex_iterator

@@ -126,7 +126,7 @@ Provided of course that the input is encoded as UTF-8.

- + u32regex_token_iterator

diff --git a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_algo.html b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_algo.html index d62ab666..4a6b3297 100644 --- a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_algo.html +++ b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_algo.html @@ -34,7 +34,7 @@ here they are anyway:

- + regex_match

@@ -82,7 +82,7 @@ }

- + regex_match (second overload)
@@ -110,7 +110,7 @@ }
- + regex_search

@@ -149,7 +149,7 @@ }

- + regex_search (second overload)
@@ -164,7 +164,7 @@ + s.GetLength(), e, f);

- + regex_replace

diff --git a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_iter.html b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_iter.html index 74043549..a2df7d3d 100644 --- a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_iter.html +++ b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_iter.html @@ -32,7 +32,7 @@ an MFC/ATL string to a regex_iterator or regex_token_iterator:

- + regex_iterator creation helper
@@ -68,7 +68,7 @@ }
- + regex_token_iterator creation helpers
diff --git a/doc/html/boost_regex/ref/posix.html b/doc/html/boost_regex/ref/posix.html index a7456c78..d8b73499 100644 --- a/doc/html/boost_regex/ref/posix.html +++ b/doc/html/boost_regex/ref/posix.html @@ -165,7 +165,7 @@

- + regcomp

@@ -379,7 +379,7 @@

- + regerror

@@ -467,7 +467,7 @@

- + regexec

@@ -537,7 +537,7 @@

- + regfree

diff --git a/doc/html/boost_regex/ref/regex_iterator.html b/doc/html/boost_regex/ref/regex_iterator.html index 33a69671..9930174b 100644 --- a/doc/html/boost_regex/ref/regex_iterator.html +++ b/doc/html/boost_regex/ref/regex_iterator.html @@ -78,7 +78,7 @@ regex_constants::match_flag_type m = regex_constants::match_default);

- + Description

@@ -436,7 +436,7 @@ m.

- + Examples

diff --git a/doc/html/boost_regex/ref/regex_match.html b/doc/html/boost_regex/ref/regex_match.html index 7647ea87..5f432601 100644 --- a/doc/html/boost_regex/ref/regex_match.html +++ b/doc/html/boost_regex/ref/regex_match.html @@ -80,7 +80,7 @@ match_flag_type flags = match_default);

- + Description
template <class BidirectionalIterator, class Allocator, class charT, class traits>
@@ -360,7 +360,7 @@
         Effects: Returns the result of regex_match(s.begin(), s.end(), e, flags).
       

- + Examples

diff --git a/doc/html/boost_regex/ref/regex_replace.html b/doc/html/boost_regex/ref/regex_replace.html index 2e4551f9..650c1178 100644 --- a/doc/html/boost_regex/ref/regex_replace.html +++ b/doc/html/boost_regex/ref/regex_replace.html @@ -53,7 +53,7 @@ match_flag_type flags = match_default);

- + Description
template <class OutputIterator, class BidirectionalIterator, class traits, class charT>
@@ -163,7 +163,7 @@
         and then returns result.
       

- + Examples

diff --git a/doc/html/boost_regex/ref/regex_search.html b/doc/html/boost_regex/ref/regex_search.html index 4017b942..282023dd 100644 --- a/doc/html/boost_regex/ref/regex_search.html +++ b/doc/html/boost_regex/ref/regex_search.html @@ -73,7 +73,7 @@ match_flag_type flags = match_default);

- + Description
template <class BidirectionalIterator, class Allocator, class charT, class traits>
@@ -355,7 +355,7 @@
         Effects: Returns the result of regex_search(s.begin(), s.end(), e, flags).
       

- + Examples

diff --git a/doc/html/boost_regex/ref/regex_token_iterator.html b/doc/html/boost_regex/ref/regex_token_iterator.html index d837fb03..864905fd 100644 --- a/doc/html/boost_regex/ref/regex_token_iterator.html +++ b/doc/html/boost_regex/ref/regex_token_iterator.html @@ -136,7 +136,7 @@ regex_constants::match_flag_type m = regex_constants::match_default);

- + Description

@@ -383,7 +383,7 @@ m.

- + Examples

diff --git a/doc/html/boost_regex/ref/regex_traits.html b/doc/html/boost_regex/ref/regex_traits.html index 43fc043e..eeb38e8e 100644 --- a/doc/html/boost_regex/ref/regex_traits.html +++ b/doc/html/boost_regex/ref/regex_traits.html @@ -46,7 +46,7 @@ } // namespace boost

- + Description

diff --git a/doc/html/boost_regex/ref/sub_match.html b/doc/html/boost_regex/ref/sub_match.html index 4360bb0d..becc2a20 100644 --- a/doc/html/boost_regex/ref/sub_match.html +++ b/doc/html/boost_regex/ref/sub_match.html @@ -329,11 +329,11 @@ } // namespace boost

- + Description
- + Members

@@ -473,7 +473,7 @@

- + sub_match non-member operators
@@ -1008,7 +1008,7 @@ + m2.str().

- + Stream inserter

diff --git a/doc/html/boost_regex/syntax/basic_extended.html b/doc/html/boost_regex/syntax/basic_extended.html index afaca7c9..787c411f 100644 --- a/doc/html/boost_regex/syntax/basic_extended.html +++ b/doc/html/boost_regex/syntax/basic_extended.html @@ -28,7 +28,7 @@ Expression Syntax

- + Synopsis

@@ -46,7 +46,7 @@

- + POSIX Extended Syntax

@@ -56,7 +56,7 @@

.[{()\*+?|^$
- + Wildcard:

@@ -74,7 +74,7 @@

- + Anchors:

@@ -86,7 +86,7 @@ of an expression, or the last character of a sub-expression.

- + Marked sub-expressions:
@@ -98,7 +98,7 @@ to by a back-reference.

- + Repeats:

@@ -184,7 +184,7 @@ cab operator to be applied to.

- + Back references:

@@ -214,7 +214,7 @@ cab

- + Alternation

@@ -227,7 +227,7 @@ cab will match either of "abd" or "abef".

- + Character sets:
@@ -240,7 +240,7 @@ cab A bracket expression may contain any combination of the following:

- + Single characters:
@@ -249,7 +249,7 @@ cab or 'c'.

- + Character ranges:
@@ -265,7 +265,7 @@ cab the code points of the characters only.

- + Negation:

@@ -274,7 +274,7 @@ cab range a-c.

- + Character classes:
@@ -284,7 +284,7 @@ cab character class names.

- + Collating Elements:
@@ -312,7 +312,7 @@ cab matches a NUL character.

- + Equivalence classes:
@@ -329,7 +329,7 @@ cab or even all locales on one platform.

- + Combinations:

@@ -337,7 +337,7 @@ cab [[:digit:]a-c[.NUL.]].

- + Escapes

@@ -363,7 +363,7 @@ cab extensions are also supported by Boost.Regex:

- + Escapes matching a specific character
@@ -552,7 +552,7 @@ cab
- + "Single character" character classes:
@@ -706,7 +706,7 @@ cab
- + Character Properties
@@ -813,7 +813,7 @@ cab matches any "digit" character, as does \p{digit}.

- + Word Boundaries

@@ -888,7 +888,7 @@ cab

- + Buffer boundaries
@@ -979,7 +979,7 @@ cab
- + Continuation Escape
@@ -991,7 +991,7 @@ cab match to start where the last one ended.

- + Quoting escape
@@ -1005,7 +1005,7 @@ cab \*+aaa
- + Unicode escapes
@@ -1056,7 +1056,7 @@ cab
- + Any other escape
@@ -1065,7 +1065,7 @@ cab \@ matches a literal '@'.

- + Operator precedence
@@ -1101,7 +1101,7 @@ cab
- + What Gets Matched
@@ -1111,11 +1111,11 @@ cab rule.

- + Variations

- + Egrep

@@ -1136,7 +1136,7 @@ cab used with the -E option.

- + awk

@@ -1150,7 +1150,7 @@ cab these by default anyway.

- + Options

@@ -1163,7 +1163,7 @@ cab modify how the case and locale sensitivity are to be applied.

- + References

diff --git a/doc/html/boost_regex/syntax/basic_syntax.html b/doc/html/boost_regex/syntax/basic_syntax.html index 47bc62d0..eeb9ef51 100644 --- a/doc/html/boost_regex/syntax/basic_syntax.html +++ b/doc/html/boost_regex/syntax/basic_syntax.html @@ -28,7 +28,7 @@ Expression Syntax

- + Synopsis

@@ -45,7 +45,7 @@

- + POSIX Basic Syntax

@@ -55,7 +55,7 @@

.[\*^$
- + Wildcard:

@@ -73,7 +73,7 @@

- + Anchors:

@@ -85,7 +85,7 @@ of an expression, or the last character of a sub-expression.

- + Marked sub-expressions:
@@ -97,7 +97,7 @@ by a back-reference.

- + Repeats:

@@ -155,7 +155,7 @@ aaaa to.

- + Back references:

@@ -173,7 +173,7 @@ aaaa

aaabba
- + Character sets:
@@ -186,7 +186,7 @@ aaaa A bracket expression may contain any combination of the following:

- + Single characters:
@@ -195,7 +195,7 @@ aaaa or 'c'.

- + Character ranges:
@@ -211,7 +211,7 @@ aaaa of the characters only.

- + Negation:

@@ -220,7 +220,7 @@ aaaa range a-c.

- + Character classes:
@@ -230,7 +230,7 @@ aaaa character class names.

- + Collating Elements:
@@ -259,7 +259,7 @@ aaaa element names.

- + Equivalence classes:
@@ -276,7 +276,7 @@ aaaa or even all locales on one platform.

- + Combinations:

@@ -284,7 +284,7 @@ aaaa [[:digit:]a-c[.NUL.]].

- + Escapes

@@ -299,7 +299,7 @@ aaaa will match either a literal '\' or a '^'.

- + What Gets Matched

@@ -309,13 +309,13 @@ aaaa rule.

- + Variations

- + Grep

@@ -333,7 +333,7 @@ aaaa As its name suggests, this behavior is consistent with the Unix utility grep.

- + emacs

@@ -613,7 +613,7 @@ aaaa leftmost-longest rule.

- + Options

@@ -627,7 +627,7 @@ aaaa options modify how the case and locale sensitivity are to be applied.

- + References

diff --git a/doc/html/boost_regex/syntax/perl_syntax.html b/doc/html/boost_regex/syntax/perl_syntax.html index f257a5d0..46aa35e1 100644 --- a/doc/html/boost_regex/syntax/perl_syntax.html +++ b/doc/html/boost_regex/syntax/perl_syntax.html @@ -28,7 +28,7 @@ Syntax

- + Synopsis

@@ -43,7 +43,7 @@ boost::regex e2(my_expression, boost::regex::perl|boost::regex::icase);

- + Perl Regular Expression Syntax

@@ -53,7 +53,7 @@

.[{()\*+?|^$
- + Wildcard

@@ -73,7 +73,7 @@

- + Anchors

@@ -83,7 +83,7 @@ A '$' character shall match the end of a line.

- + Marked sub-expressions
@@ -94,7 +94,7 @@ can also repeated, or referred to by a back-reference.

- + Non-marking grouping
@@ -107,7 +107,7 @@ without splitting out any separate sub-expressions.

- + Repeats

@@ -188,7 +188,7 @@ to be applied to.

- + Non greedy repeats
@@ -218,7 +218,7 @@ while consuming as little input as possible.

- + Pocessive repeats
@@ -250,7 +250,7 @@ while giving nothing back.

- + Back references

@@ -360,7 +360,7 @@ named "two".

- + Alternation

@@ -387,7 +387,7 @@ (?:abc)?? has exactly the same effect.

- + Character sets

@@ -399,7 +399,7 @@ A bracket expression may contain any combination of the following:

- + Single characters

@@ -407,7 +407,7 @@ 'b', or 'c'.

- + Character ranges
@@ -421,7 +421,7 @@ sensitive.

- + Negation

@@ -430,7 +430,7 @@ matches any character that is not in the range a-c.

- + Character classes
@@ -441,7 +441,7 @@ class names.

- + Collating Elements
@@ -463,7 +463,7 @@ matches a \0 character.

- + Equivalence classes
@@ -480,7 +480,7 @@ or even all locales on one platform.

- + Escaped Characters
@@ -492,7 +492,7 @@ is not a "word" character.

- + Combinations

@@ -500,7 +500,7 @@ [[:digit:]a-c[.NUL.]].

- + Escapes

@@ -692,7 +692,7 @@

- + "Single character" character classes:
@@ -894,7 +894,7 @@
- + Character Properties
@@ -1002,7 +1002,7 @@ as does \p{digit}.

- + Word Boundaries

@@ -1021,7 +1021,7 @@ \B Matches only when not at a word boundary.

- + Buffer boundaries

@@ -1046,7 +1046,7 @@ to the regular expression \n*\z

- + Continuation Escape
@@ -1058,7 +1058,7 @@ one ended.

- + Quoting escape

@@ -1071,7 +1071,7 @@ \*+aaa

- + Unicode escapes

@@ -1081,7 +1081,7 @@ followed by a sequence of zero or more combining characters.

- + Matching Line Endings
@@ -1090,7 +1090,7 @@ sequence, specifically it is identical to the expression (?>\x0D\x0A?|[\x0A-\x0C\x85\x{2028}\x{2029}]).

- + Keeping back some text
@@ -1105,7 +1105,7 @@ This can be used to simulate variable width lookbehind assertions.

- + Any other escape
@@ -1114,7 +1114,7 @@ \@ matches a literal '@'.

- + Perl Extended Patterns
@@ -1123,7 +1123,7 @@ (?.

- + Named Subexpressions
@@ -1145,14 +1145,14 @@ format string for search and replace operations, or in the match_results member functions.

- + Comments

(?# ... ) is treated as a comment, it's contents are ignored.

- + Modifiers

@@ -1166,7 +1166,7 @@ pattern only.

- + Non-marking groups
@@ -1175,7 +1175,7 @@ an additional sub-expression.

- + Branch reset

@@ -1197,7 +1197,7 @@ # 1 2 2 3 2 3 4

- + Lookahead

@@ -1220,7 +1220,7 @@ could be used to validate the password.

- + Lookbehind

@@ -1234,7 +1234,7 @@ (pattern must be of fixed length).

- + Independent sub-expressions
@@ -1246,8 +1246,32 @@ be considered, if this doesn't allow the expression as a whole to match then no match is found at all.

+
+ + Recursive + Expressions +
+

+ (?N) (?-N) (?+N) + (?R) (?0) +

+

+ (?R) and (?0) recurse to the start + of the entire pattern. +

+

+ (?N) executes sub-expression N + recursively, for example (?2) will recurse to sub-expression + 2. +

+

+ (?-N) and (?+N) + are relative recursions, so for example (?-1) recurses + to the last sub-expression to be declared, and (?+1) recurses + to the next sub-expression to be declared. +

- + Conditional Expressions
@@ -1261,12 +1285,35 @@ if the condition is true, otherwise fails.

- condition may be either a forward lookahead assert, - or the index of a marked sub-expression (the condition becomes true if the - sub-expression has been matched). + condition may be either: a forward lookahead assert, + the index of a marked sub-expression (the condition becomes true if the sub-expression + has been matched), or an index of a recursion (the condition become true + if we are executing directly inside the specified recursion).

+

+ Here is a summary of the possible predicates: +

+
- + Operator precedence
@@ -1301,7 +1348,7 @@

- + What gets matched

@@ -1476,7 +1523,7 @@

- + Variations

@@ -1485,7 +1532,7 @@ and JScript are all synonyms for perl.

- + Options

@@ -1497,7 +1544,7 @@ are to be applied.

- + Pattern Modifiers

@@ -1509,7 +1556,7 @@ and no_mod_s.

- + References

diff --git a/doc/html/boost_regex/unicode.html b/doc/html/boost_regex/unicode.html index 84b3eb20..5abbf2a0 100644 --- a/doc/html/boost_regex/unicode.html +++ b/doc/html/boost_regex/unicode.html @@ -30,7 +30,7 @@ There are two ways to use Boost.Regex with Unicode strings:

- + Rely on wchar_t

@@ -56,7 +56,7 @@

- + Use a Unicode Aware Regular Expression Type.
diff --git a/doc/html/index.html b/doc/html/index.html index 0e5fb761..51da76e4 100644 --- a/doc/html/index.html +++ b/doc/html/index.html @@ -28,7 +28,7 @@
-

+

Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

@@ -196,7 +196,7 @@

- +

Last revised: May 13, 2009 at 09:14:16 GMT

Last revised: July 16, 2009 at 15:54:11 GMT


diff --git a/doc/syntax_perl.qbk b/doc/syntax_perl.qbk index 93707e2b..d97eb989 100644 --- a/doc/syntax_perl.qbk +++ b/doc/syntax_perl.qbk @@ -511,6 +511,17 @@ are typically used to improve performance; only the best possible match for pattern will be considered, if this doesn't allow the expression as a whole to match then no match is found at all. +[h5 Recursive Expressions] + +[^(?['N]) (?-['N]) (?+['N]) (?R) (?0)] + +=(?R)= and =(?0)= recurse to the start of the entire pattern. + +[^(?['N])] executes sub-expression /N/ recursively, for example =(?2)= will recurse to sub-expression 2. + +[^(?-['N])] and [^(?+['N])] are relative recursions, so for example =(?-1)= recurses to the last sub-expression to be declared, +and =(?+1)= recurses to the next sub-expression to be declared. + [h5 Conditional Expressions] =(?(condition)yes-pattern|no-pattern)= attempts to match /yes-pattern/ if @@ -519,9 +530,21 @@ the /condition/ is true, otherwise attempts to match /no-pattern/. =(?(condition)yes-pattern)= attempts to match /yes-pattern/ if the /condition/ is true, otherwise fails. -/condition/ may be either a forward lookahead assert, or the index of +/condition/ may be either: a forward lookahead assert, the index of a marked sub-expression (the condition becomes true if the sub-expression -has been matched). +has been matched), or an index of a recursion (the condition become true if we are executing +directly inside the specified recursion). + +Here is a summary of the possible predicates: + +* [^(?(?\=assert)yes-pattern|no-pattern)] Executes /yes-pattern/ if the forward look-ahead assert matches, otherwise +executes /no-pattern/. +* =(?(?!assert)yes-pattern|no-pattern)= Executes /yes-pattern/ if the forward look-ahead assert does not match, otherwise +executes /no-pattern/. +* =(?(R)yes-pattern|no-pattern)= Executes /yes-pattern/ if we are executing inside a recursion, otherwise +executes /no-pattern/. +* [^(?(R['N])yes-pattern|no-pattern)] Executes /yes-pattern/ if we are executing inside a recursion to sub-expression /N/, otherwise +executes /no-pattern/. [h4 Operator precedence] diff --git a/include/boost/regex/v4/basic_regex.hpp b/include/boost/regex/v4/basic_regex.hpp index 14d19218..617d578e 100644 --- a/include/boost/regex/v4/basic_regex.hpp +++ b/include/boost/regex/v4/basic_regex.hpp @@ -194,6 +194,7 @@ struct regex_data : public named_subexpressions std::vector< std::pair< std::size_t, std::size_t> > m_subs; // Position of sub-expressions within the *string*. + bool m_has_recursions; // whether we have recursive expressions; }; // // class basic_regex_implementation diff --git a/include/boost/regex/v4/basic_regex_creator.hpp b/include/boost/regex/v4/basic_regex_creator.hpp index 9f2cbeec..d6a5c09b 100644 --- a/include/boost/regex/v4/basic_regex_creator.hpp +++ b/include/boost/regex/v4/basic_regex_creator.hpp @@ -240,6 +240,7 @@ protected: bool m_has_backrefs; // true if there are actually any backrefs unsigned m_backrefs; // bitmask of permitted backrefs boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for; + bool m_has_recursions; // set when we have recursive expresisons to fixup typename traits::char_class_type m_word_mask; // mask used to determine if a character is a word character typename traits::char_class_type m_mask_space; // mask used to determine if a character is a word character typename traits::char_class_type m_lower_mask; // mask used to determine if a character is a lowercase character @@ -250,6 +251,7 @@ private: basic_regex_creator(const basic_regex_creator&); void fixup_pointers(re_syntax_base* state); + void fixup_recursions(re_syntax_base* state); void create_startmaps(re_syntax_base* state); int calculate_backstep(re_syntax_base* state); void create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask); @@ -263,7 +265,7 @@ private: template basic_regex_creator::basic_regex_creator(regex_data* data) - : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0) + : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0), m_has_recursions(false) { m_pdata->m_data.clear(); m_pdata->m_status = ::boost::regex_constants::error_ok; @@ -692,6 +694,13 @@ void basic_regex_creator::finalize(const charT* p1, const charT* m_pdata->m_first_state = static_cast(m_pdata->m_data.data()); // fixup pointers in the machine: fixup_pointers(m_pdata->m_first_state); + if(m_has_recursions) + { + m_pdata->m_has_recursions = true; + fixup_recursions(m_pdata->m_first_state); + } + else + m_pdata->m_has_recursions = false; // create nested startmaps: create_startmaps(m_pdata->m_first_state); // create main startmap: @@ -713,6 +722,13 @@ void basic_regex_creator::fixup_pointers(re_syntax_base* state) { switch(state->type) { + case syntax_element_recurse: + m_has_recursions = true; + if(state->next.i) + state->next.p = getaddress(state->next.i, state); + else + state->next.p = 0; + break; case syntax_element_rep: case syntax_element_dot_rep: case syntax_element_char_rep: @@ -738,6 +754,65 @@ void basic_regex_creator::fixup_pointers(re_syntax_base* state) } } +template +void basic_regex_creator::fixup_recursions(re_syntax_base* state) +{ + re_syntax_base* base = state; + while(state) + { + switch(state->type) + { + case syntax_element_recurse: + { + bool ok = false; + re_syntax_base* p = base; + /* + if(static_cast(state)->alt.i == 0) + { + ok = true; + static_cast(state)->alt.p = p; + } + else + {*/ + while(p) + { + if((p->type == syntax_element_startmark) && (static_cast(p)->index == static_cast(state)->alt.i)) + { + static_cast(state)->alt.p = p; + ok = true; + break; + } + p = p->next.p; + } + //} + if(!ok) + { + // recursion to sub-expression that doesn't exist: + if(0 == this->m_pdata->m_status) // update the error code if not already set + this->m_pdata->m_status = boost::regex_constants::error_bad_pattern; + // + // clear the expression, we should be empty: + // + this->m_pdata->m_expression = 0; + this->m_pdata->m_expression_len = 0; + // + // and throw if required: + // + if(0 == (this->flags() & regex_constants::no_except)) + { + std::string message = this->m_pdata->m_ptraits->error_string(boost::regex_constants::error_bad_pattern); + boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0); + e.raise(); + } + } + } + default: + break; + } + state = state->next.p; + } +} + template void basic_regex_creator::create_startmaps(re_syntax_base* state) { @@ -953,6 +1028,7 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, create_startmap(state->next.p, 0, pnull, mask); return; } + case syntax_element_recurse: case syntax_element_backref: // can be null, and any character can match: if(pnull) diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index e36f91db..a9e77b3b 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -125,8 +125,16 @@ void basic_regex_parser::parse(const charT* p1, const charT* p2, switch(l_flags & regbase::main_option_type) { case regbase::perl_syntax_group: - m_parser_proc = &basic_regex_parser::parse_extended; - break; + { + m_parser_proc = &basic_regex_parser::parse_extended; + // + // Add a leading paren with index zero to give recursions a target: + // + re_brace* br = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); + br->index = 0; + br->icase = this->flags() & regbase::icase; + break; + } case regbase::basic_syntax_group: m_parser_proc = &basic_regex_parser::parse_basic; break; @@ -387,6 +395,7 @@ bool basic_regex_parser::parse_open_paren() } re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); pb->index = markid; + pb->icase = this->flags() & regbase::icase; std::ptrdiff_t last_paren_start = this->getoffset(pb); // back up insertion point for alternations, and set new point: std::ptrdiff_t last_alt_point = m_alt_insert_point; @@ -399,6 +408,11 @@ bool basic_regex_parser::parse_open_paren() bool old_case_change = m_has_case_change; m_has_case_change = false; // no changes to this scope as yet... // + // Back up branch reset data in case we have a nested (?|...) + // + int mark_reset = m_mark_reset; + m_mark_reset = -1; + // // now recursively add more states, this will terminate when we get to a // matching ')' : // @@ -423,6 +437,10 @@ bool basic_regex_parser::parse_open_paren() this->flags(opts); m_has_case_change = old_case_change; // + // restore branch reset: + // + m_mark_reset = mark_reset; + // // we either have a ')' or we have run out of characters prematurely: // if(m_position == m_end) @@ -444,6 +462,7 @@ bool basic_regex_parser::parse_open_paren() // pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); pb->index = markid; + pb->icase = this->flags() & regbase::icase; this->m_paren_start = last_paren_start; // // restore the alternate insertion point: @@ -729,6 +748,7 @@ escape_type_class_jump: { re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); pb->index = -5; + pb->icase = this->flags() & regbase::icase; this->m_pdata->m_data.align(); ++m_position; return true; @@ -795,6 +815,7 @@ escape_type_class_jump: m_position = pc; re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); pb->index = i; + pb->icase = this->flags() & regbase::icase; } else { @@ -946,11 +967,13 @@ bool basic_regex_parser::parse_repeat(std::size_t low, std::size_ { re_brace* pb = static_cast(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace))); pb->index = -3; + pb->icase = this->flags() & regbase::icase; re_jump* jmp = static_cast(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump))); this->m_pdata->m_data.align(); jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp); pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); pb->index = -3; + pb->icase = this->flags() & regbase::icase; } return true; } @@ -1706,6 +1729,7 @@ bool basic_regex_parser::parse_backref() m_position = pc; re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); pb->index = i; + pb->icase = this->flags() & regbase::icase; } else { @@ -1793,6 +1817,7 @@ bool basic_regex_parser::parse_perl_extension() int markid = 0; std::ptrdiff_t jump_offset = 0; re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); + pb->icase = this->flags() & regbase::icase; std::ptrdiff_t last_paren_start = this->getoffset(pb); // back up insertion point for alternations, and set new point: std::ptrdiff_t last_alt_point = m_alt_insert_point; @@ -1806,6 +1831,7 @@ bool basic_regex_parser::parse_perl_extension() charT name_delim; int mark_reset = m_mark_reset; m_mark_reset = -1; + int v; // // select the actual extension used: // @@ -1821,6 +1847,57 @@ bool basic_regex_parser::parse_perl_extension() pb->index = markid = 0; ++m_position; break; + case regex_constants::syntax_digit: + { + // + // a recursive subexpression: + // + v = this->m_traits.toi(m_position, m_end, 10); + if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } +insert_recursion: + pb->index = markid = 0; + static_cast(this->append_state(syntax_element_recurse, sizeof(re_jump)))->alt.i = v; + static_cast( + this->append_state(syntax_element_toggle_case, sizeof(re_case)) + )->icase = this->flags() & regbase::icase; + break; + } + case regex_constants::syntax_plus: + // + // A forward-relative recursive subexpression: + // + ++m_position; + v = this->m_traits.toi(m_position, m_end, 10); + if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + v += m_mark_count; + goto insert_recursion; + case regex_constants::syntax_dash: + // + // Possibly a backward-relative recursive subexpression: + // + ++m_position; + v = this->m_traits.toi(m_position, m_end, 10); + if(v <= 0) + { + --m_position; + // Oops not a relative recursion at all, but a (?-imsx) group: + goto option_group_jump; + } + v = m_mark_count + 1 - v; + if(v <= 0) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + goto insert_recursion; case regex_constants::syntax_equal: pb->index = markid = -1; ++m_position; @@ -1882,7 +1959,24 @@ bool basic_regex_parser::parse_perl_extension() return false; } int v = this->m_traits.toi(m_position, m_end, 10); - if(v > 0) + if(*m_position == charT('R')) + { + ++m_position; + v = -this->m_traits.toi(m_position, m_end, 10); + re_brace* br = static_cast(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); + br->index = v < 0 ? (v - 1) : 0; + if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + if(++m_position == m_end) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + } + else if(v > 0) { re_brace* br = static_cast(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); br->index = v; @@ -1976,9 +2070,21 @@ named_capture_jump: break; } default: + if(*m_position == charT('R')) + { + ++m_position; + v = 0; + if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + goto insert_recursion; + } // // lets assume that we have a (?imsx) group and try and parse it: // +option_group_jump: regex_constants::syntax_option_type opts = parse_options(); if(m_position == m_end) return false; @@ -2095,6 +2201,7 @@ named_capture_jump: // pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); pb->index = markid; + pb->icase = this->flags() & regbase::icase; this->m_paren_start = last_paren_start; // // restore the alternate insertion point: diff --git a/include/boost/regex/v4/perl_matcher.hpp b/include/boost/regex/v4/perl_matcher.hpp index e1ef9c2d..726c2881 100644 --- a/include/boost/regex/v4/perl_matcher.hpp +++ b/include/boost/regex/v4/perl_matcher.hpp @@ -285,7 +285,8 @@ public: } ~repeater_count() { - *stack = next; + if(next) + *stack = next; } std::size_t get_count() { return count; } int get_id() { return state_id; } @@ -325,6 +326,17 @@ enum saved_state_type saved_state_count = 14 }; +template +struct recursion_info +{ + typedef typename Results::value_type value_type; + typedef typename value_type::iterator iterator; + int id; + const re_syntax_base* preturn_address; + Results results; + repeater_count* repeater_stack; +}; + #ifdef BOOST_MSVC #pragma warning(push) #pragma warning(disable : 4251 4231 4660) @@ -340,6 +352,7 @@ public: typedef std::size_t traits_size_type; typedef typename is_byte::width_type width_type; typedef typename regex_iterator_traits::difference_type difference_type; + typedef match_results results_type; perl_matcher(BidiIterator first, BidiIterator end, match_results& what, @@ -348,7 +361,7 @@ public: BidiIterator l_base) : m_result(what), base(first), last(end), position(first), backstop(l_base), re(e), traits_inst(e.get_traits()), - m_independent(false), next_count(&rep_obj), rep_obj(&next_count) + m_independent(false), next_count(&rep_obj), rep_obj(&next_count), recursion_stack_position(0) { construct_init(e, f); } @@ -413,6 +426,7 @@ private: #ifdef BOOST_REGEX_RECURSIVE bool backtrack_till_match(std::size_t count); #endif + bool match_recursion(); // find procs stored in s_find_vtable: bool find_restart_any(); @@ -468,6 +482,9 @@ private: typename traits::char_class_type m_word_mask; // the bitmask to use when determining whether a match_any matches a newline or not: unsigned char match_any_mask; + // recursion information: + recursion_info recursion_stack[50]; + unsigned recursion_stack_position; #ifdef BOOST_REGEX_NON_RECURSIVE // @@ -491,6 +508,8 @@ private: bool unwind_short_set_repeat(bool); bool unwind_long_set_repeat(bool); bool unwind_non_greedy_repeat(bool); + bool unwind_recursion(bool); + bool unwind_recursion_pop(bool); void destroy_single_repeat(); void push_matched_paren(int index, const sub_match& sub); void push_recursion_stopper(); @@ -499,7 +518,8 @@ private: void push_repeater_count(int i, repeater_count** s); void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id); void push_non_greedy_repeat(const re_syntax_base* ps); - + void push_recursion(int id, const re_syntax_base* p, results_type* presults); + void push_recursion_pop(); // pointer to base of stack: saved_state* m_stack_base; diff --git a/include/boost/regex/v4/perl_matcher_common.hpp b/include/boost/regex/v4/perl_matcher_common.hpp index 09b0a9bb..a3eed0c2 100644 --- a/include/boost/regex/v4/perl_matcher_common.hpp +++ b/include/boost/regex/v4/perl_matcher_common.hpp @@ -345,25 +345,6 @@ bool perl_matcher::match_prefix() return m_has_found_match; } -template -bool perl_matcher::match_endmark() -{ - int index = static_cast(pstate)->index; - if(index > 0) - { - if((m_match_flags & match_nosubs) == 0) - m_presult->set_second(position, index); - } - else if((index < 0) && (index != -4)) - { - // matched forward lookahead: - pstate = 0; - return true; - } - pstate = pstate->next.p; - return true; -} - template bool perl_matcher::match_literal() { @@ -464,35 +445,6 @@ bool perl_matcher::match_wild() return true; } -template -bool perl_matcher::match_match() -{ - if((m_match_flags & match_not_null) && (position == (*m_presult)[0].first)) - return false; - if((m_match_flags & match_all) && (position != last)) - return false; - if((m_match_flags & regex_constants::match_not_initial_null) && (position == search_base)) - return false; - m_presult->set_second(position); - pstate = 0; - m_has_found_match = true; - if((m_match_flags & match_posix) == match_posix) - { - m_result.maybe_assign(*m_presult); - if((m_match_flags & match_any) == 0) - return false; - } -#ifdef BOOST_REGEX_MATCH_EXTRA - if(match_extra & m_match_flags) - { - for(unsigned i = 0; i < m_presult->size(); ++i) - if((*m_presult)[i].matched) - ((*m_presult)[i]).get_captures().push_back((*m_presult)[i]); - } -#endif - return true; -} - template bool perl_matcher::match_word_boundary() { @@ -760,8 +712,21 @@ template inline bool perl_matcher::match_assert_backref() { // return true if marked sub-expression N has been matched: - bool result = (*m_presult)[static_cast(pstate)->index].matched; - pstate = pstate->next.p; + int index = static_cast(pstate)->index; + bool result; + if(index > 0) + { + // Have we matched subexpression "index"? + result = (*m_presult)[index].matched; + pstate = pstate->next.p; + } + else + { + // Have we recursed into subexpression "index"? + // If index == 0 then check for any recursion at all, otherwise for recursion to -index-1. + result = recursion_stack_position && ((recursion_stack[recursion_stack_position-1].id == -index-1) || (index == 0)); + pstate = pstate->next.p; + } return result; } diff --git a/include/boost/regex/v4/perl_matcher_non_recursive.hpp b/include/boost/regex/v4/perl_matcher_non_recursive.hpp index 572e32a6..2ce7ebe0 100644 --- a/include/boost/regex/v4/perl_matcher_non_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_non_recursive.hpp @@ -127,10 +127,21 @@ struct saved_single_repeat : public saved_state : saved_state(arg_id), count(c), rep(r), last_position(lp){} }; +template +struct saved_recursion : public saved_state +{ + saved_recursion(int id, const re_syntax_base* p, Results* pr) + : saved_state(14), recursion_id(id), preturn_address(p), results(*pr) + {} + int recursion_id; + const re_syntax_base* preturn_address; + Results results; +}; + template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[29] = + static matcher_proc_type const s_match_vtable[30] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -165,6 +176,7 @@ bool perl_matcher::match_all_states() &perl_matcher::match_backstep, &perl_matcher::match_assert_backref, &perl_matcher::match_toggle_case, + &perl_matcher::match_recursion, }; push_recursion_stopper(); @@ -316,10 +328,26 @@ inline void perl_matcher::push_single_repeat(st m_backup_state = pmp; } +template +inline void perl_matcher::push_recursion(int id, const re_syntax_base* p, results_type* presults) +{ + saved_recursion* pmp = static_cast*>(m_backup_state); + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = static_cast*>(m_backup_state); + --pmp; + } + (void) new (pmp)saved_recursion(id, p, presults); + m_backup_state = pmp; +} + template bool perl_matcher::match_startmark() { int index = static_cast(pstate)->index; + icase = static_cast(pstate)->icase; switch(index) { case 0: @@ -859,6 +887,100 @@ bool perl_matcher::match_long_set_repeat() #endif } +template +bool perl_matcher::match_recursion() +{ + BOOST_ASSERT(pstate->type == syntax_element_recurse); + // + // Backup call stack: + // + push_recursion_pop(); + // + // Set new call stack: + // + if(recursion_stack_position >= static_cast(sizeof(recursion_stack)/sizeof(recursion_stack[0]))) + { + return false; + } + recursion_stack[recursion_stack_position].preturn_address = pstate->next.p; + recursion_stack[recursion_stack_position].results = *m_presult; + pstate = static_cast(pstate)->alt.p; + recursion_stack[recursion_stack_position].id = static_cast(pstate)->index; + ++recursion_stack_position; + //BOOST_ASSERT(recursion_stack[recursion_stack_position-1].id); + return true; +} + +template +bool perl_matcher::match_endmark() +{ + int index = static_cast(pstate)->index; + icase = static_cast(pstate)->icase; + if(index > 0) + { + if((m_match_flags & match_nosubs) == 0) + { + m_presult->set_second(position, index); + } + if(recursion_stack_position) + { + if(index == recursion_stack[recursion_stack_position-1].id) + { + --recursion_stack_position; + pstate = recursion_stack[recursion_stack_position].preturn_address; + *m_presult = recursion_stack[recursion_stack_position].results; + push_recursion(recursion_stack[recursion_stack_position].id, recursion_stack[recursion_stack_position].preturn_address, &recursion_stack[recursion_stack_position].results); + } + } + } + else if((index < 0) && (index != -4)) + { + // matched forward lookahead: + pstate = 0; + return true; + } + pstate = pstate->next.p; + return true; +} + +template +bool perl_matcher::match_match() +{ + if(recursion_stack_position) + { + BOOST_ASSERT(0 == recursion_stack[recursion_stack_position-1].id); + --recursion_stack_position; + pstate = recursion_stack[recursion_stack_position].preturn_address; + *m_presult = recursion_stack[recursion_stack_position].results; + push_recursion(recursion_stack[recursion_stack_position].id, recursion_stack[recursion_stack_position].preturn_address, &recursion_stack[recursion_stack_position].results); + return true; + } + if((m_match_flags & match_not_null) && (position == (*m_presult)[0].first)) + return false; + if((m_match_flags & match_all) && (position != last)) + return false; + if((m_match_flags & regex_constants::match_not_initial_null) && (position == search_base)) + return false; + m_presult->set_second(position); + pstate = 0; + m_has_found_match = true; + if((m_match_flags & match_posix) == match_posix) + { + m_result.maybe_assign(*m_presult); + if((m_match_flags & match_any) == 0) + return false; + } +#ifdef BOOST_REGEX_MATCH_EXTRA + if(match_extra & m_match_flags) + { + for(unsigned i = 0; i < m_presult->size(); ++i) + if((*m_presult)[i].matched) + ((*m_presult)[i]).get_captures().push_back((*m_presult)[i]); + } +#endif + return true; +} + /**************************************************************************** Unwind and associated proceedures follow, these perform what normal stack @@ -869,7 +991,7 @@ unwinding does in the recursive implementation. template bool perl_matcher::unwind(bool have_match) { - static unwind_proc_type const s_unwind_table[14] = + static unwind_proc_type const s_unwind_table[18] = { &perl_matcher::unwind_end, &perl_matcher::unwind_paren, @@ -885,6 +1007,8 @@ bool perl_matcher::unwind(bool have_match) &perl_matcher::unwind_short_set_repeat, &perl_matcher::unwind_long_set_repeat, &perl_matcher::unwind_non_greedy_repeat, + &perl_matcher::unwind_recursion, + &perl_matcher::unwind_recursion_pop, }; m_recursive_result = have_match; @@ -1388,6 +1512,106 @@ bool perl_matcher::unwind_non_greedy_repeat(boo return r; } +template +bool perl_matcher::unwind_recursion(bool r) +{ + saved_recursion* pmp = static_cast*>(m_backup_state); + if(!r) + { + recursion_stack[recursion_stack_position].id = pmp->recursion_id; + recursion_stack[recursion_stack_position].preturn_address = pmp->preturn_address; + recursion_stack[recursion_stack_position].results = pmp->results; + ++recursion_stack_position; + } + boost::re_detail::inplace_destroy(pmp++); + m_backup_state = pmp; + return true; +} + +template +bool perl_matcher::unwind_recursion_pop(bool r) +{ + saved_state* pmp = static_cast(m_backup_state); + if(!r) + { + --recursion_stack_position; + } + boost::re_detail::inplace_destroy(pmp++); + m_backup_state = pmp; + return true; +} + +template +void perl_matcher::push_recursion_pop() +{ + saved_state* pmp = static_cast(m_backup_state); + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = static_cast(m_backup_state); + --pmp; + } + (void) new (pmp)saved_state(15); + m_backup_state = pmp; +} +/* +template +bool perl_matcher::unwind_parenthesis_pop(bool r) +{ + saved_state* pmp = static_cast(m_backup_state); + if(!r) + { + --parenthesis_stack_position; + } + boost::re_detail::inplace_destroy(pmp++); + m_backup_state = pmp; + return true; +} + +template +void perl_matcher::push_parenthesis_pop() +{ + saved_state* pmp = static_cast(m_backup_state); + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = static_cast(m_backup_state); + --pmp; + } + (void) new (pmp)saved_state(16); + m_backup_state = pmp; +} + +template +bool perl_matcher::unwind_parenthesis_push(bool r) +{ + saved_position* pmp = static_cast*>(m_backup_state); + if(!r) + { + parenthesis_stack[parenthesis_stack_position++] = pmp->position; + } + boost::re_detail::inplace_destroy(pmp++); + m_backup_state = pmp; + return true; +} + +template +inline void perl_matcher::push_parenthesis_push(BidiIterator p) +{ + saved_position* pmp = static_cast*>(m_backup_state); + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = static_cast*>(m_backup_state); + --pmp; + } + (void) new (pmp)saved_position(0, p, 17); + m_backup_state = pmp; +} +*/ } // namespace re_detail } // namespace boost diff --git a/include/boost/regex/v4/perl_matcher_recursive.hpp b/include/boost/regex/v4/perl_matcher_recursive.hpp index bc0fe958..48f08b7b 100644 --- a/include/boost/regex/v4/perl_matcher_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_recursive.hpp @@ -60,7 +60,7 @@ public: template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[29] = + static matcher_proc_type const s_match_vtable[30] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -95,6 +95,7 @@ bool perl_matcher::match_all_states() &perl_matcher::match_backstep, &perl_matcher::match_assert_backref, &perl_matcher::match_toggle_case, + &perl_matcher::match_recursion, }; if(state_count > max_state_count) @@ -117,6 +118,7 @@ template bool perl_matcher::match_startmark() { int index = static_cast(pstate)->index; + icase = static_cast(pstate)->icase; bool r = true; switch(index) { @@ -848,6 +850,127 @@ bool perl_matcher::backtrack_till_match(std::si #endif } +template +bool perl_matcher::match_recursion() +{ + BOOST_ASSERT(pstate->type == syntax_element_recurse); + // + // Set new call stack: + // + if(recursion_stack_position >= static_cast(sizeof(recursion_stack)/sizeof(recursion_stack[0]))) + { + return false; + } + recursion_stack[recursion_stack_position].preturn_address = pstate->next.p; + recursion_stack[recursion_stack_position].results = *m_presult; + recursion_stack[recursion_stack_position].repeater_stack = next_count; + pstate = static_cast(pstate)->alt.p; + recursion_stack[recursion_stack_position].id = static_cast(pstate)->index; + ++recursion_stack_position; + + repeater_count* saved = next_count; + repeater_count r(&next_count); // resets all repeat counts since we're recursing and starting fresh on those + next_count = &r; + bool result = match_all_states(); + next_count = saved; + + if(!result) + { + --recursion_stack_position; + next_count = recursion_stack[recursion_stack_position].repeater_stack; + *m_presult = recursion_stack[recursion_stack_position].results; + return false; + } + return true; +} + +template +bool perl_matcher::match_endmark() +{ + int index = static_cast(pstate)->index; + icase = static_cast(pstate)->icase; + if(index > 0) + { + if((m_match_flags & match_nosubs) == 0) + { + m_presult->set_second(position, index); + } + if(recursion_stack_position) + { + if(index == recursion_stack[recursion_stack_position-1].id) + { + --recursion_stack_position; + recursion_info saved = recursion_stack[recursion_stack_position]; + const re_syntax_base* saved_state = pstate = saved.preturn_address; + repeater_count* saved_count = next_count; + next_count = saved.repeater_stack; + *m_presult = saved.results; + if(!match_all_states()) + { + recursion_stack[recursion_stack_position] = saved; + ++recursion_stack_position; + next_count = saved_count; + return false; + } + } + } + } + else if((index < 0) && (index != -4)) + { + // matched forward lookahead: + pstate = 0; + return true; + } + pstate = pstate ? pstate->next.p : 0; + return true; +} + +template +bool perl_matcher::match_match() +{ + if(recursion_stack_position) + { + BOOST_ASSERT(0 == recursion_stack[recursion_stack_position-1].id); + --recursion_stack_position; + const re_syntax_base* saved_state = pstate = recursion_stack[recursion_stack_position].preturn_address; + *m_presult = recursion_stack[recursion_stack_position].results; + if(!match_all_states()) + { + recursion_stack[recursion_stack_position].preturn_address = saved_state; + recursion_stack[recursion_stack_position].results = *m_presult; + ++recursion_stack_position; + return false; + } + return true; + } + if((m_match_flags & match_not_null) && (position == (*m_presult)[0].first)) + return false; + if((m_match_flags & match_all) && (position != last)) + return false; + if((m_match_flags & regex_constants::match_not_initial_null) && (position == search_base)) + return false; + m_presult->set_second(position); + pstate = 0; + m_has_found_match = true; + if((m_match_flags & match_posix) == match_posix) + { + m_result.maybe_assign(*m_presult); + if((m_match_flags & match_any) == 0) + return false; + } +#ifdef BOOST_REGEX_MATCH_EXTRA + if(match_extra & m_match_flags) + { + for(unsigned i = 0; i < m_presult->size(); ++i) + if((*m_presult)[i].matched) + ((*m_presult)[i]).get_captures().push_back((*m_presult)[i]); + } +#endif + return true; +} + + + } // namespace re_detail } // namespace boost #ifdef BOOST_MSVC diff --git a/include/boost/regex/v4/states.hpp b/include/boost/regex/v4/states.hpp index 44dd2b4a..efdebbe5 100644 --- a/include/boost/regex/v4/states.hpp +++ b/include/boost/regex/v4/states.hpp @@ -118,7 +118,9 @@ enum syntax_element_type syntax_element_backstep = syntax_element_long_set_rep + 1, // an assertion that a mark was matched: syntax_element_assert_backref = syntax_element_backstep + 1, - syntax_element_toggle_case = syntax_element_assert_backref + 1 + syntax_element_toggle_case = syntax_element_assert_backref + 1, + // a recursive expression: + syntax_element_recurse = syntax_element_toggle_case + 1 }; #ifdef BOOST_REGEX_DEBUG @@ -156,6 +158,7 @@ struct re_brace : public re_syntax_base // The index to match, can be zero (don't mark the sub-expression) // or negative (for perl style (?...) extentions): int index; + bool icase; }; /*** struct re_dot ************************************************** diff --git a/test/regress/main.cpp b/test/regress/main.cpp index 064a19dd..e741db50 100644 --- a/test/regress/main.cpp +++ b/test/regress/main.cpp @@ -72,6 +72,7 @@ void run_tests() RUN_TESTS(test_unicode); RUN_TESTS(test_pocessive_repeats); RUN_TESTS(test_mark_resets); + RUN_TESTS(test_recursion); } int cpp_main(int /*argc*/, char * /*argv*/[]) diff --git a/test/regress/test.hpp b/test/regress/test.hpp index 0e6b88f9..9e271288 100644 --- a/test/regress/test.hpp +++ b/test/regress/test.hpp @@ -260,5 +260,6 @@ void test_overloads(); void test_unicode(); void test_pocessive_repeats(); void test_mark_resets(); +void test_recursion(); #endif diff --git a/test/regress/test_non_greedy_repeats.cpp b/test/regress/test_non_greedy_repeats.cpp index c9935bd8..3196f5ac 100644 --- a/test/regress/test_non_greedy_repeats.cpp +++ b/test/regress/test_non_greedy_repeats.cpp @@ -41,6 +41,5 @@ void test_non_greedy_repeats() TEST_REGEX_SEARCH("xx[/-]{0,2}?(?:[+-][0-9])??\\z", perl, "xx--", match_default, make_array(0, 4, -2, -2)); TEST_INVALID_REGEX("a{1,3}{1}", perl); TEST_INVALID_REGEX("a**", perl); - //TEST_INVALID_REGEX("a++", perl); } diff --git a/test/regress/test_perl_ex.cpp b/test/regress/test_perl_ex.cpp index 2cb5f208..764d0c43 100644 --- a/test/regress/test_perl_ex.cpp +++ b/test/regress/test_perl_ex.cpp @@ -665,14 +665,173 @@ void test_mark_resets() TEST_REGEX_SEARCH("(?|(abc)|(xyz))\\1", perl, "xyzxyz", match_default, make_array(0, 6, 0, 3, -2, -2)); TEST_REGEX_SEARCH("(?|(abc)|(xyz))\\1", perl, "abcxyz", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("(?|(abc)|(xyz))\\1", perl, "xyzabc", match_default, make_array(-2, -2)); - //TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "abcabc", match_default, make_array(0, 6, 0, 3, -2, -2)); - //TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzabc", match_default, make_array(0, 6, 0, 3, -2, -2)); - //TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzxyz", match_default, make_array(-2, -2)); - //TEST_REGEX_SEARCH("^X(?5)(a)(?|(b)|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); - //TEST_INVALID_REGEX("^X(?5)(a)(?|(b)|(q))(c)(d)Y", perl); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "abcabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzxyz", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^X(?5)(a)(?|(b)|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_INVALID_REGEX("^X(?5)(a)(?|(b)|(q))(c)(d)Y", perl); //TEST_REGEX_SEARCH("^X(?&N)(a)(?|(b)|(q))(c)(d)(?Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); - //TEST_REGEX_SEARCH("^X(?7)(a)(?|(b)|(q)(r)(s))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 6, 7, -2, -2)); - //TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(r)(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 6, 7, -2, -2)); - //TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(?|(r)|(t))(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b)|(q)(r)(s))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(r)(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(?|(r)|(t))(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); +} + +void test_recursion() +{ + using namespace boost::regex_constants; + + TEST_INVALID_REGEX("(a(?2)b)", perl); + TEST_INVALID_REGEX("(a(?1b))", perl); + TEST_REGEX_SEARCH("(a(?1)b)", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(a(?1)+b)", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^([^()]|\\((?1)*\\))*$", perl, "abc", match_default, make_array(0, 3, 2, 3, -2, -2)); + TEST_REGEX_SEARCH("^([^()]|\\((?1)*\\))*$", perl, "a(b)c", match_default, make_array(0, 5, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("^([^()]|\\((?1)*\\))*$", perl, "a(b(c))d", match_default, make_array(0, 8, 7, 8, -2, -2)); + TEST_REGEX_SEARCH("^([^()]|\\((?1)*\\))*$", perl, "a(b(c)d", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^>abc>([^()]|\\((?1)*\\))*abc>123abc>([^()]|\\((?1)*\\))*abc>1(2)3abc>([^()]|\\((?1)*\\))*abc>(1(2)3)P)", perl|icase, "abcPXP123", match_default, make_array(3, 6, 5, 6, -2, -2)); + TEST_REGEX_SEARCH("(abc)(?i:(?1))", perl|icase, "defabcabcxyz", match_default, make_array(3, 9, 3, 6, -2, -2)); + TEST_REGEX_SEARCH("(abc)(?i:(?1))", perl, "DEFabcABCXYZ", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(abc)(?i:(?1)abc)", perl, "DEFabcABCABCXYZ", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(abc)(?:(?i)(?1))", perl, "defabcabcxyz", match_default, make_array(3, 9, 3, 6, -2, -2)); + TEST_REGEX_SEARCH("(abc)(?:(?i)(?1))", perl, "DEFabcABCXYZ", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "abcabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzxyz", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?1)[]a()b](abc)", perl, "abcbabc", match_default, make_array(0, 7, 4, 7, -2, -2)); + TEST_REGEX_SEARCH("(?1)[]a()b](abc)", perl, "abcXabc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?1)[^]a()b](abc)", perl, "abcXabc", match_default, make_array(0, 7, 4, 7, -2, -2)); + TEST_REGEX_SEARCH("(?1)[^]a()b](abc)", perl, "abcbabc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?2)[]a()b](abc)(xyz)", perl, "xyzbabcxyz", match_default, make_array(0, 10, 4, 7, 7, 10, -2, -2)); + TEST_REGEX_SEARCH("^X(?5)(a)(?|(b)|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_INVALID_REGEX("^X(?5)(a)(?|(b)|(q))(c)(d)Y", perl); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b)|(q)(r)(s))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(r)(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(?|(r)|(t))(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"1234", match_default, make_array(0, 4, 0, 4, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\"1234\"", match_default, make_array(0, 6, 0, 6, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\x100"L"1234", match_default, make_array(0, 5, 1, 5, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\"\x100"L"1234\"", match_default, make_array(1, 6, 2, 6, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\x100\x100"L"12ab", match_default, make_array(0, 4, 2, 4, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\x100\x100"L"\"12\"", match_default, make_array(0, 6, 2, 6, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\x100\x100"L"abcd", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("(ab|c)(?-1)", perl, "abc", match_default, make_array(0, 3, 0, 2, -2, -2)); + TEST_REGEX_SEARCH("xy(?+1)(abc)", perl, "xyabcabc", match_default, make_array(0, 8, 5, 8, -2, -2)); + TEST_REGEX_SEARCH("xy(?+1)(abc)", perl, "xyabc", match_default, make_array(-2, -2)); + TEST_INVALID_REGEX("x(?-0)y", perl); + TEST_INVALID_REGEX("x(?-1)y", perl); + TEST_INVALID_REGEX("x(?+0)y", perl); + TEST_INVALID_REGEX("x(?+1)y", perl); + TEST_REGEX_SEARCH("^(?+1)(?x|y){0}z", perl, "xzxx", match_default, make_array(0, 2, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("^(?+1)(?x|y){0}z", perl, "yzyy", match_default, make_array(0, 2, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("^(?+1)(?x|y){0}z", perl, "xxz", match_default, make_array(-2, -2)); + + // Now recurse to sub-expression zero: + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(abcd)", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(abcd)xyz", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "xyz(abcd)", match_default, make_array(3, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(ab(xy)cd)pqr", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(ab(xycd)pqr", match_default, make_array(3, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "() abc ()", match_default, make_array(0, 2, -2, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "12(abcde(fsh)xyz(foo(bar))lmno)89", match_default, make_array(2, 31, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "abcd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "abcd)", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(abcd", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?0) )* \\) ", perl|mod_x, "(ab(xy)cd)pqr", match_default, make_array(0, 10, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?0) )* \\) ", perl|mod_x, "1(abcd)(x(y)z)pqr", match_default, make_array(1, 7, 2, 6, -2, 7, 14, 12, 13, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "(abcd)", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(3, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "(a(b(c)d)e)", match_default, make_array(4, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "((ab))", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "()", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) )? \\) ", perl|mod_x, "()", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) )? \\) ", perl|mod_x, "12(abcde(fsh)xyz(foo(bar))lmno)89", match_default, make_array(8, 13, -2, 20, 25, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?0) )* \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( (123)? ( ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, -1, -1, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( (123)? ( ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(123ab(xy)cd)", match_default, make_array(0, 13, 1, 4, 4, 12, 10, 12, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (123)? ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, -1, -1, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (123)? ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(123ab(xy)cd)", match_default, make_array(0, 13, 1, 12, 1, 4, 10, 12, -2, -2)); + TEST_REGEX_SEARCH("\\( (((((((((( ( (?>[^()]+) | (?0) )* )))))))))) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()<>]+) | ((?>[^()]+)) | (?0) )* ) \\) ", perl|mod_x, "(abcd(xyz

qrs)123)", match_default, make_array(0, 20, 1, 19, 16, 19, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | ((?0)) )* ) \\) ", perl|mod_x, "(ab(cd)ef)", match_default, make_array(0, 10, 1, 9, 7, 9, 3, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | ((?0)) )* ) \\) ", perl|mod_x, "(ab(cd(ef)gh)ij)", match_default, make_array(0, 16, 1, 15, 13, 15, 3, 13, -2, -2)); + // Again with (?R): + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(abcd)", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(abcd)xyz", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "xyz(abcd)", match_default, make_array(3, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(ab(xy)cd)pqr", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(ab(xycd)pqr", match_default, make_array(3, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "() abc ()", match_default, make_array(0, 2, -2, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "12(abcde(fsh)xyz(foo(bar))lmno)89", match_default, make_array(2, 31, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "abcd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "abcd)", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(abcd", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?R) )* \\) ", perl|mod_x, "(ab(xy)cd)pqr", match_default, make_array(0, 10, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?R) )* \\) ", perl|mod_x, "1(abcd)(x(y)z)pqr", match_default, make_array(1, 7, 2, 6, -2, 7, 14, 12, 13, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "(abcd)", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(3, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "(a(b(c)d)e)", match_default, make_array(4, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "((ab))", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "()", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) )? \\) ", perl|mod_x, "()", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) )? \\) ", perl|mod_x, "12(abcde(fsh)xyz(foo(bar))lmno)89", match_default, make_array(8, 13, -2, 20, 25, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?R) )* \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( (123)? ( ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, -1, -1, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( (123)? ( ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(123ab(xy)cd)", match_default, make_array(0, 13, 1, 4, 4, 12, 10, 12, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (123)? ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, -1, -1, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (123)? ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(123ab(xy)cd)", match_default, make_array(0, 13, 1, 12, 1, 4, 10, 12, -2, -2)); + TEST_REGEX_SEARCH("\\( (((((((((( ( (?>[^()]+) | (?R) )* )))))))))) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()<>]+) | ((?>[^()]+)) | (?R) )* ) \\) ", perl|mod_x, "(abcd(xyz

qrs)123)", match_default, make_array(0, 20, 1, 19, 16, 19, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | ((?R)) )* ) \\) ", perl|mod_x, "(ab(cd)ef)", match_default, make_array(0, 10, 1, 9, 7, 9, 3, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | ((?R)) )* ) \\) ", perl|mod_x, "(ab(cd(ef)gh)ij)", match_default, make_array(0, 16, 1, 15, 13, 15, 3, 13, -2, -2)); + // And some extra cases: + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xab", match_default, make_array(0, 3, 1, 3, -1, -1, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xbc", match_default, make_array(0, 3, 1, 3, 1, 3, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xde", match_default, make_array(0, 3, 1, 3, 1, 3, 1, 3, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xxab", match_default, make_array(0, 4, 1, 4, 1, 4, 1, 4, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xxxab", match_default, make_array(0, 5, 1, 5, 1, 5, 1, 5, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xyab", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("[^()]*(?:\\((?R)\\)[^()]*)*", perl|mod_x, "(this(and)that)", match_default, make_array(0, 15, -2, 15, 15, -2, -2)); + TEST_REGEX_SEARCH("[^()]*(?:\\((?R)\\)[^()]*)*", perl|mod_x, "(this(and)that)stuff", match_default, make_array(0, 20, -2, 20, 20, -2, -2)); + TEST_REGEX_SEARCH("[^()]*(?:\\((?>(?R))\\)[^()]*)*", perl|mod_x, "(this(and)that)", match_default, make_array(0, 15, -2, 15, 15, -2, -2)); + + // More complex cases involving (?(R): + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "<>", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, " hij>", match_default, make_array(0, 15, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, " hij>", match_default, make_array(5, 10, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "def>", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "", match_default, make_array(4, 6, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "