diff --git a/build/Jamfile.v2 b/build/Jamfile.v2 index ca7f801f..8943a3f9 100644 --- a/build/Jamfile.v2 +++ b/build/Jamfile.v2 @@ -22,7 +22,7 @@ rule check-icu-config ( ) if ! $(gICU_CONFIG_CHECKED) { - if $(HAVE_ICU) + if $(HAVE_ICU) && ! $(ICU_PATH) { gHAS_ICU = true ; gICU_CORE_LIB = icuuc ; @@ -79,6 +79,10 @@ rule check-icu-config ( ) { gICU_IN_LIB = icuin ; } + else if [ GLOB $(dir)/lib : icui18n.* ] + { + gICU_IN_LIB = icui18n ; + } else if [ GLOB $(dir)/lib64 : libicui18n.* ] { gICU_IN_LIB = icui18n ; diff --git a/doc/format_boost_syntax.qbk b/doc/format_boost_syntax.qbk index cba57dd8..c3871041 100644 --- a/doc/format_boost_syntax.qbk +++ b/doc/format_boost_syntax.qbk @@ -33,6 +33,15 @@ order to prevent ambiguities. For example, the format string "(?1foo:bar)" will replace each match found with "foo" if the sub-expression $1 was matched, and with "bar" otherwise. +For sub-expressions with an index greater than 9, or for access to named sub-expressions use: + +?{INDEX}true-expression:false-expression + +or + +?{NAME}true-expression:false-expression + + [h4 Placeholder Sequences] Placeholder sequences specify that some part of what matched the regular expression @@ -41,12 +50,24 @@ should be sent to output as follows: [table [[Placeholder][Meaning]] [[$&][Outputs what matched the whole expression.]] -[[$`][Outputs the text between the end of the last match found (or the - start of the text if no previous match was found), and the start - of the current match.]] +[[$MATCH][As $&]] +[[${^MATCH}][As $&]] +[[$\`][Outputs the text between the end of the last match found (or the + start of the text if no previous match was found), and the start + of the current match.]] +[[$PREMATCH][As $\`]] +[[${^PREMATCH}][As $\`]] [[$'][Outputs all the text following the end of the current match.]] +[[$POSTMATCH][As $']] +[[${^POSTMATCH}][As $']] +[[$+][Outputs what matched the last marked sub-expression in the regular expression.]] +[[$LAST_PAREN_MATCH][As $+]] 
+[[$LAST_SUBMATCH_RESULT][Outputs what matched the last sub-expression to be actually matched.]] +[[$^N][As $LAST_SUBMATCH_RESULT]] [[$$][Outputs a literal '$']] [[$n][Outputs what matched the n'th sub-expression.]] +[[${n}][Outputs what matched the n'th sub-expression.]] +[[$+{NAME}][Outputs whatever matched the sub-expression named "NAME".]] ] Any $-placeholder sequence not listed above, results in '$' being treated as a literal. diff --git a/doc/format_perl_syntax.qbk b/doc/format_perl_syntax.qbk index cfd57500..63cdbab2 100644 --- a/doc/format_perl_syntax.qbk +++ b/doc/format_perl_syntax.qbk @@ -17,13 +17,24 @@ should be sent to output as follows: [table [[Placeholder][Meaning]] [[$&][Outputs what matched the whole expression.]] -[[$`][Outputs the text between the end of the last match found (or the +[[$MATCH][As $&]] +[[${^MATCH}][As $&]] +[[$\`][Outputs the text between the end of the last match found (or the start of the text if no previous match was found), and the start of the current match.]] +[[$PREMATCH][As $\`]] +[[${^PREMATCH}][As $\`]] [[$'][Outputs all the text following the end of the current match.]] +[[$POSTMATCH][As $']] +[[${^POSTMATCH}][As $']] +[[$+][Outputs what matched the last marked sub-expression in the regular expression.]] +[[$LAST_PAREN_MATCH][As $+]] +[[$LAST_SUBMATCH_RESULT][Outputs what matched the last sub-expression to be actually matched.]] +[[$^N][As $LAST_SUBMATCH_RESULT]] [[$$][Outputs a literal '$']] [[$n][Outputs what matched the n'th sub-expression.]] [[${n}][Outputs what matched the n'th sub-expression.]] +[[$+{NAME}][Outputs whatever matched the sub-expression named "NAME".]] ] Any $-placeholder sequence not listed above, results in '$' being treated diff --git a/doc/history.qbk b/doc/history.qbk index 05a9710e..360f8922 100644 --- a/doc/history.qbk +++ b/doc/history.qbk @@ -8,6 +8,11 @@ [section:history History] +[h4 Boost 1.40] + +* Added support for many Perl 5.10 syntax elements including named +sub-expressions, 
branch resets and recursive regular expressions. + [h4 Boost 1.38] * [*Breaking change]: empty expressions, and empty alternatives are now diff --git a/doc/html/boost_regex/background_information.html b/doc/html/boost_regex/background_information.html index db5c0165..76048eed 100644 --- a/doc/html/boost_regex/background_information.html +++ b/doc/html/boost_regex/background_information.html @@ -3,7 +3,7 @@ Background Information - + diff --git a/doc/html/boost_regex/background_information/acknowledgements.html b/doc/html/boost_regex/background_information/acknowledgements.html index 71f5252f..68565667 100644 --- a/doc/html/boost_regex/background_information/acknowledgements.html +++ b/doc/html/boost_regex/background_information/acknowledgements.html @@ -3,7 +3,7 @@ Acknowledgements - + diff --git a/doc/html/boost_regex/background_information/examples.html b/doc/html/boost_regex/background_information/examples.html index 9f9dcc6c..39bd0df1 100644 --- a/doc/html/boost_regex/background_information/examples.html +++ b/doc/html/boost_regex/background_information/examples.html @@ -3,7 +3,7 @@ Test and Example Programs - + @@ -28,7 +28,7 @@ Example Programs
- + Test Programs
@@ -107,7 +107,7 @@ Files: captures_test.cpp.

- + Example programs
@@ -133,7 +133,7 @@ Files: regex_timer.cpp.

- + Code snippets
diff --git a/doc/html/boost_regex/background_information/faq.html b/doc/html/boost_regex/background_information/faq.html index 950b9eac..aded683f 100644 --- a/doc/html/boost_regex/background_information/faq.html +++ b/doc/html/boost_regex/background_information/faq.html @@ -3,7 +3,7 @@ FAQ - + diff --git a/doc/html/boost_regex/background_information/futher.html b/doc/html/boost_regex/background_information/futher.html index d058b19f..6ff56c4c 100644 --- a/doc/html/boost_regex/background_information/futher.html +++ b/doc/html/boost_regex/background_information/futher.html @@ -3,7 +3,7 @@ References and Further Information - + diff --git a/doc/html/boost_regex/background_information/headers.html b/doc/html/boost_regex/background_information/headers.html index e5a9a7c4..da1abfcc 100644 --- a/doc/html/boost_regex/background_information/headers.html +++ b/doc/html/boost_regex/background_information/headers.html @@ -3,7 +3,7 @@ Headers - + diff --git a/doc/html/boost_regex/background_information/history.html b/doc/html/boost_regex/background_information/history.html index b2069ec7..084381fc 100644 --- a/doc/html/boost_regex/background_information/history.html +++ b/doc/html/boost_regex/background_information/history.html @@ -3,7 +3,7 @@ History - + @@ -25,8 +25,17 @@

History

+
+ + Boost + 1.40 +
+
- + Boost 1.38
@@ -36,7 +45,7 @@ empty alternatives are now allowed when using the Perl regular expression syntax. This change has been added for Perl compatibility, when the new syntax_option_typeno_empty_expressions is set then the old behaviour - is preserved and empty expressions are prohibited. + is preserved and empty expressions are prohibited. This is issue #1081.
  • Added support for Perl style ${n} expressions in format strings (issue @@ -53,7 +62,7 @@
  • - + Boost 1.34
    @@ -76,7 +85,7 @@
    - + Boost 1.33.1
    @@ -146,7 +155,7 @@
    - + Boost 1.33.0
    @@ -201,7 +210,7 @@
    - + Boost 1.32.1
    @@ -209,7 +218,7 @@ Fixed bug in partial matches of bounded repeats of '.'.
    - + Boost 1.31.0
    diff --git a/doc/html/boost_regex/background_information/locale.html b/doc/html/boost_regex/background_information/locale.html index 0e8b0694..eb901bae 100644 --- a/doc/html/boost_regex/background_information/locale.html +++ b/doc/html/boost_regex/background_information/locale.html @@ -3,7 +3,7 @@ Localization - + @@ -58,7 +58,7 @@ There are three separate localization mechanisms supported by Boost.Regex:

    - + Win32 localization model.
    @@ -90,7 +90,7 @@ are treated as "unknown" graphic characters.

    - + C localization model.
    @@ -114,7 +114,7 @@ libraries including version 1 of this library.

    - + C++ localization model.
    @@ -151,7 +151,7 @@ in your code. The best way to ensure this is to add the #define to <boost/regex/user.hpp>.

    - + Providing a message catalogue
    diff --git a/doc/html/boost_regex/background_information/performance.html b/doc/html/boost_regex/background_information/performance.html index f25106fc..c3ab2735 100644 --- a/doc/html/boost_regex/background_information/performance.html +++ b/doc/html/boost_regex/background_information/performance.html @@ -3,7 +3,7 @@ Performance - + diff --git a/doc/html/boost_regex/background_information/redist.html b/doc/html/boost_regex/background_information/redist.html index 73c05a0a..6901b931 100644 --- a/doc/html/boost_regex/background_information/redist.html +++ b/doc/html/boost_regex/background_information/redist.html @@ -3,7 +3,7 @@ Redistributables - + diff --git a/doc/html/boost_regex/background_information/standards.html b/doc/html/boost_regex/background_information/standards.html index d2732b2f..31c09f0d 100644 --- a/doc/html/boost_regex/background_information/standards.html +++ b/doc/html/boost_regex/background_information/standards.html @@ -3,7 +3,7 @@ Standards Conformance - + @@ -28,7 +28,7 @@ Conformance
    - + C++

    @@ -36,7 +36,7 @@ Report on C++ Library Extensions.

    - + ECMAScript / JavaScript
    @@ -49,7 +49,7 @@ rather than a Unicode escape sequence; use \x{DDDD} for Unicode escape sequences.

    - + Perl

    @@ -62,7 +62,7 @@ (??{code}) Not implementable in a compiled strongly typed language.

    - + POSIX

    @@ -82,7 +82,7 @@ a custom traits class.

    - + Unicode

    diff --git a/doc/html/boost_regex/background_information/thread_safety.html b/doc/html/boost_regex/background_information/thread_safety.html index eac436ac..c67aa107 100644 --- a/doc/html/boost_regex/background_information/thread_safety.html +++ b/doc/html/boost_regex/background_information/thread_safety.html @@ -3,7 +3,7 @@ Thread Safety - + diff --git a/doc/html/boost_regex/captures.html b/doc/html/boost_regex/captures.html index 7497c1c6..c6345106 100644 --- a/doc/html/boost_regex/captures.html +++ b/doc/html/boost_regex/captures.html @@ -3,7 +3,7 @@ Understanding Marked Sub-Expressions and Captures - + @@ -35,7 +35,7 @@ accessed.

    - + Marked sub-expressions

    @@ -218,7 +218,7 @@ output stream.

    - + Unmatched Sub-Expressions

    @@ -231,7 +231,7 @@ you can determine which sub-expressions matched by accessing the sub_match::matched data member.

    - + Repeated Captures

    diff --git a/doc/html/boost_regex/configuration.html b/doc/html/boost_regex/configuration.html index 4489bc41..7b656e1a 100644 --- a/doc/html/boost_regex/configuration.html +++ b/doc/html/boost_regex/configuration.html @@ -3,7 +3,7 @@ Configuration - + diff --git a/doc/html/boost_regex/configuration/algorithm.html b/doc/html/boost_regex/configuration/algorithm.html index e68aee41..ea8e4d80 100644 --- a/doc/html/boost_regex/configuration/algorithm.html +++ b/doc/html/boost_regex/configuration/algorithm.html @@ -3,7 +3,7 @@ Algorithm Selection - + diff --git a/doc/html/boost_regex/configuration/compiler.html b/doc/html/boost_regex/configuration/compiler.html index 752d0c27..62931099 100644 --- a/doc/html/boost_regex/configuration/compiler.html +++ b/doc/html/boost_regex/configuration/compiler.html @@ -3,7 +3,7 @@ Compiler Setup - + diff --git a/doc/html/boost_regex/configuration/linkage.html b/doc/html/boost_regex/configuration/linkage.html index a7fa14b8..a6a5084d 100644 --- a/doc/html/boost_regex/configuration/linkage.html +++ b/doc/html/boost_regex/configuration/linkage.html @@ -3,7 +3,7 @@ Linkage Options - + diff --git a/doc/html/boost_regex/configuration/locale.html b/doc/html/boost_regex/configuration/locale.html index dc93af47..bfcbe924 100644 --- a/doc/html/boost_regex/configuration/locale.html +++ b/doc/html/boost_regex/configuration/locale.html @@ -3,7 +3,7 @@ Locale and traits class selection - + diff --git a/doc/html/boost_regex/configuration/tuning.html b/doc/html/boost_regex/configuration/tuning.html index 5884cca5..400afa6d 100644 --- a/doc/html/boost_regex/configuration/tuning.html +++ b/doc/html/boost_regex/configuration/tuning.html @@ -3,7 +3,7 @@ Algorithm Tuning - + diff --git a/doc/html/boost_regex/format.html b/doc/html/boost_regex/format.html index bbd5faae..f046222f 100644 --- a/doc/html/boost_regex/format.html +++ b/doc/html/boost_regex/format.html @@ -3,7 +3,7 @@ Search and Replace Format String Syntax - + diff --git 
a/doc/html/boost_regex/format/boost_format_syntax.html b/doc/html/boost_regex/format/boost_format_syntax.html index 63159fc3..90053a97 100644 --- a/doc/html/boost_regex/format/boost_format_syntax.html +++ b/doc/html/boost_regex/format/boost_format_syntax.html @@ -3,7 +3,7 @@ Boost-Extended Format String Syntax - + @@ -32,7 +32,7 @@ '$', '\', '(', ')', '?', and ':'.

    - + Grouping

    @@ -40,7 +40,7 @@ you want a to output literal parenthesis.

    - + Conditionals

    @@ -65,8 +65,21 @@ match found with "foo" if the sub-expression $1 was matched, and with "bar" otherwise.

    +

    + For sub-expressions with an index greater than 9, or for access to named + sub-expressions use: +

    +

    + ?{INDEX}true-expression:false-expression +

    +

    + or +

    +

    + ?{NAME}true-expression:false-expression +

    - + Placeholder Sequences
    @@ -105,6 +118,30 @@ + +

    + $MATCH +

    + + +

    + As $& +

    + + + + +

    + ${^MATCH} +

    + + +

    + As $& +

    + + +

    $` @@ -119,6 +156,30 @@ + +

    + $PREMATCH +

    + + +

    + As $` +

    + + + + +

    + ${^PREMATCH} +

    + + +

    + As $` +

    + + +

    $' @@ -131,6 +192,79 @@ + +

    + $POSTMATCH +

    + + +

    + As $' +

    + + + + +

    + ${^POSTMATCH} +

    + + +

    + As $' +

    + + + + +

    + $+ +

    + + +

    + Outputs what matched the last marked sub-expression in the regular + expression. +

    + + + + +

    + $LAST_PAREN_MATCH +

    + + +

    + As $+ +

    + + + + +

    + $LAST_SUBMATCH_RESULT +

    + + +

    + Outputs what matched the last sub-expression to be actually matched. +

    + + + + +

    + $^N +

    + + +

    + As $LAST_SUBMATCH_RESULT +

    + + +

    $$ @@ -154,6 +288,30 @@

    + + +

    + ${n} +

    + + +

    + Outputs what matched the n'th sub-expression. +

    + + + + +

    + $+{NAME} +

    + + +

    + Outputs whatever matched the sub-expression named "NAME". +

    + +

    @@ -161,7 +319,7 @@ as a literal.

    - + Escape Sequences
    diff --git a/doc/html/boost_regex/format/perl_format.html b/doc/html/boost_regex/format/perl_format.html index 3764ed34..616dd3fb 100644 --- a/doc/html/boost_regex/format/perl_format.html +++ b/doc/html/boost_regex/format/perl_format.html @@ -3,7 +3,7 @@ Perl Format String Syntax - + @@ -65,6 +65,30 @@ + +

    + $MATCH +

    + + +

    + As $& +

    + + + + +

    + ${^MATCH} +

    + + +

    + As $& +

    + + +

    $` @@ -79,6 +103,30 @@ + +

    + $PREMATCH +

    + + +

    + As $` +

    + + + + +

    + ${^PREMATCH} +

    + + +

    + As $` +

    + + +

    $' @@ -91,6 +139,79 @@ + +

    + $POSTMATCH +

    + + +

    + As $' +

    + + + + +

    + ${^POSTMATCH} +

    + + +

    + As $' +

    + + + + +

    + $+ +

    + + +

    + Outputs what matched the last marked sub-expression in the regular + expression. +

    + + + + +

    + $LAST_PAREN_MATCH +

    + + +

    + As $+ +

    + + + + +

    + $LAST_SUBMATCH_RESULT +

    + + +

    + Outputs what matched the last sub-expression to be actually matched. +

    + + + + +

    + $^N +

    + + +

    + As $LAST_SUBMATCH_RESULT +

    + + +

    $$ @@ -126,6 +247,18 @@

    + + +

    + $+{NAME} +

    + + +

    + Outputs whatever matched the sub-expression named "NAME". +

    + +

    diff --git a/doc/html/boost_regex/format/sed_format.html b/doc/html/boost_regex/format/sed_format.html index cb806f6b..c2f310d7 100644 --- a/doc/html/boost_regex/format/sed_format.html +++ b/doc/html/boost_regex/format/sed_format.html @@ -3,7 +3,7 @@ Sed Format String Syntax - + diff --git a/doc/html/boost_regex/install.html b/doc/html/boost_regex/install.html index d325791c..3f61840d 100644 --- a/doc/html/boost_regex/install.html +++ b/doc/html/boost_regex/install.html @@ -3,7 +3,7 @@ Building and Installing the Library - + @@ -49,7 +49,7 @@ file before you can use it, instructions for specific platforms are as follows:

    - + Building with bjam

    @@ -58,7 +58,7 @@ started guide for more information.

    - + Building With Unicode and ICU Support
    @@ -96,11 +96,11 @@ ICU you are using is binary compatible with the toolset you use to build Boost.

    - + Building via makefiles
    - + Borland C++ Builder:
    - + GCC(2.95 and later)

    @@ -302,7 +302,7 @@ see the config library documentation.

    - + Sun Workshop 6.1

    @@ -347,7 +347,7 @@ will build v9 variants of the regex library named libboost_regex_v9.a etc.

    - + Makefiles for Other compilers
    diff --git a/doc/html/boost_regex/introduction_and_overview.html b/doc/html/boost_regex/introduction_and_overview.html index 4ca4583c..2ca40f79 100644 --- a/doc/html/boost_regex/introduction_and_overview.html +++ b/doc/html/boost_regex/introduction_and_overview.html @@ -3,7 +3,7 @@ Introduction and Overview - + diff --git a/doc/html/boost_regex/partial_matches.html b/doc/html/boost_regex/partial_matches.html index d47cb538..a123ecb5 100644 --- a/doc/html/boost_regex/partial_matches.html +++ b/doc/html/boost_regex/partial_matches.html @@ -3,7 +3,7 @@ Partial Matches - + diff --git a/doc/html/boost_regex/ref.html b/doc/html/boost_regex/ref.html index 3d911322..52ee39a7 100644 --- a/doc/html/boost_regex/ref.html +++ b/doc/html/boost_regex/ref.html @@ -3,7 +3,7 @@ Reference - + diff --git a/doc/html/boost_regex/ref/bad_expression.html b/doc/html/boost_regex/ref/bad_expression.html index 1cbff685..f4b5f922 100644 --- a/doc/html/boost_regex/ref/bad_expression.html +++ b/doc/html/boost_regex/ref/bad_expression.html @@ -3,7 +3,7 @@ bad_expression - + @@ -27,7 +27,7 @@ bad_expression
    - + Synopsis
    #include <boost/pattern_except.hpp>
    @@ -54,7 +54,7 @@
     } // namespace boost
     
    - + Description
    regex_error(const std::string& s, regex_constants::error_type err, std::ptrdiff_t pos);
    diff --git a/doc/html/boost_regex/ref/basic_regex.html b/doc/html/boost_regex/ref/basic_regex.html
    index 264865a5..30d3449c 100644
    --- a/doc/html/boost_regex/ref/basic_regex.html
    +++ b/doc/html/boost_regex/ref/basic_regex.html
    @@ -3,7 +3,7 @@
     
     basic_regex
     
    -
    +
     
     
     
    @@ -27,7 +27,7 @@
      basic_regex
     
     
    - + Synopsis
    #include <boost/regex.hpp>
    @@ -244,7 +244,7 @@
     } // namespace boost
     
    - + Description

    @@ -327,7 +327,7 @@ basic_regex.

    -

    Table 1. basic_regex default construction postconditions

    +

    Table 1. basic_regex default construction postconditions

    @@ -407,7 +407,7 @@ flags specified in f.

    -

    Table 2. Postconditions for basic_regex construction

    +

    Table 2. Postconditions for basic_regex construction

    @@ -512,7 +512,7 @@ specified in f.

    -

    Table 3. Postconditions for basic_regex construction

    +

    Table 3. Postconditions for basic_regex construction

    @@ -616,7 +616,7 @@ according the option flags specified in f.

    -

    Table 4. Postconditions for basic_regex construction

    +

    Table 4. Postconditions for basic_regex construction

    @@ -727,7 +727,7 @@ flags specified in f.

    -

    Table 5. Postconditions for basic_regex construction

    +

    Table 5. Postconditions for basic_regex construction

    @@ -829,7 +829,7 @@ flags specified in f.

    -

    Table 6. Postconditions for basic_regex construction

    +

    Table 6. Postconditions for basic_regex construction

    @@ -1043,7 +1043,7 @@ in f.

    -

    Table 7. Postconditions for basic_regex::assign

    +

    Table 7. Postconditions for basic_regex::assign

    diff --git a/doc/html/boost_regex/ref/concepts.html b/doc/html/boost_regex/ref/concepts.html index b64d9225..1ba9307e 100644 --- a/doc/html/boost_regex/ref/concepts.html +++ b/doc/html/boost_regex/ref/concepts.html @@ -3,7 +3,7 @@ Concepts - + diff --git a/doc/html/boost_regex/ref/concepts/charT_concept.html b/doc/html/boost_regex/ref/concepts/charT_concept.html index ad3a48b6..8a8d6879 100644 --- a/doc/html/boost_regex/ref/concepts/charT_concept.html +++ b/doc/html/boost_regex/ref/concepts/charT_concept.html @@ -3,7 +3,7 @@ charT Requirements - + diff --git a/doc/html/boost_regex/ref/concepts/iterator_concepts.html b/doc/html/boost_regex/ref/concepts/iterator_concepts.html index ca98a141..421ea25a 100644 --- a/doc/html/boost_regex/ref/concepts/iterator_concepts.html +++ b/doc/html/boost_regex/ref/concepts/iterator_concepts.html @@ -3,7 +3,7 @@ Iterator Requirements - + diff --git a/doc/html/boost_regex/ref/concepts/traits_concept.html b/doc/html/boost_regex/ref/concepts/traits_concept.html index ef09ab1c..3d051b83 100644 --- a/doc/html/boost_regex/ref/concepts/traits_concept.html +++ b/doc/html/boost_regex/ref/concepts/traits_concept.html @@ -3,7 +3,7 @@ Traits Class Requirements - + @@ -34,7 +34,7 @@ Boost-specific enhanced interface.

    - + Minimal requirements.
    @@ -381,7 +381,7 @@
    - + Additional Optional Requirements
    diff --git a/doc/html/boost_regex/ref/deprecated_interfaces.html b/doc/html/boost_regex/ref/deprecated_interfaces.html index caa8cd1f..784d146a 100644 --- a/doc/html/boost_regex/ref/deprecated_interfaces.html +++ b/doc/html/boost_regex/ref/deprecated_interfaces.html @@ -3,7 +3,7 @@ Deprecated Interfaces - + diff --git a/doc/html/boost_regex/ref/deprecated_interfaces/old_regex.html b/doc/html/boost_regex/ref/deprecated_interfaces/old_regex.html index a360c329..a79abdd6 100644 --- a/doc/html/boost_regex/ref/deprecated_interfaces/old_regex.html +++ b/doc/html/boost_regex/ref/deprecated_interfaces/old_regex.html @@ -3,7 +3,7 @@ High Level Class RegEx (Deprecated) - + diff --git a/doc/html/boost_regex/ref/deprecated_interfaces/regex_format.html b/doc/html/boost_regex/ref/deprecated_interfaces/regex_format.html index 62293232..8e482ee8 100644 --- a/doc/html/boost_regex/ref/deprecated_interfaces/regex_format.html +++ b/doc/html/boost_regex/ref/deprecated_interfaces/regex_format.html @@ -3,7 +3,7 @@ regex_format (Deprecated) - + @@ -34,7 +34,7 @@ previous version of Boost.Regex and will not be further updated:

    - + Algorithm regex_format
    diff --git a/doc/html/boost_regex/ref/deprecated_interfaces/regex_grep.html b/doc/html/boost_regex/ref/deprecated_interfaces/regex_grep.html index 1bd7f867..ae24f41c 100644 --- a/doc/html/boost_regex/ref/deprecated_interfaces/regex_grep.html +++ b/doc/html/boost_regex/ref/deprecated_interfaces/regex_grep.html @@ -3,7 +3,7 @@ regex_grep (Deprecated) - + diff --git a/doc/html/boost_regex/ref/deprecated_interfaces/regex_split.html b/doc/html/boost_regex/ref/deprecated_interfaces/regex_split.html index 1c34e94d..26643536 100644 --- a/doc/html/boost_regex/ref/deprecated_interfaces/regex_split.html +++ b/doc/html/boost_regex/ref/deprecated_interfaces/regex_split.html @@ -3,7 +3,7 @@ regex_split (deprecated) - + diff --git a/doc/html/boost_regex/ref/error_type.html b/doc/html/boost_regex/ref/error_type.html index d91f0e9e..331567be 100644 --- a/doc/html/boost_regex/ref/error_type.html +++ b/doc/html/boost_regex/ref/error_type.html @@ -3,7 +3,7 @@ error_type - + @@ -27,7 +27,7 @@ error_type
    - + Synopsis

    @@ -57,7 +57,7 @@ } // namespace boost

    - + Description

    diff --git a/doc/html/boost_regex/ref/match_flag_type.html b/doc/html/boost_regex/ref/match_flag_type.html index 50784ec1..102543d0 100644 --- a/doc/html/boost_regex/ref/match_flag_type.html +++ b/doc/html/boost_regex/ref/match_flag_type.html @@ -3,7 +3,7 @@ match_flag_type - + @@ -69,7 +69,7 @@ } // namespace boost

    - + Description

    diff --git a/doc/html/boost_regex/ref/match_results.html b/doc/html/boost_regex/ref/match_results.html index 20f0bd3d..6e8f7c0b 100644 --- a/doc/html/boost_regex/ref/match_results.html +++ b/doc/html/boost_regex/ref/match_results.html @@ -3,7 +3,7 @@ match_results - + @@ -27,7 +27,7 @@ match_results

    - + Synopsis
    #include <boost/regex.hpp>
    @@ -98,9 +98,33 @@
        bool empty() const;
        // element access:
        difference_type length(int sub = 0) const;
    +   difference_type length(const char_type* sub) const;
    +   template <class charT>
    +   difference_type length(const charT* sub) const;
    +   template <class charT, class Traits, class A>
    +   difference_type length(const std::basic_string<charT, Traits, A>& sub) const;
        difference_type position(unsigned int sub = 0) const;
    +   difference_type position(const char_type* sub) const;
    +   template <class charT>
    +   difference_type position(const charT* sub) const;
    +   template <class charT, class Traits, class A>
    +   difference_type position(const std::basic_string<charT, Traits, A>& sub) const;
        string_type str(int sub = 0) const;
    +   string_type str(const char_type* sub)const;
    +   template <class Traits, class A>
    +   string_type str(const std::basic_string<char_type, Traits, A>& sub)const;
    +   template <class charT>
    +   string_type str(const charT* sub)const;
    +   template <class charT, class Traits, class A>
    +   string_type str(const std::basic_string<charT, Traits, A>& sub)const;
        const_reference operator[](int n) const;
    +   const_reference operator[](const char_type* n) const;
    +   template <class Traits, class A>
    +   const_reference operator[](const std::basic_string<char_type, Traits, A>& n) const;
    +   template <class charT>
    +   const_reference operator[](const charT* n) const;
    +   template <class charT, class Traits, class A>
    +   const_reference operator[](const std::basic_string<charT, Traits, A>& n) const;
     
        const_reference prefix() const;
     
    @@ -142,7 +166,7 @@
              match_results<BidirectionalIterator, Allocator>& m2);
     
    - + Description

    @@ -375,14 +399,39 @@

    difference_type length(int sub = 0)const;
    +difference_type length(const char_type* sub)const;
    +template <class charT>
    +difference_type length(const charT* sub)const;
    +template <class charT, class Traits, class A>
    +difference_type length(const std::basic_string<charT, Traits, A>&)const;
     

    Effects: Returns the length of sub-expression sub, that is to say: (*this)[sub].length().

    +

    + The overloads that accept a string refer to a named sub-expression n. + In the event that there is no such named sub-expression then the function + returns zero. +

    +

    + The template overloads of this function allow the string and/or character + type to be different from the character type of the underlying sequence and/or + regular expression: in this case the characters will be widened to the underlying + character type of the original regular expression. A compiler error will + occur if the argument passes a wider character type than the underlying sequence. + These overloads allow a normal narrow character C string literal to be used + as an argument, even when the underlying character type of the expression + being matched may be something more exotic such as a Unicode character type. +

    difference_type position(unsigned int sub = 0)const;
    +difference_type position(const char_type* sub)const;
    +template <class charT>
    +difference_type position(const charT* sub)const;
    +template <class charT, class Traits, class A>
    +difference_type position(const std::basic_string<charT, Traits, A>&)const;
     

    Effects: Returns the starting location of @@ -391,17 +440,61 @@ will return the location of the partial match even though (*this)[0].matched is false.

    +

    + The overloads that accept a string refer to a named sub-expression n. + In the event that there is no such named sub-expression then the function + returns -1. +

    +

    + The template overloads of this function allow the string and/or character + type to be different from the character type of the underlying sequence and/or + regular expression: in this case the characters will be widened to the underlying + character type of the original regular expression. A compiler error will + occur if the argument passes a wider character type than the underlying sequence. + These overloads allow a normal narrow character C string literal to be used + as an argument, even when the underlying character type of the expression + being matched may be something more exotic such as a Unicode character type. +

    string_type str(int sub = 0)const;
    +string_type str(const char_type* sub)const;
    +template <class Traits, class A>
    +string_type str(const std::basic_string<char_type, Traits, A>& sub)const;
    +template <class charT>
    +string_type str(const charT* sub)const;
    +template <class charT, class Traits, class A>
    +string_type str(const std::basic_string<charT, Traits, A>& sub)const;
     

    Effects: Returns sub-expression sub as a string: string_type((*this)[sub]).

    +

    + The overloads that accept a string return the string that matched the named + sub-expression n. In the event that there is no such + named sub-expression then returns an empty string. +

    +

    + The template overloads of this function allow the string and/or character + type to be different from the character type of the underlying sequence and/or + regular expression: in this case the characters will be widened to the underlying + character type of the original regular expression. A compiler error will + occur if the argument passes a wider character type than the underlying sequence. + These overloads allow a normal narrow character C string literal to be used + as an argument, even when the underlying character type of the expression + being matched may be something more exotic such as a Unicode character type. +

    const_reference operator[](int n) const;
    +const_reference operator[](const char_type* n) const;
    +template <class Traits, class A>
    +const_reference operator[](const std::basic_string<char_type, Traits, A>& n) const;
    +template <class charT>
    +const_reference operator[](const charT* n) const;
    +template <class charT, class Traits, class A>
    +const_reference operator[](const std::basic_string<charT, Traits, A>& n) const;
     

    Effects: Returns a reference to the sub_match @@ -413,6 +506,22 @@ then returns a sub_match object whose matched member is false.

    +

    + The overloads that accept a string return a reference to the sub_match object representing the + character sequence that matched the named sub-expression n. + In the event that there is no such named sub-expression then returns a sub_match + object whose matched member is false. +

    +

    + The template overloads of this function allow the string and/or character + type to be different from the character type of the underlying sequence and/or + regular expression: in this case the characters will be widened to the underlying + character type of the original regular expression. A compiler error will + occur if the argument passes a wider character type than the underlying sequence. + These overloads allow a normal narrow character C string literal to be used + as an argument, even when the underlying character type of the expression + being matched may be something more exotic such as a Unicode character type. +

    const_reference prefix()const;
    diff --git a/doc/html/boost_regex/ref/non_std_strings.html b/doc/html/boost_regex/ref/non_std_strings.html
    index 5f38cc26..4681e22b 100644
    --- a/doc/html/boost_regex/ref/non_std_strings.html
    +++ b/doc/html/boost_regex/ref/non_std_strings.html
    @@ -3,7 +3,7 @@
     
     Interfacing With Non-Standard String Types
     
    -
    +
     
     
     
    diff --git a/doc/html/boost_regex/ref/non_std_strings/icu.html b/doc/html/boost_regex/ref/non_std_strings/icu.html
    index 2e7fdaf3..c369d563 100644
    --- a/doc/html/boost_regex/ref/non_std_strings/icu.html
    +++ b/doc/html/boost_regex/ref/non_std_strings/icu.html
    @@ -3,7 +3,7 @@
     
     Working With Unicode and ICU String Types
     
    -
    +
     
     
     
    diff --git a/doc/html/boost_regex/ref/non_std_strings/icu/intro.html b/doc/html/boost_regex/ref/non_std_strings/icu/intro.html
    index 0f426829..69b659bf 100644
    --- a/doc/html/boost_regex/ref/non_std_strings/icu/intro.html
    +++ b/doc/html/boost_regex/ref/non_std_strings/icu/intro.html
    @@ -3,7 +3,7 @@
     
     Introduction to using Regex with ICU
     
    -
    +
     
     
     
    diff --git a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html
    index e636e849..4ea31468 100644
    --- a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html
    +++ b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_algo.html
    @@ -3,7 +3,7 @@
     
     Unicode Regular Expression Algorithms
     
    -
    +
     
     
     
    @@ -43,7 +43,7 @@
                 on to the "real" algorithm.
               

    - + u32regex_match

    @@ -89,7 +89,7 @@ }

    - + u32regex_search

    @@ -128,7 +128,7 @@ }

    - + u32regex_replace

    diff --git a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_iter.html b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_iter.html index 0fbd6483..892034a2 100644 --- a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_iter.html +++ b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_iter.html @@ -3,7 +3,7 @@ Unicode Aware Regex Iterators - + @@ -28,7 +28,7 @@ Unicode Aware Regex Iterators

    - + u32regex_iterator

    @@ -126,7 +126,7 @@ Provided of course that the input is encoded as UTF-8.

    - + u32regex_token_iterator

    diff --git a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_types.html b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_types.html index cefbc15f..6c64febb 100644 --- a/doc/html/boost_regex/ref/non_std_strings/icu/unicode_types.html +++ b/doc/html/boost_regex/ref/non_std_strings/icu/unicode_types.html @@ -3,7 +3,7 @@ Unicode regular expression types - + diff --git a/doc/html/boost_regex/ref/non_std_strings/mfc_strings.html b/doc/html/boost_regex/ref/non_std_strings/mfc_strings.html index 4592ae3d..e31a7b9c 100644 --- a/doc/html/boost_regex/ref/non_std_strings/mfc_strings.html +++ b/doc/html/boost_regex/ref/non_std_strings/mfc_strings.html @@ -3,7 +3,7 @@ Using Boost Regex With MFC Strings - + diff --git a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_algo.html b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_algo.html index 1f73a084..6148023b 100644 --- a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_algo.html +++ b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_algo.html @@ -3,7 +3,7 @@ Overloaded Algorithms For MFC String Types - + @@ -34,7 +34,7 @@ here they are anyway:

    - + regex_match

    @@ -82,7 +82,7 @@ }

    - + regex_match (second overload)
    @@ -110,7 +110,7 @@ }
    - + regex_search

    @@ -149,7 +149,7 @@ }

    - + regex_search (second overload)
    @@ -164,7 +164,7 @@ + s.GetLength(), e, f);

    - + regex_replace

    diff --git a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_intro.html b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_intro.html index b9c60c1b..9ccedc23 100644 --- a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_intro.html +++ b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_intro.html @@ -3,7 +3,7 @@ Introduction to Boost.Regex and MFC Strings - + diff --git a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_iter.html b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_iter.html index 1cae51e5..54858bc8 100644 --- a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_iter.html +++ b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_iter.html @@ -3,7 +3,7 @@ Iterating Over the Matches Within An MFC String - + @@ -32,7 +32,7 @@ an MFC/ATL string to a regex_iterator or regex_token_iterator:

    - + regex_iterator creation helper
    @@ -68,7 +68,7 @@ }
    - + regex_token_iterator creation helpers
    diff --git a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_regex_create.html b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_regex_create.html index 4279151e..487a6ed0 100644 --- a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_regex_create.html +++ b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_regex_create.html @@ -3,7 +3,7 @@ Regular Expression Creation From an MFC String - + diff --git a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_regex_types.html b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_regex_types.html index fe00bb69..3cfa2cfc 100644 --- a/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_regex_types.html +++ b/doc/html/boost_regex/ref/non_std_strings/mfc_strings/mfc_regex_types.html @@ -3,7 +3,7 @@ Regex Types Used With MFC Strings - + diff --git a/doc/html/boost_regex/ref/posix.html b/doc/html/boost_regex/ref/posix.html index b60ac8c4..6424b99c 100644 --- a/doc/html/boost_regex/ref/posix.html +++ b/doc/html/boost_regex/ref/posix.html @@ -3,7 +3,7 @@ POSIX Compatible C API's - + @@ -165,7 +165,7 @@

    - + regcomp

    @@ -379,7 +379,7 @@

    - + regerror

    @@ -467,7 +467,7 @@

    - + regexec

    @@ -537,7 +537,7 @@

    - + regfree

    diff --git a/doc/html/boost_regex/ref/regex_iterator.html b/doc/html/boost_regex/ref/regex_iterator.html index 80561efa..2a101c5e 100644 --- a/doc/html/boost_regex/ref/regex_iterator.html +++ b/doc/html/boost_regex/ref/regex_iterator.html @@ -3,7 +3,7 @@ regex_iterator - + @@ -78,7 +78,7 @@ regex_constants::match_flag_type m = regex_constants::match_default);

    - + Description

    @@ -436,7 +436,7 @@ m.

    - + Examples

    diff --git a/doc/html/boost_regex/ref/regex_match.html b/doc/html/boost_regex/ref/regex_match.html index 7e8ea6b0..9b1a7108 100644 --- a/doc/html/boost_regex/ref/regex_match.html +++ b/doc/html/boost_regex/ref/regex_match.html @@ -3,7 +3,7 @@ regex_match - + @@ -80,7 +80,7 @@ match_flag_type flags = match_default);

    - + Description
    template <class BidirectionalIterator, class Allocator, class charT, class traits>
    @@ -360,7 +360,7 @@
             Effects: Returns the result of regex_match(s.begin(), s.end(), e, flags).
           

    - + Examples

    diff --git a/doc/html/boost_regex/ref/regex_replace.html b/doc/html/boost_regex/ref/regex_replace.html index 1135203f..70a65308 100644 --- a/doc/html/boost_regex/ref/regex_replace.html +++ b/doc/html/boost_regex/ref/regex_replace.html @@ -3,7 +3,7 @@ regex_replace - + @@ -53,7 +53,7 @@ match_flag_type flags = match_default);

    - + Description
    template <class OutputIterator, class BidirectionalIterator, class traits, class charT>
    @@ -163,7 +163,7 @@
             and then returns result.
           

    - + Examples

    diff --git a/doc/html/boost_regex/ref/regex_search.html b/doc/html/boost_regex/ref/regex_search.html index 35f3a630..44faddee 100644 --- a/doc/html/boost_regex/ref/regex_search.html +++ b/doc/html/boost_regex/ref/regex_search.html @@ -3,7 +3,7 @@ regex_search - + @@ -73,7 +73,7 @@ match_flag_type flags = match_default);

    - + Description
    template <class BidirectionalIterator, class Allocator, class charT, class traits>
    @@ -355,7 +355,7 @@
             Effects: Returns the result of regex_search(s.begin(), s.end(), e, flags).
           

    - + Examples

    diff --git a/doc/html/boost_regex/ref/regex_token_iterator.html b/doc/html/boost_regex/ref/regex_token_iterator.html index 0df2dcd4..8728ee16 100644 --- a/doc/html/boost_regex/ref/regex_token_iterator.html +++ b/doc/html/boost_regex/ref/regex_token_iterator.html @@ -3,7 +3,7 @@ regex_token_iterator - + @@ -136,7 +136,7 @@ regex_constants::match_flag_type m = regex_constants::match_default);

    - + Description

    @@ -383,7 +383,7 @@ m.

    - + Examples

    diff --git a/doc/html/boost_regex/ref/regex_traits.html b/doc/html/boost_regex/ref/regex_traits.html index 0d515761..cccf4917 100644 --- a/doc/html/boost_regex/ref/regex_traits.html +++ b/doc/html/boost_regex/ref/regex_traits.html @@ -3,7 +3,7 @@ regex_traits - + @@ -46,7 +46,7 @@ } // namespace boost

    - + Description

    diff --git a/doc/html/boost_regex/ref/sub_match.html b/doc/html/boost_regex/ref/sub_match.html index bcc5eed7..999f0ce4 100644 --- a/doc/html/boost_regex/ref/sub_match.html +++ b/doc/html/boost_regex/ref/sub_match.html @@ -3,7 +3,7 @@ sub_match - + @@ -329,11 +329,11 @@ } // namespace boost

    - + Description
    - + Members

    @@ -473,7 +473,7 @@

    - + sub_match non-member operators
    @@ -1008,7 +1008,7 @@ + m2.str().

    - + Stream inserter

    diff --git a/doc/html/boost_regex/ref/syntax_option_type.html b/doc/html/boost_regex/ref/syntax_option_type.html index 16e179c8..c5a27483 100644 --- a/doc/html/boost_regex/ref/syntax_option_type.html +++ b/doc/html/boost_regex/ref/syntax_option_type.html @@ -3,7 +3,7 @@ syntax_option_type - + diff --git a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_basic.html b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_basic.html index df577ade..b7d7d225 100644 --- a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_basic.html +++ b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_basic.html @@ -3,7 +3,7 @@ Options for POSIX Basic Regular Expressions - + diff --git a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_extended.html b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_extended.html index 64163afc..f0895bc2 100644 --- a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_extended.html +++ b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_extended.html @@ -3,7 +3,7 @@ Options for POSIX Extended Regular Expressions - + diff --git a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_literal.html b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_literal.html index c948c3f6..ce3d6437 100644 --- a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_literal.html +++ b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_literal.html @@ -3,7 +3,7 @@ Options for Literal Strings - + diff --git a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_overview.html b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_overview.html index 1ece1a8b..0b3534fb 100644 --- a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_overview.html +++ b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_overview.html @@ -3,7 +3,7 @@ Overview of syntax_option_type - + diff --git 
a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_perl.html b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_perl.html index 53eea33c..b7f8f77a 100644 --- a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_perl.html +++ b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_perl.html @@ -3,7 +3,7 @@ Options for Perl Regular Expressions - + diff --git a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_synopsis.html b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_synopsis.html index 8ca8f042..9168914b 100644 --- a/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_synopsis.html +++ b/doc/html/boost_regex/ref/syntax_option_type/syntax_option_type_synopsis.html @@ -3,7 +3,7 @@ syntax_option_type Synopsis - + diff --git a/doc/html/boost_regex/syntax.html b/doc/html/boost_regex/syntax.html index ff46f19b..7cc7ca4c 100644 --- a/doc/html/boost_regex/syntax.html +++ b/doc/html/boost_regex/syntax.html @@ -3,7 +3,7 @@ Regular Expression Syntax - + diff --git a/doc/html/boost_regex/syntax/basic_extended.html b/doc/html/boost_regex/syntax/basic_extended.html index fe1d4cfa..6821fd0d 100644 --- a/doc/html/boost_regex/syntax/basic_extended.html +++ b/doc/html/boost_regex/syntax/basic_extended.html @@ -3,7 +3,7 @@ POSIX Extended Regular Expression Syntax - + @@ -28,7 +28,7 @@ Expression Syntax

    - + Synopsis

    @@ -46,7 +46,7 @@

    - + POSIX Extended Syntax

    @@ -56,7 +56,7 @@

    .[{()\*+?|^$
    - + Wildcard:

    @@ -74,7 +74,7 @@

    - + Anchors:

    @@ -86,7 +86,7 @@ of an expression, or the last character of a sub-expression.

    - + Marked sub-expressions:
    @@ -98,7 +98,7 @@ to by a back-reference.

    - + Repeats:

    @@ -184,7 +184,7 @@ cab operator to be applied to.

    - + Back references:

    @@ -214,7 +214,7 @@ cab

    - + Alternation

    @@ -227,7 +227,7 @@ cab will match either of "abd" or "abef".

    - + Character sets:
    @@ -240,7 +240,7 @@ cab A bracket expression may contain any combination of the following:

    - + Single characters:
    @@ -249,7 +249,7 @@ cab or 'c'.

    - + Character ranges:
    @@ -265,7 +265,7 @@ cab the code points of the characters only.

    - + Negation:

    @@ -274,7 +274,7 @@ cab range a-c.

    - + Character classes:
    @@ -284,7 +284,7 @@ cab character class names.

    - + Collating Elements:
    @@ -312,7 +312,7 @@ cab matches a NUL character.

    - + Equivalence classes:
    @@ -329,7 +329,7 @@ cab or even all locales on one platform.

    - + Combinations:

    @@ -337,7 +337,7 @@ cab [[:digit:]a-c[.NUL.]].

    - + Escapes

    @@ -363,7 +363,7 @@ cab extensions are also supported by Boost.Regex:

    - + Escapes matching a specific character
    @@ -552,7 +552,7 @@ cab
    - + "Single character" character classes:
    @@ -706,7 +706,7 @@ cab
    - + Character Properties
    @@ -813,7 +813,7 @@ cab matches any "digit" character, as does \p{digit}.

    - + Word Boundaries

    @@ -888,7 +888,7 @@ cab

    - + Buffer boundaries
    @@ -979,7 +979,7 @@ cab
    - + Continuation Escape
    @@ -991,7 +991,7 @@ cab match to start where the last one ended.

    - + Quoting escape
    @@ -1005,7 +1005,7 @@ cab \*+aaa
    - + Unicode escapes
    @@ -1056,7 +1056,7 @@ cab
    - + Any other escape
    @@ -1065,7 +1065,7 @@ cab \@ matches a literal '@'.

    - + Operator precedence
    @@ -1101,7 +1101,7 @@ cab
    - + What Gets Matched
    @@ -1111,11 +1111,11 @@ cab rule.

    - + Variations

    - + Egrep

    @@ -1136,7 +1136,7 @@ cab used with the -E option.

    - + awk

    @@ -1150,7 +1150,7 @@ cab these by default anyway.

    - + Options

    @@ -1163,7 +1163,7 @@ cab modify how the case and locale sensitivity are to be applied.

    - + References

    diff --git a/doc/html/boost_regex/syntax/basic_syntax.html b/doc/html/boost_regex/syntax/basic_syntax.html index 87d92ff1..0c6e6604 100644 --- a/doc/html/boost_regex/syntax/basic_syntax.html +++ b/doc/html/boost_regex/syntax/basic_syntax.html @@ -3,7 +3,7 @@ POSIX Basic Regular Expression Syntax - + @@ -28,7 +28,7 @@ Expression Syntax

    - + Synopsis

    @@ -45,7 +45,7 @@

    - + POSIX Basic Syntax

    @@ -55,7 +55,7 @@

    .[\*^$
    - + Wildcard:

    @@ -73,7 +73,7 @@

    - + Anchors:

    @@ -85,7 +85,7 @@ of an expression, or the last character of a sub-expression.

    - + Marked sub-expressions:
    @@ -97,7 +97,7 @@ by a back-reference.

    - + Repeats:

    @@ -155,7 +155,7 @@ aaaa to.

    - + Back references:

    @@ -173,7 +173,7 @@ aaaa

    aaabba
    - + Character sets:
    @@ -186,7 +186,7 @@ aaaa A bracket expression may contain any combination of the following:

    - + Single characters:
    @@ -195,7 +195,7 @@ aaaa or 'c'.

    - + Character ranges:
    @@ -211,7 +211,7 @@ aaaa of the characters only.

    - + Negation:

    @@ -220,7 +220,7 @@ aaaa range a-c.

    - + Character classes:
    @@ -230,7 +230,7 @@ aaaa character class names.

    - + Collating Elements:
    @@ -259,7 +259,7 @@ aaaa element names.

    - + Equivalence classes:
    @@ -276,7 +276,7 @@ aaaa or even all locales on one platform.

    - + Combinations:

    @@ -284,7 +284,7 @@ aaaa [[:digit:]a-c[.NUL.]].

    - + Escapes

    @@ -299,7 +299,7 @@ aaaa will match either a literal '\' or a '^'.

    - + What Gets Matched

    @@ -309,13 +309,13 @@ aaaa rule.

    - + Variations

    - + Grep

    @@ -333,7 +333,7 @@ aaaa As its name suggests, this behavior is consistent with the Unix utility grep.

    - + emacs

    @@ -613,7 +613,7 @@ aaaa leftmost-longest rule.

    - + Options

    @@ -627,7 +627,7 @@ aaaa options modify how the case and locale sensitivity are to be applied.

    - + References

    diff --git a/doc/html/boost_regex/syntax/character_classes.html b/doc/html/boost_regex/syntax/character_classes.html index 730370e1..8611af01 100644 --- a/doc/html/boost_regex/syntax/character_classes.html +++ b/doc/html/boost_regex/syntax/character_classes.html @@ -3,7 +3,7 @@ Character Class Names - + diff --git a/doc/html/boost_regex/syntax/character_classes/optional_char_class_names.html b/doc/html/boost_regex/syntax/character_classes/optional_char_class_names.html index 13e92c0d..0c689b93 100644 --- a/doc/html/boost_regex/syntax/character_classes/optional_char_class_names.html +++ b/doc/html/boost_regex/syntax/character_classes/optional_char_class_names.html @@ -3,7 +3,7 @@ Character classes that are supported by Unicode Regular Expressions - + diff --git a/doc/html/boost_regex/syntax/character_classes/std_char_clases.html b/doc/html/boost_regex/syntax/character_classes/std_char_clases.html index f662c10c..f8ddd4c3 100644 --- a/doc/html/boost_regex/syntax/character_classes/std_char_clases.html +++ b/doc/html/boost_regex/syntax/character_classes/std_char_clases.html @@ -3,7 +3,7 @@ Character Classes that are Always Supported - + diff --git a/doc/html/boost_regex/syntax/collating_names.html b/doc/html/boost_regex/syntax/collating_names.html index c2ff15a2..b3a8e1c2 100644 --- a/doc/html/boost_regex/syntax/collating_names.html +++ b/doc/html/boost_regex/syntax/collating_names.html @@ -3,7 +3,7 @@ Collating Names - + diff --git a/doc/html/boost_regex/syntax/collating_names/digraphs.html b/doc/html/boost_regex/syntax/collating_names/digraphs.html index 661301ad..7affbae9 100644 --- a/doc/html/boost_regex/syntax/collating_names/digraphs.html +++ b/doc/html/boost_regex/syntax/collating_names/digraphs.html @@ -3,7 +3,7 @@ Digraphs - + diff --git a/doc/html/boost_regex/syntax/collating_names/named_unicode.html b/doc/html/boost_regex/syntax/collating_names/named_unicode.html index fdadc6e9..acb51cb1 100644 --- 
a/doc/html/boost_regex/syntax/collating_names/named_unicode.html +++ b/doc/html/boost_regex/syntax/collating_names/named_unicode.html @@ -3,7 +3,7 @@ Named Unicode Characters - + diff --git a/doc/html/boost_regex/syntax/collating_names/posix_symbolic_names.html b/doc/html/boost_regex/syntax/collating_names/posix_symbolic_names.html index 99d05b20..12bdded3 100644 --- a/doc/html/boost_regex/syntax/collating_names/posix_symbolic_names.html +++ b/doc/html/boost_regex/syntax/collating_names/posix_symbolic_names.html @@ -3,7 +3,7 @@ POSIX Symbolic Names - + diff --git a/doc/html/boost_regex/syntax/leftmost_longest_rule.html b/doc/html/boost_regex/syntax/leftmost_longest_rule.html index 951db6aa..4ea80846 100644 --- a/doc/html/boost_regex/syntax/leftmost_longest_rule.html +++ b/doc/html/boost_regex/syntax/leftmost_longest_rule.html @@ -3,7 +3,7 @@ The Leftmost Longest Rule - + diff --git a/doc/html/boost_regex/syntax/perl_syntax.html b/doc/html/boost_regex/syntax/perl_syntax.html index b4f5427f..cc7a368a 100644 --- a/doc/html/boost_regex/syntax/perl_syntax.html +++ b/doc/html/boost_regex/syntax/perl_syntax.html @@ -3,7 +3,7 @@ Perl Regular Expression Syntax - + @@ -28,15 +28,13 @@ Syntax

    - + Synopsis

    The Perl regular expression syntax is based on that used by the programming language Perl . Perl regular expressions are the default behavior in Boost.Regex - or you can pass the flag perl - to the basic_regex - constructor, for example: + or you can pass the flag perl to the basic_regex constructor, for example:

    // e1 is a case sensitive Perl regular expression: 
     // since Perl is the default option there's no need to explicitly specify the syntax used here:
    @@ -45,7 +43,7 @@
     boost::regex e2(my_expression, boost::regex::perl|boost::regex::icase);
     

    - + Perl Regular Expression Syntax

    @@ -55,7 +53,7 @@

    .[{()\*+?|^$
    - + Wildcard

    @@ -65,17 +63,17 @@

    - + Anchors

    @@ -85,19 +83,18 @@ A '$' character shall match the end of a line.

    - + Marked sub-expressions

    - A section beginning ( and ending - ) acts as a marked sub-expression. - Whatever matched the sub-expression is split out in a separate field by the - matching algorithms. Marked sub-expressions can also repeated, or referred - to by a back-reference. + A section beginning ( and ending ) + acts as a marked sub-expression. Whatever matched the sub-expression is split + out in a separate field by the matching algorithms. Marked sub-expressions + can also be repeated, or referred to by a back-reference.

    - + Non-marking grouping
    @@ -105,34 +102,32 @@ A marked sub-expression is useful to lexically group part of a regular expression, but has the side-effect of spitting out an extra field in the result. As an alternative you can lexically group part of a regular expression, without - generating a marked sub-expression by using (?: - and ) , for example (?:ab)+ - will repeat ab without splitting - out any separate sub-expressions. + generating a marked sub-expression by using (?: and ) + , for example (?:ab)+ will repeat ab + without splitting out any separate sub-expressions.

    - + Repeats

    Any atom (a single character, a marked sub-expression, or a character class) - can be repeated with the *, - +, ?, - and {} operators. + can be repeated with the *, +, ?, + and {} operators.

    - The * operator will match the - preceding atom zero or more times, for example the expression a*b - will match any of the following: + The * operator will match the preceding atom zero or more + times, for example the expression a*b will match any of + the following:

    b
     ab
     aaaaaaaab
     

    - The + operator will match the - preceding atom one or more times, for example the expression a+b - will match any of the following: + The + operator will match the preceding atom one or more + times, for example the expression a+b will match any of + the following:

    ab
     aaaaaaaab
    @@ -143,9 +138,8 @@
     
    b
     

    - The ? operator will match the - preceding atom zero or one times, for example the expression ca?b will match - any of the following: + The ? operator will match the preceding atom zero or one + times, for example the expression ca?b will match any of the following:

    cb
     cab
    @@ -159,16 +153,13 @@
             An atom can also be repeated with a bounded repeat:
           

    - a{n} Matches - 'a' repeated exactly n times. + a{n} Matches 'a' repeated exactly n times.

    - a{n,} Matches - 'a' repeated n or more times. + a{n,} Matches 'a' repeated n or more times.

    - a{n, m} Matches 'a' repeated between n and m times - inclusive. + a{n, m} Matches 'a' repeated between n and m times inclusive.

    For example: @@ -193,11 +184,11 @@

    a(*)
     

    - Will raise an error, as there is nothing for the * - operator to be applied to. + Will raise an error, as there is nothing for the * operator + to be applied to.

    - + Non greedy repeats
    @@ -207,28 +198,59 @@ that will consume as little input as possible while still producing a match.

    - *? Matches the previous atom - zero or more times, while consuming as little input as possible. + *? Matches the previous atom zero or more times, while + consuming as little input as possible.

    - +? Matches the previous atom - one or more times, while consuming as little input as possible. + +? Matches the previous atom one or more times, while + consuming as little input as possible.

    - ?? Matches the previous atom - zero or one times, while consuming as little input as possible. + ?? Matches the previous atom zero or one times, while + consuming as little input as possible.

    - {n,}? Matches the previous atom n or more times, + {n,}? Matches the previous atom n or more times, while + consuming as little input as possible. +

    +

    + {n,m}? Matches the previous atom between n and m times, while consuming as little input as possible.

    +
    + + Possessive + repeats +

    - {n,m}? - Matches the previous atom between n and m times, while consuming as little - input as possible. + By default when a repeated pattern does not match then the engine will backtrack + until a match is found. However, this behaviour can sometimes be undesirable + so there are also "possessive" repeats: these match as much as possible + and do not then allow backtracking if the rest of the expression fails to + match. +

    +

    + *+ Matches the previous atom zero or more times, while + giving nothing back. +

    +

    + ++ Matches the previous atom one or more times, while + giving nothing back. +

    +

    + ?+ Matches the previous atom zero or one times, while + giving nothing back. +

    +

    + {n,}+ Matches the previous atom n or more times, while + giving nothing back. +

    +

    + {n,m}+ Matches the previous atom between n and m times, + while giving nothing back.

    - + Back references

    @@ -247,99 +269,187 @@

    aaabba
     
    +

    + You can also use the \g escape for the same function, for example: +

    +
    ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    + Escape +

    +
    +

    + Meaning +

    +
    +

    + \g1 +

    +
    +

    + Match whatever matched sub-expression 1 +

    +
    +

    + \g{1} +

    +
    +

    + Match whatever matched sub-expression 1: this form allows for safer + parsing of the expression in cases like \g{1}2 + or for indexes higher than 9 as in \g{1234} +

    +
    +

    + \g-1 +

    +
    +

    + Match whatever matched the last opened sub-expression +

    +
    +

    + \g{-2} +

    +
    +

    + Match whatever matched the last but one opened sub-expression +

    +
    +

    + \g{one} +

    +
    +

    + Match whatever matched the sub-expression named "one" +

    +
    +

    + Finally the \k escape can be used to refer to named subexpressions, for example + \k<two> will match whatever matched the subexpression + named "two". +

    - + Alternation

    - The | operator will match either - of its arguments, so for example: abc|def will - match either "abc" or "def". + The | operator will match either of its arguments, so + for example: abc|def will match either "abc" + or "def".

    - Parenthesis can be used to group alternations, for example: ab(d|ef) + Parentheses can be used to group alternations, for example: ab(d|ef) will match either of "abd" or "abef".

    Empty alternatives are not allowed (these are almost always a mistake), but - if you really want an empty alternative use (?:) - as a placeholder, for example: + if you really want an empty alternative use (?:) as a + placeholder, for example:

    - |abc - is not a valid expression, but + |abc is not a valid expression, but

    - (?:)|abc - is and is equivalent, also the expression: + (?:)|abc is and is equivalent, also the expression:

    - (?:abc)?? has exactly the same effect. + (?:abc)?? has exactly the same effect.

    - + Character sets

    - A character set is a bracket-expression starting with [ - and ending with ], it defines - a set of characters, and matches any single character that is a member of - that set. + A character set is a bracket-expression starting with [ + and ending with ], it defines a set of characters, and + matches any single character that is a member of that set.

    A bracket expression may contain any combination of the following:

    - + Single characters

    - For example [abc], will match any of the characters 'a', 'b', - or 'c'. + For example [abc], will match any of the characters 'a', + 'b', or 'c'.

    - + Character ranges

    - For example [a-c] - will match any single character in the range 'a' to 'c'. By default, for - Perl regular expressions, a character x is within the range y to z, if the - code point of the character lies within the codepoints of the endpoints of - the range. Alternatively, if you set the collate flag when constructing the - regular expression, then ranges are locale sensitive. + For example [a-c] will match any single character in the + range 'a' to 'c'. By default, for Perl regular expressions, a character x + is within the range y to z, if the code point of the character lies within + the codepoints of the endpoints of the range. Alternatively, if you set the + collate + flag when constructing the regular expression, then ranges are locale + sensitive.

    - + Negation

    If the bracket-expression begins with the ^ character, then it matches the - complement of the characters it contains, for example [^a-c] matches any character that is not in the - range a-c. + complement of the characters it contains, for example [^a-c] + matches any character that is not in the range a-c.

    - + Character classes

    - An expression of the form [[:name:]] - matches the named character class "name", for example [[:lower:]] matches any lower case character. See - character class names. + An expression of the form [[:name:]] matches the named + character class "name", for example [[:lower:]] + matches any lower case character. See character + class names.

    - + Collating Elements

    - An expression of the form [[.col.] matches - the collating element col. A collating element is any - single character, or any sequence of characters that collates as a single - unit. Collating elements may also be used as the end point of a range, for - example: [[.ae.]-c] + An expression of the form [[.col.]] matches the collating + element col. A collating element is any single character, + or any sequence of characters that collates as a single unit. Collating elements + may also be used as the end point of a range, for example: [[.ae.]-c] matches the character sequence "ae", plus any single character in the range "ae"-c, assuming that "ae" is treated as a single collating element in the current locale. @@ -350,28 +460,27 @@

    [[.NUL.]]
     

    - matches a \0 - character. + matches a \0 character.

    - + Equivalence classes

    - An expression of the form [[=col=]], - matches any character or collating element whose primary sort key is the - same as that for collating element col, as with collating - elements the name col may be a symbolic - name. A primary sort key is one that ignores case, accentation, or - locale-specific tailorings; so for example [[=a=]] matches + An expression of the form [[=col=]], matches any character + or collating element whose primary sort key is the same as that for collating + element col, as with collating elements the name col + may be a symbolic name. + A primary sort key is one that ignores case, accentation, or locale-specific + tailorings; so for example [[=a=]] matches any of the characters: a, À, Á, Â, Ã, Ä, Å, A, à, á, â, ã, ä and å. Unfortunately implementation of this is reliant on the platform's collation and localisation support; this feature can not be relied upon to work portably across all platforms, or even all locales on one platform.

    - + Escaped Characters
    @@ -383,15 +492,15 @@ is not a "word" character.

    - + Combinations

    All of the above can be combined in one character set declaration, for example: - [[:digit:]a-c[.NUL.]]. + [[:digit:]a-c[.NUL.]].

    - + Escapes

    @@ -421,105 +530,103 @@

    - \a + \a

    - \a + \a

    - \e + \e

    - 0x1B + 0x1B

    - \f + \f

    - \f + \f

    - \n + \n

    - \n + \n

    - \r + \r

    - \r + \r

    - \t + \t

    - \t + \t

    - \v - + \v

    - \v + \v

    - \b + \b

    - \b - (but only inside a character class declaration). + \b (but only inside a character class declaration).

    - \cX + \cX

    @@ -532,7 +639,7 @@

    - \xdd + \xdd

    @@ -545,7 +652,7 @@

    - \x{dddd} + \x{dddd}

    @@ -558,7 +665,7 @@

    - \0ddd + \0ddd

    @@ -571,20 +678,21 @@

    - \N{name} + \N{name}

    Matches the single character which has the symbolic - name name. For example \N{newline} matches the single character \n. + name name. For example \N{newline} + matches the single character \n.

    - + "Single character" character classes:
    @@ -676,6 +784,30 @@ + +

    + \h +

    + + +

    + Horizontal whitespace +

    + + + + +

    + \v +

    + + +

    + Vertical whitespace +

    + + +

    \D @@ -735,10 +867,34 @@

    + + +

    + \H +

    + + +

    + Not Horizontal whitespace +

    + + + + +

    + \V +

    + + +

    + Not Vertical whitespace +

    + +
    - + Character Properties
    @@ -842,33 +998,30 @@

    - For example \pd - matches any "digit" character, as does \p{digit}. + For example \pd matches any "digit" character, + as does \p{digit}.

    - + Word Boundaries

    The following escape sequences match the boundaries of words:

    - \< Matches the start of a - word. + \< Matches the start of a word.

    - \> Matches the end of a word. + \> Matches the end of a word.

    - \b - Matches a word boundary (the start or end of a word). + \b Matches a word boundary (the start or end of a word).

    - \B - Matches only when not at a word boundary. + \B Matches only when not at a word boundary.

    - + Buffer boundaries

    @@ -890,46 +1043,69 @@

    \Z Matches an optional sequence of newlines at the end of a buffer: equivalent - to the regular expression \n*\z + to the regular expression \n*\z

    - + Continuation Escape

    - The sequence \G - matches only at the end of the last match found, or at the start of the text - being matched if no previous match was found. This escape useful if you're - iterating over the matches contained within a text, and you want each subsequence - match to start where the last one ended. + The sequence \G matches only at the end of the last match + found, or at the start of the text being matched if no previous match was + found. This escape is useful if you're iterating over the matches contained + within a text, and you want each subsequent match to start where the last + one ended.

    - + Quoting escape

    - The escape sequence \Q - begins a "quoted sequence": all the subsequent characters are treated - as literals, until either the end of the regular expression or \E is found. - For example the expression: \Q\*+\Ea+ would match either of: + The escape sequence \Q begins a "quoted sequence": + all the subsequent characters are treated as literals, until either the end + of the regular expression or \E is found. For example the expression: \Q\*+\Ea+ + would match either of:

    \*+a
     \*+aaa
     
    - + Unicode escapes

    - \C - Matches a single code point: in Boost regex this has exactly the same effect - as a "." operator. \X Matches a combining character sequence: - that is any non-combining character followed by a sequence of zero or more - combining characters. + \C Matches a single code point: in Boost regex this has + exactly the same effect as a "." operator. \X + Matches a combining character sequence: that is any non-combining character + followed by a sequence of zero or more combining characters. +

    +
    + + Matching + Line Endings +
    +

    + The escape sequence \R matches any line ending character + sequence, specifically it is identical to the expression (?>\x0D\x0A?|[\x0A-\x0C\x85\x{2028}\x{2029}]). +

    +
    + + Keeping + back some text +
    +

    + \K Resets the start location of $0 to the current text + position: in other words everything to the left of \K is "kept back" + and does not form part of the regular expression match. $` is updated accordingly. +

    +

    + For example foo\Kbar matched against the text "foobar" + would return the match "bar" for $0 and "foo" for $`. + This can be used to simulate variable width lookbehind assertions.

    - + Any other escape
    @@ -938,57 +1114,98 @@ \@ matches a literal '@'.

    - + Perl Extended Patterns

    Perl-specific extensions to the regular expression syntax all start with - (?. + (?. +

    +
    + + Named + Subexpressions +
    +

    + You can create a named subexpression using: +

    +
    (?<NAME>expression)
    +
    +

    + Which can then be referred to by the name NAME. Alternatively + you can delimit the name using 'NAME' as in: +

    +
    (?'NAME'expression)
    +
    +

    + These named subexpressions can be referred to in a backreference using either + \g{NAME} or \k<NAME> and can + also be referred to by name in a Perl + format string for search and replace operations, or in the match_results member functions.

    - + Comments

    - (?# ... - ) is treated as a comment, it's contents - are ignored. + (?# ... ) is treated as a comment, its contents are ignored.

    - + Modifiers

    - (?imsx-imsx ... ) alters - which of the perl modifiers are in effect within the pattern, changes take - effect from the point that the block is first seen and extend to any enclosing - ). Letters before a '-' turn - that perl modifier on, letters afterward, turn it off. + (?imsx-imsx ... ) alters which of the perl modifiers are + in effect within the pattern, changes take effect from the point that the + block is first seen and extend to any enclosing ). Letters + before a '-' turn that perl modifier on, letters afterward, turn it off.

    - (?imsx-imsx:pattern) - applies the specified modifiers to pattern only. + (?imsx-imsx:pattern) applies the specified modifiers to + pattern only.

    - + Non-marking groups

    - (?:pattern) lexically groups pattern, without generating + (?:pattern) lexically groups pattern, without generating an additional sub-expression.

    +
    + + Branch reset +
    +

    + (?|pattern) resets the subexpression count at the start + of each "|" alternative within pattern. +

    +

    + The sub-expression count following this construct is that of whichever branch + had the largest number of sub-expressions. This construct is useful when + you want to capture one of a number of alternative matches in a single sub-expression + index. +

    +

    + In the following example the index of each sub-expression is shown below + the expression: +

    +
    # before  ---------------branch-reset----------- after        
    +/ ( a )  (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x
    +# 1            2         2  3        2     3     4
    +
    - + Lookahead

    - (?=pattern) consumes zero characters, only if pattern + (?=pattern) consumes zero characters, only if pattern matches.

    - (?!pattern) consumes zero characters, only if pattern + (?!pattern) consumes zero characters, only if pattern does not match.

    @@ -1003,54 +1220,106 @@ could be used to validate the password.

    - + Lookbehind

    - (?<=pattern) consumes zero characters, only if pattern + (?<=pattern) consumes zero characters, only if pattern could be matched against the characters preceding the current position (pattern must be of fixed length).

    - (?<!pattern) consumes zero characters, only if pattern + (?<!pattern) consumes zero characters, only if pattern could not be matched against the characters preceding the current position (pattern must be of fixed length).

    - + Independent sub-expressions

    - (?>pattern) pattern is matched + (?>pattern) pattern is matched independently of the surrounding patterns, the expression will never backtrack into pattern. Independent sub-expressions are typically used to improve performance; only the best possible match for pattern will be considered, if this doesn't allow the expression as a whole to match then no match is found at all.

    +
    + + Recursive + Expressions +
    +

    + (?N) (?-N) (?+N) + (?R) (?0) +

    +

    + (?R) and (?0) recurse to the start + of the entire pattern. +

    +

    + (?N) executes sub-expression N + recursively, for example (?2) will recurse to sub-expression + 2. +

    +

    + (?-N) and (?+N) + are relative recursions, so for example (?-1) recurses + to the last sub-expression to be declared, and (?+1) recurses + to the next sub-expression to be declared. +

    - + Conditional Expressions

    - (?(condition)yes-pattern|no-pattern) attempts to match yes-pattern - if the condition is true, otherwise attempts to match - no-pattern. + (?(condition)yes-pattern|no-pattern) attempts to match + yes-pattern if the condition is + true, otherwise attempts to match no-pattern.

    - (?(condition)yes-pattern) - attempts to match yes-pattern if the condition - is true, otherwise fails. + (?(condition)yes-pattern) attempts to match yes-pattern + if the condition is true, otherwise fails.

    - condition may be either a forward lookahead assert, - or the index of a marked sub-expression (the condition becomes true if the - sub-expression has been matched). + condition may be either: a forward lookahead assert, + the index of a marked sub-expression (the condition becomes true if the sub-expression + has been matched), or an index of a recursion (the condition becomes true + if we are executing directly inside the specified recursion).

    +

    + Here is a summary of the possible predicates: +

    +
      +
    • +(?(?=assert)yes-pattern|no-pattern) Executes yes-pattern + if the forward look-ahead assert matches, otherwise executes no-pattern. +
    • +
    • +(?(?!assert)yes-pattern|no-pattern) Executes yes-pattern + if the forward look-ahead assert does not match, otherwise executes no-pattern. +
    • +
    • +(?(R)yes-pattern|no-pattern) Executes yes-pattern + if we are executing inside a recursion, otherwise executes no-pattern. +
    • +
    • +(?(RN)yes-pattern|no-pattern) Executes + yes-pattern if we are executing inside a recursion + to sub-expression N, otherwise executes no-pattern. +
    • +
    • +(?(DEFINE)never-executed-pattern) Defines a block of + code that is never executed and matches no characters: this is usually + used to define one or more named sub-expressions which are referred to from + elsewhere in the pattern. +
    • +
    - + Operator precedence
    @@ -1063,17 +1332,16 @@ [::] [..]
  • - Escaped characters \ + Escaped characters \
  • Character set (bracket expression) []
  • - Grouping () + Grouping ()
  • - Single-character-ERE duplication * - + ? {m,n} + Single-character-ERE duplication * + ? {m,n}
  • Concatenation @@ -1086,7 +1354,7 @@
  • - + What gets matched

    @@ -1123,7 +1391,7 @@

    - AtomA AtomB + AtomA AtomB

    @@ -1136,8 +1404,7 @@

    - Expression1 | - Expression2 + Expression1 | Expression2

    @@ -1150,7 +1417,7 @@

    - S{N} + S{N}

    @@ -1162,7 +1429,7 @@

    - S{N,M} + S{N,M}

    @@ -1174,7 +1441,7 @@

    - S{N,M}? + S{N,M}?

    @@ -1186,41 +1453,33 @@

    - S?, - S*, - S+ + S?, S*, S+

    - The same as S{0,1}, - S{0,UINT_MAX}, - S{1,UINT_MAX} - respectively. + The same as S{0,1}, S{0,UINT_MAX}, + S{1,UINT_MAX} respectively.

    - S??, - S*?, - S+? + S??, S*?, S+?

    - The same as S{0,1}?, - S{0,UINT_MAX}?, - S{1,UINT_MAX}? - respectively. + The same as S{0,1}?, S{0,UINT_MAX}?, + S{1,UINT_MAX}? respectively.

    - (?>S) + (?>S)

    @@ -1232,7 +1491,7 @@

    - (?=S), (?<=S) + (?=S), (?<=S)

    @@ -1245,7 +1504,7 @@

    - (?!S), (?<!S) + (?!S), (?<!S)

    @@ -1257,8 +1516,7 @@

    - (?(condition)yes-pattern - | no-pattern) + (?(condition)yes-pattern | no-pattern)

    @@ -1271,41 +1529,40 @@

    - + Variations

    The options - normal, ECMAScript, - JavaScript and JScript are all synonyms for perl. + normal, ECMAScript, JavaScript + and JScript are all synonyms for perl.

    - + Options

    There are a variety - of flags that may be combined with the perl - option when constructing the regular expression, in particular note that - the newline_alt option alters - the syntax, while the collate, - nosubs and icase options modify how the case and locale - sensitivity are to be applied. + of flags that may be combined with the perl option + when constructing the regular expression, in particular note that the newline_alt + option alters the syntax, while the collate, nosubs + and icase options modify how the case and locale sensitivity + are to be applied.

    - + Pattern Modifiers

    - The perl smix modifiers can - either be applied using a (?smix-smix) prefix to the regular expression, or with + The perl smix modifiers can either be applied using a + (?smix-smix) prefix to the regular expression, or with one of the regex-compile - time flags no_mod_m, mod_x, mod_s, - and no_mod_s. + time flags no_mod_m, mod_x, mod_s, + and no_mod_s.

    - + References

    diff --git a/doc/html/boost_regex/unicode.html b/doc/html/boost_regex/unicode.html index 85678400..94b8709f 100644 --- a/doc/html/boost_regex/unicode.html +++ b/doc/html/boost_regex/unicode.html @@ -3,7 +3,7 @@ Unicode and Boost.Regex - + @@ -30,7 +30,7 @@ There are two ways to use Boost.Regex with Unicode strings:

    - + Rely on wchar_t

    @@ -56,7 +56,7 @@

    - + Use a Unicode Aware Regular Expression Type.
    diff --git a/doc/html/index.html b/doc/html/index.html index 9b623b67..cb420c49 100644 --- a/doc/html/index.html +++ b/doc/html/index.html @@ -3,7 +3,7 @@ Boost.Regex - + @@ -28,7 +28,7 @@
    -

    +

    Distributed under the Boost Software License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

    @@ -196,7 +196,7 @@

    - +

    Last revised: December 23, 2008 at 17:35:37 GMT

    Last revised: July 29, 2009 at 15:59:46 GMT


    diff --git a/doc/match_result.qbk b/doc/match_result.qbk index eb3861fd..8e1ae788 100644 --- a/doc/match_result.qbk +++ b/doc/match_result.qbk @@ -71,9 +71,33 @@ Class template `match_results` is most commonly used as one of the typedefs bool ``[link boost_regex.match_results.empty empty]``() const; // element access: difference_type ``[link boost_regex.match_results.length length]``(int sub = 0) const; + difference_type ``[link boost_regex.match_results.length length]``(const char_type* sub) const; + template + difference_type ``[link boost_regex.match_results.length length]``(const charT* sub) const; + template + difference_type ``[link boost_regex.match_results.length length]``(const std::basic_string& sub) const; difference_type ``[link boost_regex.match_results.position position]``(unsigned int sub = 0) const; + difference_type ``[link boost_regex.match_results.position position]``(const char_type* sub) const; + template + difference_type ``[link boost_regex.match_results.position position]``(const charT* sub) const; + template + difference_type ``[link boost_regex.match_results.position position]``(const std::basic_string& sub) const; string_type ``[link boost_regex.match_results.str str]``(int sub = 0) const; + string_type ``[link boost_regex.match_results.str str]``(const char_type* sub)const; + template + string_type ``[link boost_regex.match_results.str str]``(const std::basic_string& sub)const; + template + string_type ``[link boost_regex.match_results.str str]``(const charT* sub)const; + template + string_type ``[link boost_regex.match_results.str str]``(const std::basic_string& sub)const; const_reference ``[link boost_regex.match_results.subscript operator\[\]]``(int n) const; + const_reference ``[link boost_regex.match_results.subscript operator\[\]]``(const char_type* n) const; + template + const_reference ``[link boost_regex.match_results.subscript operator\[\]]``(const std::basic_string& n) const; + template + const_reference ``[link 
boost_regex.match_results.subscript operator\[\]]``(const charT* n) const; + template + const_reference ``[link boost_regex.match_results.subscript operator\[\]]``(const std::basic_string& n) const; const_reference ``[link boost_regex.match_results.prefix prefix]``() const; @@ -190,30 +214,86 @@ stored in *this. [#boost_regex.match_results.length] difference_type length(int sub = 0)const; + difference_type length(const char_type* sub)const; + template + difference_type length(const charT* sub)const; + template + difference_type length(const std::basic_string&)const; [*Effects]: Returns the length of sub-expression /sub/, that is to say: `(*this)[sub].length()`. +The overloads that accept a string refer to a named sub-expression /n/. +In the event that there is no such named sub-expression then returns zero. + +The template overloads of this function, allow the string and\/or character type +to be different from the character type of the underlying sequence and\/or regular expression: +in this case the characters will be widened to the underlying character type of the original regular expression. +A compiler error will occur if the argument passes a wider character type than the underlying sequence. +These overloads allow a normal narrow character C string literal to be used as an argument, even when +the underlying character type of the expression being matched may be something more exotic such as a +Unicode character type. [#boost_regex.match_results.position] difference_type position(unsigned int sub = 0)const; + difference_type position(const char_type* sub)const; + template + difference_type position(const charT* sub)const; + template + difference_type position(const std::basic_string&)const; [*Effects]: Returns the starting location of sub-expression /sub/, or -1 if /sub/ was not matched. Note that if this represents a partial match , then `position()` will return the location of the partial match even though `(*this)[0].matched` is false.
+The overloads that accept a string refer to a named sub-expression /n/. +In the event that there is no such named sub-expression then returns -1. + +The template overloads of this function, allow the string and\/or character type +to be different from the character type of the underlying sequence and\/or regular expression: +in this case the characters will be widened to the underlying character type of the original regular expression. +A compiler error will occur if the argument passes a wider character type than the underlying sequence. +These overloads allow a normal narrow character C string literal to be used as an argument, even when +the underlying character type of the expression being matched may be something more exotic such as a +Unicode character type. + [#boost_regex.match_results.str] string_type str(int sub = 0)const; + string_type str(const char_type* sub)const; + template + string_type str(const std::basic_string& sub)const; + template + string_type str(const charT* sub)const; + template + string_type str(const std::basic_string& sub)const; [*Effects]: Returns sub-expression /sub/ as a string: `string_type((*this)[sub])`. +The overloads that accept a string, return the string that matched the named sub-expression /n/. +In the event that there is no such named sub-expression then returns an empty string. + +The template overloads of this function, allow the string and\/or character type +to be different from the character type of the underlying sequence and\/or regular expression: +in this case the characters will be widened to the underlying character type of the original regular expression. +A compiler error will occur if the argument passes a wider character type than the underlying sequence. +These overloads allow a normal narrow character C string literal to be used as an argument, even when +the underlying character type of the expression being matched may be something more exotic such as a +Unicode character type.
+ [#boost_regex.match_results.subscript] - const_reference operator[](int n) const; + const_reference operator[](int n) const; + const_reference operator[](const char_type* n) const; + template + const_reference operator[](const std::basic_string& n) const; + template + const_reference operator[](const charT* n) const; + template + const_reference operator[](const std::basic_string& n) const; [*Effects]: Returns a reference to the [sub_match] object representing the character sequence that matched marked sub-expression /n/. If `n == 0` then returns a @@ -222,6 +302,19 @@ matched the whole regular expression. If /n/ is out of range, or if /n/ is an unmatched sub-expression, then returns a [sub_match] object whose matched member is false. +The overloads that accept a string, return a reference to the [sub_match] +object representing the character sequence that matched the named sub-expression /n/. +In the event that there is no such named sub-expression then returns a [sub_match] object whose matched +member is false. + +The template overloads of this function, allow the string and\/or character type +to be different from the character type of the underlying sequence and\/or regular expression: +in this case the characters will be widened to the underlying character type of the original regular expression. +A compiler error will occur if the argument passes a wider character type than the underlying sequence. +These overloads allow a normal narrow character C string literal to be used as an argument, even when +the underlying character type of the expression being matched may be something more exotic such as a +Unicode character type. + [#boost_regex.match_results.prefix] diff --git a/doc/syntax_perl.qbk b/doc/syntax_perl.qbk index 9a76276d..716df83c 100644 --- a/doc/syntax_perl.qbk +++ b/doc/syntax_perl.qbk @@ -12,7 +12,7 @@ The Perl regular expression syntax is based on that used by the programming language Perl . 
Perl regular expressions are the -default behavior in Boost.Regex or you can pass the flag `perl` to the +default behavior in Boost.Regex or you can pass the flag =perl= to the [basic_regex] constructor, for example: // e1 is a case sensitive Perl regular expression: @@ -34,9 +34,9 @@ The single character '.' when used outside of a character set will match any single character except: * The NULL character when the [link boost_regex.ref.match_flag_type flag - `match_not_dot_null`] is passed to the matching algorithms. + =match_not_dot_null=] is passed to the matching algorithms. * The newline character when the [link boost_regex.ref.match_flag_type - flag `match_not_dot_newline`] is passed to + flag =match_not_dot_newline=] is passed to the matching algorithms. [h4 Anchors] @@ -47,7 +47,7 @@ A '$' character shall match the end of a line. [h4 Marked sub-expressions] -A section beginning `(` and ending `)` acts as a marked sub-expression. +A section beginning =(= and ending =)= acts as a marked sub-expression. Whatever matched the sub-expression is split out in a separate field by the matching algorithms. Marked sub-expressions can also repeated, or referred to by a back-reference. @@ -58,23 +58,23 @@ A marked sub-expression is useful to lexically group part of a regular expression, but has the side-effect of spitting out an extra field in the result. As an alternative you can lexically group part of a regular expression, without generating a marked sub-expression by using -`(?:` and `)` , for example `(?:ab)+` will repeat `ab` without splitting +=(?:= and =)= , for example =(?:ab)+= will repeat =ab= without splitting out any separate sub-expressions. [h4 Repeats] Any atom (a single character, a marked sub-expression, or a character class) -can be repeated with the `*`, `+`, `?`, and `{}` operators. +can be repeated with the =*=, =+=, =?=, and ={}= operators. 
-The `*` operator will match the preceding atom zero or more times, -for example the expression `a*b` will match any of the following: +The =*= operator will match the preceding atom zero or more times, +for example the expression =a*b= will match any of the following: b ab aaaaaaaab -The `+` operator will match the preceding atom one or more times, for -example the expression `a+b` will match any of the following: +The =+= operator will match the preceding atom one or more times, for +example the expression =a+b= will match any of the following: ab aaaaaaaab @@ -83,7 +83,7 @@ But will not match: b -The `?` operator will match the preceding atom zero or one times, for +The =?= operator will match the preceding atom zero or one times, for example the expression ca?b will match any of the following: cb @@ -95,11 +95,11 @@ But will not match: An atom can also be repeated with a bounded repeat: -`a{n}` Matches 'a' repeated exactly n times. +=a{n}= Matches 'a' repeated exactly n times. -`a{n,}` Matches 'a' repeated n or more times. +=a{n,}= Matches 'a' repeated n or more times. -`a{n, m}` Matches 'a' repeated between n and m times inclusive. +=a{n, m}= Matches 'a' repeated between n and m times inclusive. For example: @@ -120,7 +120,7 @@ be repeated, for example: a(*) -Will raise an error, as there is nothing for the `*` operator to be applied to. +Will raise an error, as there is nothing for the =*= operator to be applied to. [h4 Non greedy repeats] @@ -128,21 +128,38 @@ The normal repeat operators are "greedy", that is to say they will consume as much input as possible. There are non-greedy versions available that will consume as little input as possible while still producing a match. -`*?` Matches the previous atom zero or more times, while consuming as little +=*?= Matches the previous atom zero or more times, while consuming as little input as possible. 
-`+?` Matches the previous atom one or more times, while consuming as +=+?= Matches the previous atom one or more times, while consuming as little input as possible. -`??` Matches the previous atom zero or one times, while consuming +=??= Matches the previous atom zero or one times, while consuming as little input as possible. -`{n,}?` Matches the previous atom n or more times, while consuming as +={n,}?= Matches the previous atom n or more times, while consuming as little input as possible. -`{n,m}?` Matches the previous atom between n and m times, while +={n,m}?= Matches the previous atom between n and m times, while consuming as little input as possible. +[h4 Possessive repeats] + +By default when a repeated pattern does not match then the engine will backtrack until +a match is found. However, this behaviour can sometimes be undesirable so there are +also "possessive" repeats: these match as much as possible and do not then allow +backtracking if the rest of the expression fails to match. + +=*+= Matches the previous atom zero or more times, while giving nothing back. + +=++= Matches the previous atom one or more times, while giving nothing back. + +=?+= Matches the previous atom zero or one times, while giving nothing back. + +={n,}+= Matches the previous atom n or more times, while giving nothing back. + +={n,m}+= Matches the previous atom between n and m times, while giving nothing back.
+ [h4 Back references] An escape character followed by a digit /n/, where /n/ is in the range 1-9, @@ -158,27 +175,42 @@ Will match the string: But not the string: aaabba + +You can also use the \g escape for the same function, for example: + +[table +[[Escape][Meaning]] +[[=\g1=][Match whatever matched sub-expression 1]] +[[=\g{1}=][Match whatever matched sub-expression 1: this form allows for safer + parsing of the expression in cases like =\g{1}2= or for indexes higher than 9 as in =\g{1234}=]] +[[=\g-1=][Match whatever matched the last opened sub-expression]] +[[=\g{-2}=][Match whatever matched the last but one opened sub-expression]] +[[=\g{one}=][Match whatever matched the sub-expression named "one"]] +] + +Finally the \k escape can be used to refer to named subexpressions, for example [^\k<two>] will match +whatever matched the subexpression named "two". [h4 Alternation] -The `|` operator will match either of its arguments, so for example: -`abc|def` will match either "abc" or "def". +The =|= operator will match either of its arguments, so for example: +=abc|def= will match either "abc" or "def". -Parenthesis can be used to group alternations, for example: `ab(d|ef)` +Parenthesis can be used to group alternations, for example: =ab(d|ef)= will match either of "abd" or "abef". Empty alternatives are not allowed (these are almost always a mistake), but -if you really want an empty alternative use `(?:)` as a placeholder, for example: +if you really want an empty alternative use =(?:)= as a placeholder, for example: -`|abc` is not a valid expression, but +=|abc= is not a valid expression, but -`(?:)|abc` is and is equivalent, also the expression: +=(?:)|abc= is and is equivalent, also the expression: -`(?:abc)??` has exactly the same effect.
[h4 Character sets] -A character set is a bracket-expression starting with `[` and ending with `]`, +A character set is a bracket-expression starting with =[= and ending with =]=, it defines a set of characters, and matches any single character that is a member of that set. @@ -186,35 +218,35 @@ A bracket expression may contain any combination of the following: [h5 Single characters] -For example `[abc]`, will match any of the characters 'a', 'b', or 'c'. +For example =[abc]=, will match any of the characters 'a', 'b', or 'c'. [h5 Character ranges] -For example `[a-c]` will match any single character in the range 'a' to 'c'. +For example =[a-c]= will match any single character in the range 'a' to 'c'. By default, for Perl regular expressions, a character x is within the range y to z, if the code point of the character lies within the codepoints of the endpoints of the range. Alternatively, if you set the -[link boost_regex.ref.syntax_option_type.syntax_option_type_perl `collate` flag] +[link boost_regex.ref.syntax_option_type.syntax_option_type_perl =collate= flag] when constructing the regular expression, then ranges are locale sensitive. [h5 Negation] If the bracket-expression begins with the ^ character, then it matches the -complement of the characters it contains, for example `[^a-c]` matches -any character that is not in the range `a-c`. +complement of the characters it contains, for example =[^a-c]= matches +any character that is not in the range =a-c=. [h5 Character classes] -An expression of the form `[[:name:]]` matches the named character class -"name", for example `[[:lower:]]` matches any lower case character. +An expression of the form [^\[\[:name:\]\]] matches the named character class +"name", for example [^\[\[:lower:\]\]] matches any lower case character. See [link boost_regex.syntax.character_classes character class names]. [h5 Collating Elements] -An expression of the form `[[.col.]` matches the collating element /col/. 
+An expression of the form [^\[\[.col.\]\]] matches the collating element /col/. A collating element is any single character, or any sequence of characters that collates as a single unit. Collating elements may also be used -as the end point of a range, for example: `[[.ae.]-c]` matches the +as the end point of a range, for example: [^\[\[.ae.\]-c\]] matches the character sequence "ae", plus any single character in the range "ae"-c, assuming that "ae" is treated as a single collating element in the current locale. @@ -223,11 +255,11 @@ As an extension, a collating element may also be specified via it's [[.NUL.]] -matches a `\0` character. +matches a =\0= character. [h5 Equivalence classes] -An expression of the form `[[=col=]]`, matches any character or collating element +An expression of the form [^\[\[\=col\=\]\]], matches any character or collating element whose primary sort key is the same as that for collating element /col/, as with collating elements the name /col/ may be a [link boost_regex.syntax.collating_names symbolic name]. A primary sort key is @@ -250,7 +282,7 @@ that is either a "digit", /or/ is /not/ a "word" character. [h5 Combinations] All of the above can be combined in one character set declaration, for example: -`[[:digit:]a-c[.NUL.]]`. +[^\[\[:digit:\]a-c\[.NUL.\]\]]. 
[h4 Escapes] @@ -260,24 +292,24 @@ The following escape sequences are all synonyms for single characters: [table [[Escape][Character]] -[[`\a`][`\a`]] -[[`\e`][`0x1B`]] -[[`\f`][`\f`]] -[[`\n`][`\n`]] -[[`\r`][`\r`]] -[[`\t`][`\t`]] -[[`\v `][`\v`]] -[[`\b`][`\b` (but only inside a character class declaration).]] -[[`\cX`][An ASCII escape sequence - the character whose code point is X % 32]] -[[`\xdd`][A hexadecimal escape sequence - matches the single character whose +[[=\a=][=\a=]] +[[=\e=][=0x1B=]] +[[=\f=][=\f=]] +[[=\n=][=\n=]] +[[=\r=][=\r=]] +[[=\t=][=\t=]] +[[=\v=][=\v=]] +[[=\b=][=\b= (but only inside a character class declaration).]] +[[=\cX=][An ASCII escape sequence - the character whose code point is X % 32]] +[[=\xdd=][A hexadecimal escape sequence - matches the single character whose code point is 0xdd.]] -[[`\x{dddd}`][A hexadecimal escape sequence - matches the single character whose +[[=\x{dddd}=][A hexadecimal escape sequence - matches the single character whose code point is 0xdddd.]] -[[`\0ddd`][An octal escape sequence - matches the single character whose +[[=\0ddd=][An octal escape sequence - matches the single character whose code point is 0ddd.]] -[[`\N{name}`][Matches the single character which has the +[[=\N{name}=][Matches the single character which has the [link boost_regex.syntax.collating_names symbolic name] /name/. 
- For example `\N{newline}` matches the single character \\n.]] + For example =\N{newline}= matches the single character \\n.]] ] [h5 "Single character" character classes:] @@ -296,11 +328,15 @@ The following are supported by default: [[`\s`][`[[:space:]]`]] [[`\u`][`[[:upper:]]`]] [[`\w`][`[[:word:]]`]] +[[`\h`][Horizontal whitespace]] +[[`\v`][Vertical whitespace]] [[`\D`][`[^[:digit:]]`]] [[`\L`][`[^[:lower:]]`]] [[`\S`][`[^[:space:]]`]] [[`\U`][`[^[:upper:]]`]] [[`\W`][`[^[:word:]]`]] +[[`\H`][Not Horizontal whitespace]] +[[`\V`][Not Vertical whitespace]] ] [h5 Character Properties] @@ -316,19 +352,19 @@ to the [link boost_regex.syntax.character_classes names used in character classe [[`\P{Name}`][Matches any character that does not have the property Name.][`[^[:Name:]]`]] ] -For example `\pd` matches any "digit" character, as does `\p{digit}`. +For example =\pd= matches any "digit" character, as does =\p{digit}=. [h5 Word Boundaries] The following escape sequences match the boundaries of words: -`\<` Matches the start of a word. +=\<= Matches the start of a word. -`\>` Matches the end of a word. +=\>= Matches the end of a word. -`\b` Matches a word boundary (the start or end of a word). +=\b= Matches a word boundary (the start or end of a word). -`\B` Matches only when not at a word boundary. +=\B= Matches only when not at a word boundary. [h5 Buffer boundaries] @@ -345,30 +381,44 @@ context is the whole of the input text that is being matched against \\z Matches at the end of a buffer only (the same as \\'). \\Z Matches an optional sequence of newlines at the end of a buffer: -equivalent to the regular expression `\n*\z` +equivalent to the regular expression =\n*\z= [h5 Continuation Escape] -The sequence `\G` matches only at the end of the last match found, or at +The sequence =\G= matches only at the end of the last match found, or at the start of the text being matched if no previous match was found. 
This escape useful if you're iterating over the matches contained within a text, and you want each subsequence match to start where the last one ended. [h5 Quoting escape] -The escape sequence `\Q` begins a "quoted sequence": all the subsequent characters +The escape sequence =\Q= begins a "quoted sequence": all the subsequent characters are treated as literals, until either the end of the regular expression or \\E -is found. For example the expression: `\Q\*+\Ea+` would match either of: +is found. For example the expression: =\Q\*+\Ea+= would match either of: \*+a \*+aaa [h5 Unicode escapes] -`\C` Matches a single code point: in Boost regex this has exactly the +=\C= Matches a single code point: in Boost regex this has exactly the same effect as a "." operator. -`\X` Matches a combining character sequence: that is any non-combining +=\X= Matches a combining character sequence: that is any non-combining character followed by a sequence of zero or more combining characters. + +[h5 Matching Line Endings] + +The escape sequence =\R= matches any line ending character sequence, specifically it is identical to +the expression [^(?>\x0D\x0A?|\[\x0A-\x0C\x85\x{2028}\x{2029}\])]. + +[h5 Keeping back some text] + +=\K= Resets the start location of $0 to the current text position: in other words everything to the +left of \K is "kept back" and does not form part of the regular expression match. $` is updated +accordingly. + +For example =foo\Kbar= matched against the text "foobar" would return the match "bar" for $0 and "foo" +for $`. This can be used to simulate variable width lookbehind assertions. [h5 Any other escape] @@ -377,31 +427,62 @@ Any other escape sequence matches the character that is escaped, for example [h4 Perl Extended Patterns] -Perl-specific extensions to the regular expression syntax all start with `(?`. +Perl-specific extensions to the regular expression syntax all start with =(?=. 
+[h5 Named Subexpressions] + +You can create a named subexpression using: + + (?<NAME>expression) + +Which can then be referred to by the name /NAME/. Alternatively you can delimit the name +using 'NAME' as in: + + (?'NAME'expression) + +These named subexpressions can be referred to in a backreference using either [^\g{NAME}] or [^\k<NAME>] +and can also be referred to by name in a [perl_format] format string for search and replace operations, or in the +[match_results] member functions. + [h5 Comments] -`(?# ... )` is treated as a comment, it's contents are ignored. +=(?# ... )= is treated as a comment, its contents are ignored. [h5 Modifiers] -`(?imsx-imsx ... )` alters which of the perl modifiers are in effect within +=(?imsx-imsx ... )= alters which of the perl modifiers are in effect within the pattern, changes take effect from the point that the block is first seen -and extend to any enclosing `)`. Letters before a '-' turn that perl +and extend to any enclosing =)=. Letters before a '-' turn that perl modifier on, letters afterward, turn it off. -`(?imsx-imsx:pattern)` applies the specified modifiers to pattern only. +=(?imsx-imsx:pattern)= applies the specified modifiers to pattern only. [h5 Non-marking groups] -`(?:pattern)` lexically groups pattern, without generating an additional +=(?:pattern)= lexically groups pattern, without generating an additional sub-expression. +[h5 Branch reset] + +=(?|pattern)= resets the subexpression count at the start of each "|" alternative within /pattern/. + +The sub-expression count following this construct is that of whichever branch had the largest number of +sub-expressions. This construct is useful when you want to capture one of a number of alternative matches +in a single sub-expression index.
+ +In the following example the index of each sub-expression is shown below the expression: + +[pre +# before ---------------branch-reset----------- after +/ ( a ) (?| x ( y ) z | (p (q) r) | (t) u (v) ) ( z ) /x +# 1 2 2 3 2 3 4 +] + [h5 Lookahead] -`(?=pattern)` consumes zero characters, only if pattern matches. +[^(?=pattern)] consumes zero characters, only if pattern matches. -`(?!pattern)` consumes zero characters, only if pattern does not match. +=(?!pattern)= consumes zero characters, only if pattern does not match. Lookahead is typically used to create the logical AND of two regular expressions, for example if a password must contain a lower case letter, @@ -414,43 +495,68 @@ could be used to validate the password. [h5 Lookbehind] -`(?<=pattern)` consumes zero characters, only if pattern could be matched +[^(?<=pattern)] consumes zero characters, only if pattern could be matched against the characters preceding the current position (pattern must be of fixed length). -`(?pattern)` /pattern/ is matched independently of the surrounding patterns, +=(?>pattern)= /pattern/ is matched independently of the surrounding patterns, the expression will never backtrack into /pattern/. Independent sub-expressions are typically used to improve performance; only the best possible match for pattern will be considered, if this doesn't allow the expression as a whole to match then no match is found at all. +[h5 Recursive Expressions] + +[^(?['N]) (?-['N]) (?+['N]) (?R) (?0)] + +=(?R)= and =(?0)= recurse to the start of the entire pattern. + +[^(?['N])] executes sub-expression /N/ recursively, for example =(?2)= will recurse to sub-expression 2. + +[^(?-['N])] and [^(?+['N])] are relative recursions, so for example =(?-1)= recurses to the last sub-expression to be declared, +and =(?+1)= recurses to the next sub-expression to be declared. 
+ [h5 Conditional Expressions] -`(?(condition)yes-pattern|no-pattern)` attempts to match /yes-pattern/ if +=(?(condition)yes-pattern|no-pattern)= attempts to match /yes-pattern/ if the /condition/ is true, otherwise attempts to match /no-pattern/. -`(?(condition)yes-pattern)` attempts to match /yes-pattern/ if the /condition/ +=(?(condition)yes-pattern)= attempts to match /yes-pattern/ if the /condition/ is true, otherwise fails. -/condition/ may be either a forward lookahead assert, or the index of +/condition/ may be either: a forward lookahead assert, the index of a marked sub-expression (the condition becomes true if the sub-expression -has been matched). +has been matched), or an index of a recursion (the condition becomes true if we are executing +directly inside the specified recursion). + +Here is a summary of the possible predicates: + +* [^(?(?\=assert)yes-pattern|no-pattern)] Executes /yes-pattern/ if the forward look-ahead assert matches, otherwise +executes /no-pattern/. +* =(?(?!assert)yes-pattern|no-pattern)= Executes /yes-pattern/ if the forward look-ahead assert does not match, otherwise +executes /no-pattern/. +* =(?(R)yes-pattern|no-pattern)= Executes /yes-pattern/ if we are executing inside a recursion, otherwise +executes /no-pattern/. +* [^(?(R['N])yes-pattern|no-pattern)] Executes /yes-pattern/ if we are executing inside a recursion to sub-expression /N/, otherwise +executes /no-pattern/. +* [^(?(DEFINE)never-executed-pattern)] Defines a block of code that is never executed and matches no characters: +this is usually used to define one or more named sub-expressions which are referred to from elsewhere in the pattern. [h4 Operator precedence] The order of precedence for of operators is as follows: # Collation-related bracket symbols `[==] [::] [..]` -# Escaped characters `\` +# Escaped characters =\= # Character set (bracket expression) `[]` -# Grouping `()` -# Single-character-ERE duplication `* + ?
{m,n}` +# Grouping =()= +# Single-character-ERE duplication =* + ? {m,n}= # Concatenation # Anchoring ^$ # Alternation | @@ -469,42 +575,42 @@ with individual elements matched as follows; [table [[Construct][What gets matched]] -[[`AtomA AtomB`][Locates the best match for /AtomA/ that has a following match for /AtomB/.]] -[[`Expression1 | Expression2`][If /Expresion1/ can be matched then returns that match, +[[=AtomA AtomB=][Locates the best match for /AtomA/ that has a following match for /AtomB/.]] +[[=Expression1 | Expression2=][If /Expresion1/ can be matched then returns that match, otherwise attempts to match /Expression2/.]] -[[`S{N}`][Matches /S/ repeated exactly N times.]] -[[`S{N,M}`][Matches S repeated between N and M times, and as many times as possible.]] -[[`S{N,M}?`][Matches S repeated between N and M times, and as few times as possible.]] -[[`S?, S*, S+`][The same as `S{0,1}`, `S{0,UINT_MAX}`, `S{1,UINT_MAX}` respectively.]] -[[`S??, S*?, S+?`][The same as `S{0,1}?`, `S{0,UINT_MAX}?`, `S{1,UINT_MAX}?` respectively.]] -[[`(?>S)`][Matches the best match for /S/, and only that.]] -[[`(?=S), (?<=S)`][Matches only the best match for /S/ (this is only +[[=S{N}=][Matches /S/ repeated exactly N times.]] +[[=S{N,M}=][Matches S repeated between N and M times, and as many times as possible.]] +[[=S{N,M}?=][Matches S repeated between N and M times, and as few times as possible.]] +[[=S?, S*, S+=][The same as =S{0,1}=, =S{0,UINT_MAX}=, =S{1,UINT_MAX}= respectively.]] +[[=S??, S*?, S+?=][The same as =S{0,1}?=, =S{0,UINT_MAX}?=, =S{1,UINT_MAX}?= respectively.]] +[[=(?>S)=][Matches the best match for /S/, and only that.]] +[[[^(?=S), (?<=S)]][Matches only the best match for /S/ (this is only visible if there are capturing parenthesis within /S/).]] -[[`(?!S), (? 
extern template class __declspec(dllimport) std::basic_string; # endif diff --git a/include/boost/regex/icu.hpp b/include/boost/regex/icu.hpp index 7af1d678..24715572 100644 --- a/include/boost/regex/icu.hpp +++ b/include/boost/regex/icu.hpp @@ -184,7 +184,9 @@ private: offset_underscore = U_CHAR_CATEGORY_COUNT+3, offset_unicode = U_CHAR_CATEGORY_COUNT+4, offset_any = U_CHAR_CATEGORY_COUNT+5, - offset_ascii = U_CHAR_CATEGORY_COUNT+6 + offset_ascii = U_CHAR_CATEGORY_COUNT+6, + offset_horizontal = U_CHAR_CATEGORY_COUNT+7, + offset_vertical = U_CHAR_CATEGORY_COUNT+8 }; // @@ -197,6 +199,8 @@ private: static const char_class_type mask_unicode; static const char_class_type mask_any; static const char_class_type mask_ascii; + static const char_class_type mask_horizontal; + static const char_class_type mask_vertical; static char_class_type lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2); @@ -311,12 +315,12 @@ inline u32regex do_make_u32regex(InputIterator i, boost::regex_constants::syntax_option_type opt, const boost::mpl::int_<4>*) { - typedef std::vector vector_type; + typedef std::vector vector_type; vector_type v; while(i != j) { - v.push_back((UCHAR32)(*i)); - ++a; + v.push_back((UChar32)(*i)); + ++i; } if(v.size()) return u32regex(&*v.begin(), v.size(), opt); diff --git a/include/boost/regex/v4/basic_regex.hpp b/include/boost/regex/v4/basic_regex.hpp index cb9ff3c5..09b0467b 100644 --- a/include/boost/regex/v4/basic_regex.hpp +++ b/include/boost/regex/v4/basic_regex.hpp @@ -19,6 +19,9 @@ #ifndef BOOST_REGEX_V4_BASIC_REGEX_HPP #define BOOST_REGEX_V4_BASIC_REGEX_HPP +#include +#include + #ifdef BOOST_MSVC #pragma warning(push) #pragma warning(disable: 4103) @@ -44,12 +47,160 @@ namespace re_detail{ template class basic_regex_parser; +template +void bubble_down_one(I first, I last) +{ + if(first != last) + { + I next = last - 1; + while((next != first) && !(*(next-1) < *next)) + { + (next-1)->swap(*next); + --next; + } + } +} + +// +// Class 
named_subexpressions +// Contains information about named subexpressions within the regex. +// +template +class named_subexpressions_base +{ +public: + virtual int get_id(const charT* i, const charT* j)const = 0; + virtual int get_id(std::size_t hash)const = 0; +#ifdef __GNUC__ + // warning supression: + virtual ~named_subexpressions_base(){} +#endif +}; + +template +inline std::size_t hash_value_from_capture_name(Iterator i, Iterator j) +{ + std::size_t r = boost::hash_range(i, j); + r %= ((std::numeric_limits::max)() - 10001); + r += 10000; + return r; +} + +template +class named_subexpressions : public named_subexpressions_base +{ + struct name + { + name(const charT* i, const charT* j, int idx) + : /*n(i, j), */ index(idx) + { + hash = hash_value_from_capture_name(i, j); + } + name(std::size_t h, int idx) + : index(idx), hash(h) + { + } + //std::vector n; + int index; + std::size_t hash; + bool operator < (const name& other)const + { + return hash < other.hash; //std::lexicographical_compare(n.begin(), n.end(), other.n.begin(), other.n.end()); + } + bool operator == (const name& other)const + { + return hash == other.hash; //n == other.n; + } + void swap(name& other) + { + //n.swap(other.n); + std::swap(index, other.index); + std::swap(hash, other.hash); + } + }; +public: + named_subexpressions(){} + void set_name(const charT* i, const charT* j, int index) + { + m_sub_names.push_back(name(i, j, index)); + bubble_down_one(m_sub_names.begin(), m_sub_names.end()); + } + int get_id(const charT* i, const charT* j)const + { + name t(i, j, 0); + typename std::vector::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t); + if((pos != m_sub_names.end()) && (*pos == t)) + { + return pos->index; + } + return -1; + } + int get_id(std::size_t h)const + { + name t(h, 0); + typename std::vector::const_iterator pos = std::lower_bound(m_sub_names.begin(), m_sub_names.end(), t); + if((pos != m_sub_names.end()) && (*pos == t)) + { + return pos->index; 
+ } + return -1; + } +private: + std::vector m_sub_names; +}; + +template +class named_subexpressions_converter : public named_subexpressions_base +{ + boost::shared_ptr > m_converter; +public: + named_subexpressions_converter(boost::shared_ptr > s) + : m_converter(s) {} + int get_id(const charT* i, const charT* j)const + { + if(i == j) + return -1; + std::vector v; + while(i != j) + { + v.push_back(*i); + ++i; + } + return m_converter->get_id(&v[0], &v[0] + v.size()); + } + int get_id(std::size_t h)const + { + return m_converter->get_id(h); + } +}; + +template +inline boost::shared_ptr > convert_to_named_subs_imp( + boost::shared_ptr > s, + boost::integral_constant const&) +{ + return s; +} +template +inline boost::shared_ptr > convert_to_named_subs_imp( + boost::shared_ptr > s, + boost::integral_constant const&) +{ + return boost::shared_ptr >(new named_subexpressions_converter(s)); +} +template +inline boost::shared_ptr > convert_to_named_subs( + boost::shared_ptr > s) +{ + typedef typename boost::is_same::type tag_type; + return convert_to_named_subs_imp(s, tag_type()); +} // // class regex_data: // represents the data we wish to expose to the matching algorithms. // template -struct regex_data +struct regex_data : public named_subexpressions { typedef regex_constants::syntax_option_type flag_type; typedef std::size_t size_type; @@ -77,6 +228,7 @@ struct regex_data std::vector< std::pair< std::size_t, std::size_t> > m_subs; // Position of sub-expressions within the *string*. 
+ bool m_has_recursions; // whether we have recursive expressions; }; // // class basic_regex_implementation @@ -520,6 +672,10 @@ public: BOOST_ASSERT(0 != m_pimpl.get()); return m_pimpl->get_data(); } + boost::shared_ptr > get_named_subs()const + { + return m_pimpl; + } private: shared_ptr > m_pimpl; diff --git a/include/boost/regex/v4/basic_regex_creator.hpp b/include/boost/regex/v4/basic_regex_creator.hpp index 9f2cbeec..6f005054 100644 --- a/include/boost/regex/v4/basic_regex_creator.hpp +++ b/include/boost/regex/v4/basic_regex_creator.hpp @@ -240,6 +240,7 @@ protected: bool m_has_backrefs; // true if there are actually any backrefs unsigned m_backrefs; // bitmask of permitted backrefs boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for; + bool m_has_recursions; // set when we have recursive expresisons to fixup typename traits::char_class_type m_word_mask; // mask used to determine if a character is a word character typename traits::char_class_type m_mask_space; // mask used to determine if a character is a word character typename traits::char_class_type m_lower_mask; // mask used to determine if a character is a lowercase character @@ -250,6 +251,7 @@ private: basic_regex_creator(const basic_regex_creator&); void fixup_pointers(re_syntax_base* state); + void fixup_recursions(re_syntax_base* state); void create_startmaps(re_syntax_base* state); int calculate_backstep(re_syntax_base* state); void create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask); @@ -263,7 +265,7 @@ private: template basic_regex_creator::basic_regex_creator(regex_data* data) - : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0) + : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0), m_has_recursions(false) { m_pdata->m_data.clear(); m_pdata->m_status = 
::boost::regex_constants::error_ok; @@ -692,6 +694,13 @@ void basic_regex_creator::finalize(const charT* p1, const charT* m_pdata->m_first_state = static_cast(m_pdata->m_data.data()); // fixup pointers in the machine: fixup_pointers(m_pdata->m_first_state); + if(m_has_recursions) + { + m_pdata->m_has_recursions = true; + fixup_recursions(m_pdata->m_first_state); + } + else + m_pdata->m_has_recursions = false; // create nested startmaps: create_startmaps(m_pdata->m_first_state); // create main startmap: @@ -713,6 +722,13 @@ void basic_regex_creator::fixup_pointers(re_syntax_base* state) { switch(state->type) { + case syntax_element_recurse: + m_has_recursions = true; + if(state->next.i) + state->next.p = getaddress(state->next.i, state); + else + state->next.p = 0; + break; case syntax_element_rep: case syntax_element_dot_rep: case syntax_element_char_rep: @@ -738,6 +754,93 @@ void basic_regex_creator::fixup_pointers(re_syntax_base* state) } } +template +void basic_regex_creator::fixup_recursions(re_syntax_base* state) +{ + re_syntax_base* base = state; + while(state) + { + switch(state->type) + { + case syntax_element_assert_backref: + { + // just check that the index is valid: + int id = static_cast(state)->index; + if(id < 0) + { + id = -id-1; + if(id >= 10000) + { + id = m_pdata->get_id(id); + if(id <= 0) + { + // check of sub-expression that doesn't exist: + if(0 == this->m_pdata->m_status) // update the error code if not already set + this->m_pdata->m_status = boost::regex_constants::error_bad_pattern; + // + // clear the expression, we should be empty: + // + this->m_pdata->m_expression = 0; + this->m_pdata->m_expression_len = 0; + // + // and throw if required: + // + if(0 == (this->flags() & regex_constants::no_except)) + { + std::string message = this->m_pdata->m_ptraits->error_string(boost::regex_constants::error_bad_pattern); + boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0); + e.raise(); + } + } + } + } + } + break; + case 
syntax_element_recurse: + { + bool ok = false; + re_syntax_base* p = base; + int id = static_cast(state)->alt.i; + if(id > 10000) + id = m_pdata->get_id(id); + while(p) + { + if((p->type == syntax_element_startmark) && (static_cast(p)->index == id)) + { + static_cast(state)->alt.p = p; + ok = true; + break; + } + p = p->next.p; + } + if(!ok) + { + // recursion to sub-expression that doesn't exist: + if(0 == this->m_pdata->m_status) // update the error code if not already set + this->m_pdata->m_status = boost::regex_constants::error_bad_pattern; + // + // clear the expression, we should be empty: + // + this->m_pdata->m_expression = 0; + this->m_pdata->m_expression_len = 0; + // + // and throw if required: + // + if(0 == (this->flags() & regex_constants::no_except)) + { + std::string message = this->m_pdata->m_ptraits->error_string(boost::regex_constants::error_bad_pattern); + boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0); + e.raise(); + } + } + } + default: + break; + } + state = state->next.p; + } +} + template void basic_regex_creator::create_startmaps(re_syntax_base* state) { @@ -953,6 +1056,7 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, create_startmap(state->next.p, 0, pnull, mask); return; } + case syntax_element_recurse: case syntax_element_backref: // can be null, and any character can match: if(pnull) diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index b8bc9963..7d2f5543 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -78,6 +78,8 @@ private: const charT* m_end; // the end of the string being parsed const charT* m_position; // our current parser position unsigned m_mark_count; // how many sub-expressions we have + int m_mark_reset; // used to indicate that we're inside a (?|...) block. + unsigned m_max_mark; // largest mark count seen inside a (?|...) block. 
std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted). std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative bool m_has_case_change; // true if somewhere in the current block the case has changed @@ -96,7 +98,7 @@ private: template basic_regex_parser::basic_regex_parser(regex_data* data) - : basic_regex_creator(data), m_mark_count(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false) + : basic_regex_creator(data), m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false) { } @@ -123,8 +125,16 @@ void basic_regex_parser::parse(const charT* p1, const charT* p2, switch(l_flags & regbase::main_option_type) { case regbase::perl_syntax_group: - m_parser_proc = &basic_regex_parser::parse_extended; - break; + { + m_parser_proc = &basic_regex_parser::parse_extended; + // + // Add a leading paren with index zero to give recursions a target: + // + re_brace* br = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); + br->index = 0; + br->icase = this->flags() & regbase::icase; + break; + } case regbase::basic_syntax_group: m_parser_proc = &basic_regex_parser::parse_basic; break; @@ -375,11 +385,17 @@ bool basic_regex_parser::parse_open_paren() if(0 == (this->flags() & regbase::nosubs)) { markid = ++m_mark_count; +#ifndef BOOST_NO_STD_DISTANCE if(this->flags() & regbase::save_subexpression_location) this->m_pdata->m_subs.push_back(std::pair(std::distance(m_base, m_position) - 1, 0)); +#else + if(this->flags() & regbase::save_subexpression_location) + this->m_pdata->m_subs.push_back(std::pair((m_position - m_base) - 1, 0)); +#endif } re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); pb->index = markid; + pb->icase = this->flags() & regbase::icase; std::ptrdiff_t last_paren_start = this->getoffset(pb); // back up insertion point for alternations, and set new point: std::ptrdiff_t 
last_alt_point = m_alt_insert_point; @@ -392,6 +408,11 @@ bool basic_regex_parser::parse_open_paren() bool old_case_change = m_has_case_change; m_has_case_change = false; // no changes to this scope as yet... // + // Back up branch reset data in case we have a nested (?|...) + // + int mark_reset = m_mark_reset; + m_mark_reset = -1; + // // now recursively add more states, this will terminate when we get to a // matching ')' : // @@ -416,6 +437,10 @@ bool basic_regex_parser::parse_open_paren() this->flags(opts); m_has_case_change = old_case_change; // + // restore branch reset: + // + m_mark_reset = mark_reset; + // // we either have a ')' or we have run out of characters prematurely: // if(m_position == m_end) @@ -424,14 +449,20 @@ bool basic_regex_parser::parse_open_paren() return false; } BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark); +#ifndef BOOST_NO_STD_DISTANCE if(markid && (this->flags() & regbase::save_subexpression_location)) this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position); +#else + if(markid && (this->flags() & regbase::save_subexpression_location)) + this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base); +#endif ++m_position; // // append closing parenthesis state: // pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); pb->index = markid; + pb->icase = this->flags() & regbase::icase; this->m_paren_start = last_paren_start; // // restore the alternate insertion point: @@ -600,6 +631,7 @@ bool basic_regex_parser::parse_extended_escape() // fall through: case regex_constants::escape_type_class: { +escape_type_class_jump: typedef typename traits::char_class_type mask_type; mask_type m = this->m_traits.lookup_classname(m_position, m_position+1); if(m != 0) @@ -709,7 +741,104 @@ bool basic_regex_parser::parse_extended_escape() return true; } fail(regex_constants::error_ctype, m_position - m_base); + return false; } + case 
regex_constants::escape_type_reset_start_mark: + if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) + { + re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); + pb->index = -5; + pb->icase = this->flags() & regbase::icase; + this->m_pdata->m_data.align(); + ++m_position; + return true; + } + goto escape_type_class_jump; + case regex_constants::escape_type_line_ending: + if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) + { + const charT* e = get_escape_R_string(); + const charT* old_position = m_position; + const charT* old_end = m_end; + const charT* old_base = m_base; + m_position = e; + m_base = e; + m_end = e + traits::length(e); + bool r = parse_all(); + m_position = ++old_position; + m_end = old_end; + m_base = old_base; + return r; + } + goto escape_type_class_jump; + case regex_constants::escape_type_extended_backref: + if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) + { + bool have_brace = false; + bool negative = false; + if(++m_position == m_end) + { + fail(regex_constants::error_escape, m_position - m_base); + return false; + } + // maybe have \g{ddd} + if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace) + { + if(++m_position == m_end) + { + fail(regex_constants::error_escape, m_position - m_base); + return false; + } + have_brace = true; + } + negative = (*m_position == static_cast('-')); + if((negative) && (++m_position == m_end)) + { + fail(regex_constants::error_escape, m_position - m_base); + return false; + } + const charT* pc = m_position; + int i = this->m_traits.toi(pc, m_end, 10); + if(i < 0) + { + // Check for a named capture: + const charT* base = m_position; + while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) + ++m_position; + i = this->m_pdata->get_id(base, m_position); + pc = m_position; + } + if(negative) + i = 1 + m_mark_count 
- i; + if((i > 0) && (this->m_backrefs & (1u << (i-1)))) + { + m_position = pc; + re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); + pb->index = i; + pb->icase = this->flags() & regbase::icase; + } + else + { + fail(regex_constants::error_backref, m_position - m_end); + return false; + } + m_position = pc; + if(have_brace) + { + if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) + { + fail(regex_constants::error_escape, m_position - m_base); + return false; + } + ++m_position; + } + return true; + } + goto escape_type_class_jump; + case regex_constants::escape_type_control_v: + if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex))) + goto escape_type_class_jump; + // fallthrough: default: this->append_literal(unescape_character()); break; @@ -737,6 +866,7 @@ template bool basic_regex_parser::parse_repeat(std::size_t low, std::size_t high) { bool greedy = true; + bool pocessive = false; std::size_t insert_point; // // when we get to here we may have a non-greedy ? mark still to come: @@ -748,12 +878,19 @@ bool basic_regex_parser::parse_repeat(std::size_t low, std::size_ ) ) { - // OK we have a perl regex, check for a '?': + // OK we have a perl or emacs regex, check for a '?': if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question) { greedy = false; ++m_position; } + // for perl regexes only check for pocessive ++ repeats. 
+ if((0 == (this->flags() & regbase::main_option_type)) + && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus)) + { + pocessive = true; + ++m_position; + } } if(0 == this->m_last_state) { @@ -822,6 +959,22 @@ bool basic_regex_parser::parse_repeat(std::size_t low, std::size_ // now fill in the alt jump for the repeat: rep = static_cast(this->getaddress(rep_off)); rep->alt.i = this->m_pdata->m_data.size() - rep_off; + // + // If the repeat is pocessive then bracket the repeat with a (?>...) + // independent sub-expression construct: + // + if(pocessive) + { + re_brace* pb = static_cast(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace))); + pb->index = -3; + pb->icase = this->flags() & regbase::icase; + re_jump* jmp = static_cast(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump))); + this->m_pdata->m_data.align(); + jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp); + pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); + pb->index = -3; + pb->icase = this->flags() & regbase::icase; + } return true; } @@ -944,6 +1097,14 @@ bool basic_regex_parser::parse_alt() fail(regex_constants::error_empty, this->m_position - this->m_base); return false; } + // + // Reset mark count if required: + // + if(m_max_mark < m_mark_count) + m_max_mark = m_mark_count; + if(m_mark_reset >= 0) + m_mark_count = m_mark_reset; + ++m_position; // // we need to append a trailing jump: @@ -1462,7 +1623,7 @@ charT basic_regex_parser::unescape_character() int i = this->m_traits.toi(m_position, m_end, 16); if((m_position == m_end) || (i < 0) - || ((std::numeric_limits::is_specialized) && (charT(i) > (std::numeric_limits::max)())) + || ((std::numeric_limits::is_specialized) && (i > (int)(std::numeric_limits::max)())) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) { fail(regex_constants::error_badbrace, m_position - m_base); @@ -1568,6 
+1729,7 @@ bool basic_regex_parser::parse_backref() m_position = pc; re_brace* pb = static_cast(this->append_state(syntax_element_backref, sizeof(re_brace))); pb->index = i; + pb->icase = this->flags() & regbase::icase; } else { @@ -1655,6 +1817,7 @@ bool basic_regex_parser::parse_perl_extension() int markid = 0; std::ptrdiff_t jump_offset = 0; re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); + pb->icase = this->flags() & regbase::icase; std::ptrdiff_t last_paren_start = this->getoffset(pb); // back up insertion point for alternations, and set new point: std::ptrdiff_t last_alt_point = m_alt_insert_point; @@ -1665,11 +1828,18 @@ bool basic_regex_parser::parse_perl_extension() regex_constants::syntax_option_type old_flags = this->flags(); bool old_case_change = m_has_case_change; m_has_case_change = false; + charT name_delim; + int mark_reset = m_mark_reset; + m_mark_reset = -1; + int v; // // select the actual extension used: // switch(this->m_traits.syntax_type(*m_position)) { + case regex_constants::syntax_or: + m_mark_reset = m_mark_count; + // fall through: case regex_constants::syntax_colon: // // a non-capturing mark: @@ -1677,6 +1847,57 @@ bool basic_regex_parser::parse_perl_extension() pb->index = markid = 0; ++m_position; break; + case regex_constants::syntax_digit: + { + // + // a recursive subexpression: + // + v = this->m_traits.toi(m_position, m_end, 10); + if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } +insert_recursion: + pb->index = markid = 0; + static_cast(this->append_state(syntax_element_recurse, sizeof(re_jump)))->alt.i = v; + static_cast( + this->append_state(syntax_element_toggle_case, sizeof(re_case)) + )->icase = this->flags() & regbase::icase; + break; + } + case regex_constants::syntax_plus: + // + // A forward-relative recursive subexpression: + // + ++m_position; 
+ v = this->m_traits.toi(m_position, m_end, 10); + if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + v += m_mark_count; + goto insert_recursion; + case regex_constants::syntax_dash: + // + // Possibly a backward-relative recursive subexpression: + // + ++m_position; + v = this->m_traits.toi(m_position, m_end, 10); + if(v <= 0) + { + --m_position; + // Oops not a relative recursion at all, but a (?-imsx) group: + goto option_group_jump; + } + v = m_mark_count + 1 - v; + if(v <= 0) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + goto insert_recursion; case regex_constants::syntax_equal: pb->index = markid = -1; ++m_position; @@ -1706,8 +1927,10 @@ bool basic_regex_parser::parse_perl_extension() pb->index = markid = -1; else { - fail(regex_constants::error_badrepeat, m_position - m_base); - return false; + // Probably a named capture which also starts (?< : + name_delim = '>'; + --m_position; + goto named_capture_jump; } ++m_position; jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); @@ -1736,7 +1959,95 @@ bool basic_regex_parser::parse_perl_extension() return false; } int v = this->m_traits.toi(m_position, m_end, 10); - if(v > 0) + if(*m_position == charT('R')) + { + if(++m_position == m_end) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + if(*m_position == charT('&')) + { + const charT* base = ++m_position; + while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + v = -static_cast(hash_value_from_capture_name(base, m_position)); + } + else + { + v = -this->m_traits.toi(m_position, m_end, 10); + } + re_brace* br = 
static_cast(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); + br->index = v < 0 ? (v - 1) : 0; + if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + if(++m_position == m_end) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + } + else if((*m_position == charT('\'')) || (*m_position == charT('<'))) + { + const charT* base = ++m_position; + while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\''))) + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + v = static_cast(hash_value_from_capture_name(base, m_position)); + re_brace* br = static_cast(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); + br->index = v; + if((*m_position != charT('>')) && (*m_position != charT('\'')) || (++m_position == m_end)) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + if(++m_position == m_end) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + } + else if(*m_position == charT('D')) + { + const char* def = "DEFINE"; + while(*def && (m_position != m_end) && (*m_position == charT(*def))) + ++m_position, ++def; + if((m_position == m_end) || *def) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + re_brace* br = static_cast(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); + br->index = 9999; // special magic value! 
+ if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + if(++m_position == m_end) + { + fail(regex_constants::error_badrepeat, m_position - m_base); + return false; + } + } + else if(v > 0) { re_brace* br = static_cast(this->append_state(syntax_element_assert_backref, sizeof(re_brace))); br->index = v; @@ -1784,7 +2095,7 @@ bool basic_regex_parser::parse_perl_extension() if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not)) { - fail(regex_constants::error_badrepeat, m_position - m_base); + fail(regex_constants::error_paren, m_position - m_base); return false; } m_position -= 2; @@ -1795,10 +2106,93 @@ bool basic_regex_parser::parse_perl_extension() case regex_constants::syntax_close_mark: fail(regex_constants::error_badrepeat, m_position - m_base); return false; + case regex_constants::escape_type_end_buffer: + { + name_delim = *m_position; +named_capture_jump: + markid = 0; + if(0 == (this->flags() & regbase::nosubs)) + { + markid = ++m_mark_count; + #ifndef BOOST_NO_STD_DISTANCE + if(this->flags() & regbase::save_subexpression_location) + this->m_pdata->m_subs.push_back(std::pair(std::distance(m_base, m_position) - 2, 0)); + #else + if(this->flags() & regbase::save_subexpression_location) + this->m_pdata->m_subs.push_back(std::pair((m_position - m_base) - 2, 0)); + #endif + } + pb->index = markid; + const charT* base = ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_paren, m_position - m_base); + return false; + } + while((m_position != m_end) && (*m_position != name_delim)) + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_paren, m_position - m_base); + return false; + } + this->m_pdata->set_name(base, m_position, markid); + ++m_position; + break; + } default: + if(*m_position == charT('R')) + { + 
++m_position; + v = 0; + if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + goto insert_recursion; + } + if(*m_position == charT('&')) + { + ++m_position; + const charT* base = m_position; + while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + v = static_cast(hash_value_from_capture_name(base, m_position)); + goto insert_recursion; + } + if(*m_position == charT('P')) + { + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + if(*m_position == charT('>')) + { + ++m_position; + const charT* base = m_position; + while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_backref, m_position - m_base); + return false; + } + v = static_cast(hash_value_from_capture_name(base, m_position)); + goto insert_recursion; + } + } // // lets assume that we have a (?imsx) group and try and parse it: // +option_group_jump: regex_constants::syntax_option_type opts = parse_options(); if(m_position == m_end) return false; @@ -1897,9 +2291,20 @@ bool basic_regex_parser::parse_perl_extension() } else if(this->getaddress(static_cast(b)->alt.i, b)->type == syntax_element_alt) { + // Can't have seen more than one alternative: fail(regex_constants::error_bad_pattern, m_position - m_base); return false; } + else + { + // We must *not* have seen an alternative inside a (DEFINE) block: + b = this->getaddress(b->next.i, b); + if((b->type == syntax_element_assert_backref) && (static_cast(b)->index == 9999)) + { + fail(regex_constants::error_bad_pattern, m_position - m_base); + return false; + } + 
} // check for invalid repetition of next state: b = this->getaddress(expected_alt_point); b = this->getaddress(static_cast(b)->next.i, b); @@ -1915,6 +2320,7 @@ bool basic_regex_parser::parse_perl_extension() // pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); pb->index = markid; + pb->icase = this->flags() & regbase::icase; this->m_paren_start = last_paren_start; // // restore the alternate insertion point: @@ -1924,6 +2330,31 @@ bool basic_regex_parser::parse_perl_extension() // and the case change data: // m_has_case_change = old_case_change; + // + // And the mark_reset data: + // + if(m_max_mark > m_mark_count) + { + m_mark_count = m_max_mark; + } + m_mark_reset = mark_reset; + + + if(markid > 0) + { +#ifndef BOOST_NO_STD_DISTANCE + if(this->flags() & regbase::save_subexpression_location) + this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position) - 1; +#else + if(this->flags() & regbase::save_subexpression_location) + this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base) - 1; +#endif + // + // allow backrefs to this mark: + // + if((markid > 0) && (markid < (int)(sizeof(unsigned) * CHAR_BIT))) + this->m_backrefs |= 1u << (markid - 1); + } return true; } diff --git a/include/boost/regex/v4/cpp_regex_traits.hpp b/include/boost/regex/v4/cpp_regex_traits.hpp index 89fe49d8..7ce3ed30 100644 --- a/include/boost/regex/v4/cpp_regex_traits.hpp +++ b/include/boost/regex/v4/cpp_regex_traits.hpp @@ -394,7 +394,9 @@ enum char_class_graph=char_class_alnum|char_class_punct, char_class_blank=1<<9, char_class_word=1<<10, - char_class_unicode=1<<11 + char_class_unicode=1<<11, + char_class_horizontal_space=1<<12, + char_class_vertical_space=1<<13 }; #endif @@ -413,6 +415,8 @@ public: BOOST_STATIC_CONSTANT(char_class_type, mask_blank = 1u << 24); BOOST_STATIC_CONSTANT(char_class_type, mask_word = 1u << 25); BOOST_STATIC_CONSTANT(char_class_type, mask_unicode = 1u << 26); + BOOST_STATIC_CONSTANT(char_class_type, 
mask_horizontal = 1u << 27); + BOOST_STATIC_CONSTANT(char_class_type, mask_vertical = 1u << 28); #endif typedef std::basic_string string_type; @@ -477,6 +481,10 @@ template typename cpp_regex_traits_implementation::char_class_type const cpp_regex_traits_implementation::mask_word; template typename cpp_regex_traits_implementation::char_class_type const cpp_regex_traits_implementation::mask_unicode; +template +typename cpp_regex_traits_implementation::char_class_type const cpp_regex_traits_implementation::mask_vertical; +template +typename cpp_regex_traits_implementation::char_class_type const cpp_regex_traits_implementation::mask_horizontal; #endif #endif @@ -688,18 +696,20 @@ void cpp_regex_traits_implementation::init() // Custom class names: // #ifndef BOOST_REGEX_BUGGY_CTYPE_FACET - static const char_class_type masks[14] = + static const char_class_type masks[16] = { std::ctype::alnum, std::ctype::alpha, std::ctype::cntrl, std::ctype::digit, std::ctype::graph, + cpp_regex_traits_implementation::mask_horizontal, std::ctype::lower, std::ctype::print, std::ctype::punct, std::ctype::space, std::ctype::upper, + cpp_regex_traits_implementation::mask_vertical, std::ctype::xdigit, cpp_regex_traits_implementation::mask_blank, cpp_regex_traits_implementation::mask_word, @@ -713,11 +723,13 @@ void cpp_regex_traits_implementation::init() ::boost::re_detail::char_class_cntrl, ::boost::re_detail::char_class_digit, ::boost::re_detail::char_class_graph, + ::boost::re_detail::char_class_horizontal_space, ::boost::re_detail::char_class_lower, ::boost::re_detail::char_class_print, ::boost::re_detail::char_class_punct, ::boost::re_detail::char_class_space, ::boost::re_detail::char_class_upper, + ::boost::re_detail::char_class_vertical_space, ::boost::re_detail::char_class_xdigit, ::boost::re_detail::char_class_blank, ::boost::re_detail::char_class_word, @@ -744,7 +756,7 @@ typename cpp_regex_traits_implementation::char_class_type 
cpp_regex_traits_implementation::lookup_classname_imp(const charT* p1, const charT* p2) const { #ifndef BOOST_REGEX_BUGGY_CTYPE_FACET - static const char_class_type masks[20] = + static const char_class_type masks[22] = { 0, std::ctype::alnum, @@ -754,6 +766,7 @@ typename cpp_regex_traits_implementation::char_class_type std::ctype::digit, std::ctype::digit, std::ctype::graph, + cpp_regex_traits_implementation::mask_horizontal, std::ctype::lower, std::ctype::lower, std::ctype::print, @@ -763,12 +776,13 @@ typename cpp_regex_traits_implementation::char_class_type std::ctype::upper, cpp_regex_traits_implementation::mask_unicode, std::ctype::upper, + cpp_regex_traits_implementation::mask_vertical, std::ctype::alnum | cpp_regex_traits_implementation::mask_word, std::ctype::alnum | cpp_regex_traits_implementation::mask_word, std::ctype::xdigit, }; #else - static const char_class_type masks[20] = + static const char_class_type masks[22] = { 0, ::boost::re_detail::char_class_alnum, @@ -778,6 +792,7 @@ typename cpp_regex_traits_implementation::char_class_type ::boost::re_detail::char_class_digit, ::boost::re_detail::char_class_digit, ::boost::re_detail::char_class_graph, + ::boost::re_detail::char_class_horizontal_space, ::boost::re_detail::char_class_lower, ::boost::re_detail::char_class_lower, ::boost::re_detail::char_class_print, @@ -787,6 +802,7 @@ typename cpp_regex_traits_implementation::char_class_type ::boost::re_detail::char_class_upper, ::boost::re_detail::char_class_unicode, ::boost::re_detail::char_class_upper, + ::boost::re_detail::char_class_vertical_space, ::boost::re_detail::char_class_alnum | ::boost::re_detail::char_class_word, ::boost::re_detail::char_class_alnum | ::boost::re_detail::char_class_word, ::boost::re_detail::char_class_xdigit, @@ -820,7 +836,9 @@ bool cpp_regex_traits_implementation::isctype(const charT c, char_class_t || ((mask & ::boost::re_detail::char_class_xdigit) && (m_pctype->is(std::ctype::xdigit, c))) || ((mask & 
::boost::re_detail::char_class_blank) && (m_pctype->is(std::ctype::space, c)) && !::boost::re_detail::is_separator(c)) || ((mask & ::boost::re_detail::char_class_word) && (c == '_')) - || ((mask & ::boost::re_detail::char_class_unicode) && ::boost::re_detail::is_extended(c)); + || ((mask & ::boost::re_detail::char_class_unicode) && ::boost::re_detail::is_extended(c)) + || ((mask & ::boost::re_detail::char_class_vertical) && (is_separator(c) || (c == '\v'))) + || ((mask & ::boost::re_detail::char_class_horizontal) && m_pctype->is(std::ctype::space, c) && !(is_separator(c) || (c == '\v'))); } #endif @@ -930,6 +948,12 @@ public: && m_pimpl->m_pctype->is(std::ctype::space, c) && !re_detail::is_separator(c)) return true; + else if((f & re_detail::cpp_regex_traits_implementation::mask_vertical) + && (::boost::re_detail::is_separator(c) || (c == '\v'))) + return true; + else if((f & re_detail::cpp_regex_traits_implementation::mask_horizontal) + && this->isctype(c, std::ctype::space) && !this->isctype(c, re_detail::cpp_regex_traits_implementation::mask_vertical)) + return true; return false; #else return m_pimpl->isctype(c, f); diff --git a/include/boost/regex/v4/iterator_category.hpp b/include/boost/regex/v4/iterator_category.hpp index 20870a0c..9e401423 100644 --- a/include/boost/regex/v4/iterator_category.hpp +++ b/include/boost/regex/v4/iterator_category.hpp @@ -31,10 +31,14 @@ namespace detail{ template struct is_random_imp { +#ifndef BOOST_NO_STD_ITERATOR_TRAITS private: typedef typename std::iterator_traits::iterator_category cat; public: BOOST_STATIC_CONSTANT(bool, value = (::boost::is_convertible::value)); +#else + BOOST_STATIC_CONSTANT(bool, value = false); +#endif }; template diff --git a/include/boost/regex/v4/match_results.hpp b/include/boost/regex/v4/match_results.hpp index acf509fa..09dd31f0 100644 --- a/include/boost/regex/v4/match_results.hpp +++ b/include/boost/regex/v4/match_results.hpp @@ -36,6 +36,13 @@ namespace boost{ #pragma warning(disable : 4251 
4231 4660) #endif +namespace re_detail{ + +template +class named_subexpressions; + +} + template class match_results { @@ -62,13 +69,14 @@ public: typedef typename re_detail::regex_iterator_traits< BidiIterator>::value_type char_type; typedef std::basic_string string_type; + typedef re_detail::named_subexpressions_base named_sub_type; // construct/copy/destroy: explicit match_results(const Allocator& a = Allocator()) #ifndef BOOST_NO_STD_ALLOCATOR - : m_subs(a), m_base() {} + : m_subs(a), m_base(), m_last_closed_paren(0) {} #else - : m_subs(), m_base() { (void)a; } + : m_subs(), m_base(), m_last_closed_paren(0) { (void)a; } #endif match_results(const match_results& m) : m_subs(m.m_subs), m_base(m.m_base) {} @@ -95,6 +103,24 @@ public: return m_subs[sub].length(); return 0; } + difference_type length(const char_type* sub) const + { + const char_type* end = sub; + while(*end) ++end; + return length(named_subexpression_index(sub, end)); + } + template + difference_type length(const charT* sub) const + { + const charT* end = sub; + while(*end) ++end; + return length(named_subexpression_index(sub, end)); + } + template + difference_type length(const std::basic_string& sub) const + { + return length(sub.c_str()); + } difference_type position(size_type sub = 0) const { sub += 2; @@ -108,6 +134,24 @@ public: } return ~static_cast(0); } + difference_type position(const char_type* sub) const + { + const char_type* end = sub; + while(*end) ++end; + return position(named_subexpression_index(sub, end)); + } + template + difference_type position(const charT* sub) const + { + const charT* end = sub; + while(*end) ++end; + return position(named_subexpression_index(sub, end)); + } + template + difference_type position(const std::basic_string& sub) const + { + return position(sub.c_str()); + } string_type str(int sub = 0) const { sub += 2; @@ -122,6 +166,25 @@ public: } return result; } + string_type str(const char_type* sub) const + { + return (*this)[sub].str(); + } + template + 
string_type str(const std::basic_string& sub) const + { + return (*this)[sub].str(); + } + template + string_type str(const charT* sub) const + { + return (*this)[sub].str(); + } + template + string_type str(const std::basic_string& sub) const + { + return (*this)[sub].str(); + } const_reference operator[](int sub) const { sub += 2; @@ -131,6 +194,75 @@ public: } return m_null; } + // + // Named sub-expressions: + // + const_reference named_subexpression(const char_type* i, const char_type* j) const + { + int index = m_named_subs->get_id(i, j); + return index > 0 ? (*this)[index] : m_null; + } + template + const_reference named_subexpression(const charT* i, const charT* j) const + { + BOOST_STATIC_ASSERT(sizeof(charT) <= sizeof(char_type)); + if(i == j) + return m_null; + std::vector s; + while(i != j) + s.insert(s.end(), *i++); + return named_subexpression(&*s.begin(), &*s.begin() + s.size()); + } + int named_subexpression_index(const char_type* i, const char_type* j) const + { + int index = m_named_subs->get_id(i, j); + return index > 0 ? 
index : -20; + } + template + int named_subexpression_index(const charT* i, const charT* j) const + { + BOOST_STATIC_ASSERT(sizeof(charT) <= sizeof(char_type)); + if(i == j) + return -20; + std::vector s; + while(i != j) + s.insert(s.end(), *i++); + return named_subexpression_index(&*s.begin(), &*s.begin() + s.size()); + } + template + const_reference operator[](const std::basic_string& s) const + { + return named_subexpression(s.c_str(), s.c_str() + s.size()); + } + const_reference operator[](const char_type* p) const + { + const char_type* e = p; + while(*e) ++e; + return named_subexpression(p, e); + } + + template + const_reference operator[](const charT* p) const + { + BOOST_STATIC_ASSERT(sizeof(charT) <= sizeof(char_type)); + if(*p == 0) + return m_null; + std::vector s; + while(*p) + s.insert(s.end(), *p++); + return named_subexpression(&*s.begin(), &*s.begin() + s.size()); + } + template + const_reference operator[](const std::basic_string& ns) const + { + BOOST_STATIC_ASSERT(sizeof(charT) <= sizeof(char_type)); + if(ns.empty()) + return m_null; + std::vector s; + for(unsigned i = 0; i < ns.size(); ++i) + s.insert(s.end(), ns[i]); + return named_subexpression(&*s.begin(), &*s.begin() + s.size()); + } const_reference prefix() const { @@ -186,6 +318,10 @@ public: ::boost::re_detail::regex_format_imp(i, *this, fmt.data(), fmt.data() + fmt.size(), flags, re.get_traits()); return result; } + const_reference get_last_closed_paren()const + { + return m_last_closed_paren == 0 ? 
m_null : (*this)[m_last_closed_paren]; + } allocator_type get_allocator() const { @@ -230,13 +366,15 @@ public: m_null.matched = false; } - void BOOST_REGEX_CALL set_second(BidiIterator i, size_type pos, bool m = true) + void BOOST_REGEX_CALL set_second(BidiIterator i, size_type pos, bool m = true, bool escape_k = false) { + if(pos) + m_last_closed_paren = pos; pos += 2; BOOST_ASSERT(m_subs.size() > pos); m_subs[pos].second = i; m_subs[pos].matched = m; - if(pos == 2) + if((pos == 2) && !escape_k) { m_subs[0].first = i; m_subs[0].matched = (m_subs[0].first != m_subs[0].second); @@ -261,6 +399,7 @@ public: m_subs.insert(m_subs.end(), n+2-len, v); } m_subs[1].first = i; + m_last_closed_paren = 0; } void BOOST_REGEX_CALL set_base(BidiIterator pos) { @@ -284,21 +423,34 @@ public: m_subs[n].matched = false; } } - void BOOST_REGEX_CALL set_first(BidiIterator i, size_type pos) + void BOOST_REGEX_CALL set_first(BidiIterator i, size_type pos, bool escape_k = false) { BOOST_ASSERT(pos+2 < m_subs.size()); - if(pos) + if(pos || escape_k) + { m_subs[pos+2].first = i; + if(escape_k) + { + m_subs[1].second = i; + m_subs[1].matched = (m_subs[1].first != m_subs[1].second); + } + } else set_first(i); } void BOOST_REGEX_CALL maybe_assign(const match_results& m); + void BOOST_REGEX_CALL set_named_subs(boost::shared_ptr subs) + { + m_named_subs = subs; + } private: vector_type m_subs; // subexpressions BidiIterator m_base; // where the search started from sub_match m_null; // a null match + boost::shared_ptr m_named_subs; + int m_last_closed_paren; }; template diff --git a/include/boost/regex/v4/perl_matcher.hpp b/include/boost/regex/v4/perl_matcher.hpp index 33afe6e9..726c2881 100644 --- a/include/boost/regex/v4/perl_matcher.hpp +++ b/include/boost/regex/v4/perl_matcher.hpp @@ -285,7 +285,8 @@ public: } ~repeater_count() { - *stack = next; + if(next) + *stack = next; } std::size_t get_count() { return count; } int get_id() { return state_id; } @@ -325,6 +326,17 @@ enum 
saved_state_type saved_state_count = 14 }; +template +struct recursion_info +{ + typedef typename Results::value_type value_type; + typedef typename value_type::iterator iterator; + int id; + const re_syntax_base* preturn_address; + Results results; + repeater_count* repeater_stack; +}; + #ifdef BOOST_MSVC #pragma warning(push) #pragma warning(disable : 4251 4231 4660) @@ -340,6 +352,7 @@ public: typedef std::size_t traits_size_type; typedef typename is_byte::width_type width_type; typedef typename regex_iterator_traits::difference_type difference_type; + typedef match_results results_type; perl_matcher(BidiIterator first, BidiIterator end, match_results& what, @@ -348,7 +361,7 @@ public: BidiIterator l_base) : m_result(what), base(first), last(end), position(first), backstop(l_base), re(e), traits_inst(e.get_traits()), - m_independent(false), next_count(&rep_obj), rep_obj(&next_count) + m_independent(false), next_count(&rep_obj), rep_obj(&next_count), recursion_stack_position(0) { construct_init(e, f); } @@ -403,12 +416,17 @@ private: bool match_char_repeat(); bool match_dot_repeat_fast(); bool match_dot_repeat_slow(); + bool match_dot_repeat_dispatch() + { + return ::boost::is_random_access_iterator::value ? 
match_dot_repeat_fast() : match_dot_repeat_slow(); + } bool match_backstep(); bool match_assert_backref(); bool match_toggle_case(); #ifdef BOOST_REGEX_RECURSIVE bool backtrack_till_match(std::size_t count); #endif + bool match_recursion(); // find procs stored in s_find_vtable: bool find_restart_any(); @@ -464,6 +482,9 @@ private: typename traits::char_class_type m_word_mask; // the bitmask to use when determining whether a match_any matches a newline or not: unsigned char match_any_mask; + // recursion information: + recursion_info recursion_stack[50]; + unsigned recursion_stack_position; #ifdef BOOST_REGEX_NON_RECURSIVE // @@ -487,6 +508,8 @@ private: bool unwind_short_set_repeat(bool); bool unwind_long_set_repeat(bool); bool unwind_non_greedy_repeat(bool); + bool unwind_recursion(bool); + bool unwind_recursion_pop(bool); void destroy_single_repeat(); void push_matched_paren(int index, const sub_match& sub); void push_recursion_stopper(); @@ -495,7 +518,8 @@ private: void push_repeater_count(int i, repeater_count** s); void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int state_id); void push_non_greedy_repeat(const re_syntax_base* ps); - + void push_recursion(int id, const re_syntax_base* p, results_type* presults); + void push_recursion_pop(); // pointer to base of stack: saved_state* m_stack_base; diff --git a/include/boost/regex/v4/perl_matcher_common.hpp b/include/boost/regex/v4/perl_matcher_common.hpp index 399caa3b..fd439f84 100644 --- a/include/boost/regex/v4/perl_matcher_common.hpp +++ b/include/boost/regex/v4/perl_matcher_common.hpp @@ -200,12 +200,13 @@ bool perl_matcher::match_imp() m_match_flags |= regex_constants::match_all; m_presult->set_size((m_match_flags & match_nosubs) ? 
1 : re.mark_count(), search_base, last); m_presult->set_base(base); + m_presult->set_named_subs(re_detail::convert_to_named_subs::char_type>(this->re.get_named_subs())); if(m_match_flags & match_posix) m_result = *m_presult; verify_options(re.flags(), m_match_flags); if(0 == match_prefix()) return false; - return m_result[0].second == last; + return (m_result[0].second == last) && (m_result[0].first == base); #if defined(BOOST_REGEX_NON_RECURSIVE) && !defined(BOOST_NO_EXCEPTIONS) } @@ -261,6 +262,7 @@ bool perl_matcher::find_imp() pstate = re.get_first_state(); m_presult->set_size((m_match_flags & match_nosubs) ? 1 : re.mark_count(), base, last); m_presult->set_base(base); + m_presult->set_named_subs(re_detail::convert_to_named_subs::char_type>(this->re.get_named_subs())); m_match_flags |= regex_constants::match_init; } else @@ -343,25 +345,6 @@ bool perl_matcher::match_prefix() return m_has_found_match; } -template -bool perl_matcher::match_endmark() -{ - int index = static_cast(pstate)->index; - if(index > 0) - { - if((m_match_flags & match_nosubs) == 0) - m_presult->set_second(position, index); - } - else if((index < 0) && (index != -4)) - { - // matched forward lookahead: - pstate = 0; - return true; - } - pstate = pstate->next.p; - return true; -} - template bool perl_matcher::match_literal() { @@ -462,35 +445,6 @@ bool perl_matcher::match_wild() return true; } -template -bool perl_matcher::match_match() -{ - if((m_match_flags & match_not_null) && (position == (*m_presult)[0].first)) - return false; - if((m_match_flags & match_all) && (position != last)) - return false; - if((m_match_flags & regex_constants::match_not_initial_null) && (position == search_base)) - return false; - m_presult->set_second(position); - pstate = 0; - m_has_found_match = true; - if((m_match_flags & match_posix) == match_posix) - { - m_result.maybe_assign(*m_presult); - if((m_match_flags & match_any) == 0) - return false; - } -#ifdef BOOST_REGEX_MATCH_EXTRA - if(match_extra & 
m_match_flags) - { - for(unsigned i = 0; i < m_presult->size(); ++i) - if((*m_presult)[i].matched) - ((*m_presult)[i]).get_captures().push_back((*m_presult)[i]); - } -#endif - return true; -} - template bool perl_matcher::match_word_boundary() { @@ -758,8 +712,32 @@ template inline bool perl_matcher::match_assert_backref() { // return true if marked sub-expression N has been matched: - bool result = (*m_presult)[static_cast(pstate)->index].matched; - pstate = pstate->next.p; + int index = static_cast(pstate)->index; + bool result; + if(index == 9999) + { + // Magic value for a (DEFINE) block: + return false; + } + else if(index > 0) + { + // Check if index is a hash value: + if(index >= 10000) + index = re.get_data().get_id(index); + // Have we matched subexpression "index"? + result = (*m_presult)[index].matched; + pstate = pstate->next.p; + } + else + { + // Have we recursed into subexpression "index"? + // If index == 0 then check for any recursion at all, otherwise for recursion to -index-1. 
+ int id = -index-1; + if(id >= 10000) + id = re.get_data().get_id(id); + result = recursion_stack_position && ((recursion_stack[recursion_stack_position-1].id == id) || (index == 0)); + pstate = pstate->next.p; + } return result; } diff --git a/include/boost/regex/v4/perl_matcher_non_recursive.hpp b/include/boost/regex/v4/perl_matcher_non_recursive.hpp index 10e03477..2ce7ebe0 100644 --- a/include/boost/regex/v4/perl_matcher_non_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_non_recursive.hpp @@ -127,10 +127,21 @@ struct saved_single_repeat : public saved_state : saved_state(arg_id), count(c), rep(r), last_position(lp){} }; +template +struct saved_recursion : public saved_state +{ + saved_recursion(int id, const re_syntax_base* p, Results* pr) + : saved_state(14), recursion_id(id), preturn_address(p), results(*pr) + {} + int recursion_id; + const re_syntax_base* preturn_address; + Results results; +}; + template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[29] = + static matcher_proc_type const s_match_vtable[30] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -154,13 +165,18 @@ bool perl_matcher::match_all_states() &perl_matcher::match_combining, &perl_matcher::match_soft_buffer_end, &perl_matcher::match_restart_continue, - (::boost::is_random_access_iterator::value ? &perl_matcher::match_dot_repeat_fast : &perl_matcher::match_dot_repeat_slow), + // Although this next line *should* be evaluated at compile time, in practice + // some compilers (VC++) emit run-time initialisation which breaks thread + // safety, so use a dispatch function instead: + //(::boost::is_random_access_iterator::value ? 
&perl_matcher::match_dot_repeat_fast : &perl_matcher::match_dot_repeat_slow), + &perl_matcher::match_dot_repeat_dispatch, &perl_matcher::match_char_repeat, &perl_matcher::match_set_repeat, &perl_matcher::match_long_set_repeat, &perl_matcher::match_backstep, &perl_matcher::match_assert_backref, &perl_matcher::match_toggle_case, + &perl_matcher::match_recursion, }; push_recursion_stopper(); @@ -209,7 +225,7 @@ void perl_matcher::extend_stack() template inline void perl_matcher::push_matched_paren(int index, const sub_match& sub) { - BOOST_ASSERT(index); + //BOOST_ASSERT(index); saved_matched_paren* pmp = static_cast*>(m_backup_state); --pmp; if(pmp < m_stack_base) @@ -312,10 +328,26 @@ inline void perl_matcher::push_single_repeat(st m_backup_state = pmp; } +template +inline void perl_matcher::push_recursion(int id, const re_syntax_base* p, results_type* presults) +{ + saved_recursion* pmp = static_cast*>(m_backup_state); + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = static_cast*>(m_backup_state); + --pmp; + } + (void) new (pmp)saved_recursion(id, p, presults); + m_backup_state = pmp; +} + template bool perl_matcher::match_startmark() { int index = static_cast(pstate)->index; + icase = static_cast(pstate)->icase; switch(index) { case 0: @@ -400,6 +432,13 @@ bool perl_matcher::match_startmark() break; } } + case -5: + { + push_matched_paren(0, (*m_presult)[0]); + m_presult->set_first(position, 0, true); + pstate = pstate->next.p; + break; + } default: { BOOST_ASSERT(index > 0); @@ -848,6 +887,100 @@ bool perl_matcher::match_long_set_repeat() #endif } +template +bool perl_matcher::match_recursion() +{ + BOOST_ASSERT(pstate->type == syntax_element_recurse); + // + // Backup call stack: + // + push_recursion_pop(); + // + // Set new call stack: + // + if(recursion_stack_position >= static_cast(sizeof(recursion_stack)/sizeof(recursion_stack[0]))) + { + return false; + } + recursion_stack[recursion_stack_position].preturn_address = pstate->next.p; + 
recursion_stack[recursion_stack_position].results = *m_presult; + pstate = static_cast(pstate)->alt.p; + recursion_stack[recursion_stack_position].id = static_cast(pstate)->index; + ++recursion_stack_position; + //BOOST_ASSERT(recursion_stack[recursion_stack_position-1].id); + return true; +} + +template +bool perl_matcher::match_endmark() +{ + int index = static_cast(pstate)->index; + icase = static_cast(pstate)->icase; + if(index > 0) + { + if((m_match_flags & match_nosubs) == 0) + { + m_presult->set_second(position, index); + } + if(recursion_stack_position) + { + if(index == recursion_stack[recursion_stack_position-1].id) + { + --recursion_stack_position; + pstate = recursion_stack[recursion_stack_position].preturn_address; + *m_presult = recursion_stack[recursion_stack_position].results; + push_recursion(recursion_stack[recursion_stack_position].id, recursion_stack[recursion_stack_position].preturn_address, &recursion_stack[recursion_stack_position].results); + } + } + } + else if((index < 0) && (index != -4)) + { + // matched forward lookahead: + pstate = 0; + return true; + } + pstate = pstate->next.p; + return true; +} + +template +bool perl_matcher::match_match() +{ + if(recursion_stack_position) + { + BOOST_ASSERT(0 == recursion_stack[recursion_stack_position-1].id); + --recursion_stack_position; + pstate = recursion_stack[recursion_stack_position].preturn_address; + *m_presult = recursion_stack[recursion_stack_position].results; + push_recursion(recursion_stack[recursion_stack_position].id, recursion_stack[recursion_stack_position].preturn_address, &recursion_stack[recursion_stack_position].results); + return true; + } + if((m_match_flags & match_not_null) && (position == (*m_presult)[0].first)) + return false; + if((m_match_flags & match_all) && (position != last)) + return false; + if((m_match_flags & regex_constants::match_not_initial_null) && (position == search_base)) + return false; + m_presult->set_second(position); + pstate = 0; + 
m_has_found_match = true; + if((m_match_flags & match_posix) == match_posix) + { + m_result.maybe_assign(*m_presult); + if((m_match_flags & match_any) == 0) + return false; + } +#ifdef BOOST_REGEX_MATCH_EXTRA + if(match_extra & m_match_flags) + { + for(unsigned i = 0; i < m_presult->size(); ++i) + if((*m_presult)[i].matched) + ((*m_presult)[i]).get_captures().push_back((*m_presult)[i]); + } +#endif + return true; +} + /**************************************************************************** Unwind and associated proceedures follow, these perform what normal stack @@ -858,7 +991,7 @@ unwinding does in the recursive implementation. template bool perl_matcher::unwind(bool have_match) { - static unwind_proc_type const s_unwind_table[14] = + static unwind_proc_type const s_unwind_table[18] = { &perl_matcher::unwind_end, &perl_matcher::unwind_paren, @@ -874,6 +1007,8 @@ bool perl_matcher::unwind(bool have_match) &perl_matcher::unwind_short_set_repeat, &perl_matcher::unwind_long_set_repeat, &perl_matcher::unwind_non_greedy_repeat, + &perl_matcher::unwind_recursion, + &perl_matcher::unwind_recursion_pop, }; m_recursive_result = have_match; @@ -907,8 +1042,8 @@ bool perl_matcher::unwind_paren(bool have_match // restore previous values if no match was found: if(have_match == false) { - m_presult->set_first(pmp->sub.first, pmp->index); - m_presult->set_second(pmp->sub.second, pmp->index, pmp->sub.matched); + m_presult->set_first(pmp->sub.first, pmp->index, pmp->index == 0); + m_presult->set_second(pmp->sub.second, pmp->index, pmp->sub.matched, pmp->index == 0); } #ifdef BOOST_REGEX_MATCH_EXTRA // @@ -1377,6 +1512,106 @@ bool perl_matcher::unwind_non_greedy_repeat(boo return r; } +template +bool perl_matcher::unwind_recursion(bool r) +{ + saved_recursion* pmp = static_cast*>(m_backup_state); + if(!r) + { + recursion_stack[recursion_stack_position].id = pmp->recursion_id; + recursion_stack[recursion_stack_position].preturn_address = pmp->preturn_address; + 
recursion_stack[recursion_stack_position].results = pmp->results; + ++recursion_stack_position; + } + boost::re_detail::inplace_destroy(pmp++); + m_backup_state = pmp; + return true; +} + +template +bool perl_matcher::unwind_recursion_pop(bool r) +{ + saved_state* pmp = static_cast(m_backup_state); + if(!r) + { + --recursion_stack_position; + } + boost::re_detail::inplace_destroy(pmp++); + m_backup_state = pmp; + return true; +} + +template +void perl_matcher::push_recursion_pop() +{ + saved_state* pmp = static_cast(m_backup_state); + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = static_cast(m_backup_state); + --pmp; + } + (void) new (pmp)saved_state(15); + m_backup_state = pmp; +} +/* +template +bool perl_matcher::unwind_parenthesis_pop(bool r) +{ + saved_state* pmp = static_cast(m_backup_state); + if(!r) + { + --parenthesis_stack_position; + } + boost::re_detail::inplace_destroy(pmp++); + m_backup_state = pmp; + return true; +} + +template +void perl_matcher::push_parenthesis_pop() +{ + saved_state* pmp = static_cast(m_backup_state); + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = static_cast(m_backup_state); + --pmp; + } + (void) new (pmp)saved_state(16); + m_backup_state = pmp; +} + +template +bool perl_matcher::unwind_parenthesis_push(bool r) +{ + saved_position* pmp = static_cast*>(m_backup_state); + if(!r) + { + parenthesis_stack[parenthesis_stack_position++] = pmp->position; + } + boost::re_detail::inplace_destroy(pmp++); + m_backup_state = pmp; + return true; +} + +template +inline void perl_matcher::push_parenthesis_push(BidiIterator p) +{ + saved_position* pmp = static_cast*>(m_backup_state); + --pmp; + if(pmp < m_stack_base) + { + extend_stack(); + pmp = static_cast*>(m_backup_state); + --pmp; + } + (void) new (pmp)saved_position(0, p, 17); + m_backup_state = pmp; +} +*/ } // namespace re_detail } // namespace boost diff --git a/include/boost/regex/v4/perl_matcher_recursive.hpp 
b/include/boost/regex/v4/perl_matcher_recursive.hpp index 68e1aac9..48f08b7b 100644 --- a/include/boost/regex/v4/perl_matcher_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_recursive.hpp @@ -51,8 +51,8 @@ public: template void restore(match_results& w) { - w.set_first(sub.first, index); - w.set_second(sub.second, index, sub.matched); + w.set_first(sub.first, index, index == 0); + w.set_second(sub.second, index, sub.matched, index == 0); } const sub_match& get() { return sub; } }; @@ -60,7 +60,7 @@ public: template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[29] = + static matcher_proc_type const s_match_vtable[30] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -84,13 +84,18 @@ bool perl_matcher::match_all_states() &perl_matcher::match_combining, &perl_matcher::match_soft_buffer_end, &perl_matcher::match_restart_continue, - (::boost::is_random_access_iterator::value ? &perl_matcher::match_dot_repeat_fast : &perl_matcher::match_dot_repeat_slow), + // Although this next line *should* be evaluated at compile time, in practice + // some compilers (VC++) emit run-time initialisation which breaks thread + // safety, so use a dispatch function instead: + //(::boost::is_random_access_iterator::value ? 
&perl_matcher::match_dot_repeat_fast : &perl_matcher::match_dot_repeat_slow), + &perl_matcher::match_dot_repeat_dispatch, &perl_matcher::match_char_repeat, &perl_matcher::match_set_repeat, &perl_matcher::match_long_set_repeat, &perl_matcher::match_backstep, &perl_matcher::match_assert_backref, &perl_matcher::match_toggle_case, + &perl_matcher::match_recursion, }; if(state_count > max_state_count) @@ -113,6 +118,7 @@ template bool perl_matcher::match_startmark() { int index = static_cast(pstate)->index; + icase = static_cast(pstate)->icase; bool r = true; switch(index) { @@ -205,6 +211,17 @@ bool perl_matcher::match_startmark() break; } } + case -5: + { + // Reset start of $0, since we have a \K escape + backup_subex sub(*m_presult, 0); + m_presult->set_first(position, 0, true); + pstate = pstate->next.p; + r = match_all_states(); + if(r == false) + sub.restore(*m_presult); + break; + } default: { BOOST_ASSERT(index > 0); @@ -833,6 +850,127 @@ bool perl_matcher::backtrack_till_match(std::si #endif } +template +bool perl_matcher::match_recursion() +{ + BOOST_ASSERT(pstate->type == syntax_element_recurse); + // + // Set new call stack: + // + if(recursion_stack_position >= static_cast(sizeof(recursion_stack)/sizeof(recursion_stack[0]))) + { + return false; + } + recursion_stack[recursion_stack_position].preturn_address = pstate->next.p; + recursion_stack[recursion_stack_position].results = *m_presult; + recursion_stack[recursion_stack_position].repeater_stack = next_count; + pstate = static_cast(pstate)->alt.p; + recursion_stack[recursion_stack_position].id = static_cast(pstate)->index; + ++recursion_stack_position; + + repeater_count* saved = next_count; + repeater_count r(&next_count); // resets all repeat counts since we're recursing and starting fresh on those + next_count = &r; + bool result = match_all_states(); + next_count = saved; + + if(!result) + { + --recursion_stack_position; + next_count = recursion_stack[recursion_stack_position].repeater_stack; + 
*m_presult = recursion_stack[recursion_stack_position].results; + return false; + } + return true; +} + +template +bool perl_matcher::match_endmark() +{ + int index = static_cast(pstate)->index; + icase = static_cast(pstate)->icase; + if(index > 0) + { + if((m_match_flags & match_nosubs) == 0) + { + m_presult->set_second(position, index); + } + if(recursion_stack_position) + { + if(index == recursion_stack[recursion_stack_position-1].id) + { + --recursion_stack_position; + recursion_info saved = recursion_stack[recursion_stack_position]; + const re_syntax_base* saved_state = pstate = saved.preturn_address; + repeater_count* saved_count = next_count; + next_count = saved.repeater_stack; + *m_presult = saved.results; + if(!match_all_states()) + { + recursion_stack[recursion_stack_position] = saved; + ++recursion_stack_position; + next_count = saved_count; + return false; + } + } + } + } + else if((index < 0) && (index != -4)) + { + // matched forward lookahead: + pstate = 0; + return true; + } + pstate = pstate ? 
pstate->next.p : 0; + return true; +} + +template +bool perl_matcher::match_match() +{ + if(recursion_stack_position) + { + BOOST_ASSERT(0 == recursion_stack[recursion_stack_position-1].id); + --recursion_stack_position; + const re_syntax_base* saved_state = pstate = recursion_stack[recursion_stack_position].preturn_address; + *m_presult = recursion_stack[recursion_stack_position].results; + if(!match_all_states()) + { + recursion_stack[recursion_stack_position].preturn_address = saved_state; + recursion_stack[recursion_stack_position].results = *m_presult; + ++recursion_stack_position; + return false; + } + return true; + } + if((m_match_flags & match_not_null) && (position == (*m_presult)[0].first)) + return false; + if((m_match_flags & match_all) && (position != last)) + return false; + if((m_match_flags & regex_constants::match_not_initial_null) && (position == search_base)) + return false; + m_presult->set_second(position); + pstate = 0; + m_has_found_match = true; + if((m_match_flags & match_posix) == match_posix) + { + m_result.maybe_assign(*m_presult); + if((m_match_flags & match_any) == 0) + return false; + } +#ifdef BOOST_REGEX_MATCH_EXTRA + if(match_extra & m_match_flags) + { + for(unsigned i = 0; i < m_presult->size(); ++i) + if((*m_presult)[i].matched) + ((*m_presult)[i]).get_captures().push_back((*m_presult)[i]); + } +#endif + return true; +} + + + } // namespace re_detail } // namespace boost #ifdef BOOST_MSVC diff --git a/include/boost/regex/v4/regex_format.hpp b/include/boost/regex/v4/regex_format.hpp index d114c2ed..4e95112f 100644 --- a/include/boost/regex/v4/regex_format.hpp +++ b/include/boost/regex/v4/regex_format.hpp @@ -107,6 +107,7 @@ private: void format_escape(); void format_conditional(); void format_until_scope_end(); + bool handle_perl_verb(bool have_brace); const traits& m_traits; // the traits class for localised formatting operations const Results& m_results; // the match_results being used. 
@@ -250,6 +251,25 @@ void basic_regex_formatter::format_perl() case '$': put(*m_position++); break; + case '+': + if((++m_position != m_end) && (*m_position == '{')) + { + const char_type* base = ++m_position; + while((m_position != m_end) && (*m_position != '}')) ++m_position; + if(m_position != m_end) + { + // Named sub-expression: + put(this->m_results.named_subexpression(base, m_position)); + ++m_position; + break; + } + else + { + m_position = --base; + } + } + put((this->m_results)[this->m_results.size() > 1 ? this->m_results.size() - 1 : 1]); + break; case '{': have_brace = true; ++m_position; @@ -258,14 +278,18 @@ void basic_regex_formatter::format_perl() // see if we have a number: { std::ptrdiff_t len = ::boost::re_detail::distance(m_position, m_end); - len = (std::min)(static_cast(2), len); + //len = (std::min)(static_cast(2), len); int v = m_traits.toi(m_position, m_position + len, 10); if((v < 0) || (have_brace && ((m_position == m_end) || (*m_position != '}')))) { - // leave the $ as is, and carry on: - m_position = --save_position; - put(*m_position); - ++m_position; + // Look for a Perl-5.10 verb: + if(!handle_perl_verb(have_brace)) + { + // leave the $ as is, and carry on: + m_position = --save_position; + put(*m_position); + ++m_position; + } break; } // otherwise output sub v: @@ -276,6 +300,123 @@ void basic_regex_formatter::format_perl() } } +template +bool basic_regex_formatter::handle_perl_verb(bool have_brace) +{ + // + // We may have a capitalised string containing a Perl action: + // + static const char_type MATCH[] = { 'M', 'A', 'T', 'C', 'H' }; + static const char_type PREMATCH[] = { 'P', 'R', 'E', 'M', 'A', 'T', 'C', 'H' }; + static const char_type POSTMATCH[] = { 'P', 'O', 'S', 'T', 'M', 'A', 'T', 'C', 'H' }; + static const char_type LAST_PAREN_MATCH[] = { 'L', 'A', 'S', 'T', '_', 'P', 'A', 'R', 'E', 'N', '_', 'M', 'A', 'T', 'C', 'H' }; + static const char_type LAST_SUBMATCH_RESULT[] = { 'L', 'A', 'S', 'T', '_', 'S', 'U', 'B', 'M', 
'A', 'T', 'C', 'H', '_', 'R', 'E', 'S', 'U', 'L', 'T' }; + static const char_type LAST_SUBMATCH_RESULT_ALT[] = { '^', 'N' }; + + if(have_brace && (*m_position == '^')) + ++m_position; + + int max_len = m_end - m_position; + + if((max_len >= 5) && std::equal(m_position, m_position + 5, MATCH)) + { + m_position += 5; + if(have_brace) + { + if(*m_position == '}') + ++m_position; + else + { + m_position -= 5; + return false; + } + } + put(this->m_results[0]); + return true; + } + if((max_len >= 8) && std::equal(m_position, m_position + 8, PREMATCH)) + { + m_position += 8; + if(have_brace) + { + if(*m_position == '}') + ++m_position; + else + { + m_position -= 8; + return false; + } + } + put(this->m_results.prefix()); + return true; + } + if((max_len >= 9) && std::equal(m_position, m_position + 9, POSTMATCH)) + { + m_position += 9; + if(have_brace) + { + if(*m_position == '}') + ++m_position; + else + { + m_position -= 9; + return false; + } + } + put(this->m_results.suffix()); + return true; + } + if((max_len >= 16) && std::equal(m_position, m_position + 16, LAST_PAREN_MATCH)) + { + m_position += 16; + if(have_brace) + { + if(*m_position == '}') + ++m_position; + else + { + m_position -= 16; + return false; + } + } + put((this->m_results)[this->m_results.size() > 1 ? 
this->m_results.size() - 1 : 1]); + return true; + } + if((max_len >= 20) && std::equal(m_position, m_position + 20, LAST_SUBMATCH_RESULT)) + { + m_position += 20; + if(have_brace) + { + if(*m_position == '}') + ++m_position; + else + { + m_position -= 20; + return false; + } + } + put(this->m_results.get_last_closed_paren()); + return true; + } + if((max_len >= 2) && std::equal(m_position, m_position + 2, LAST_SUBMATCH_RESULT_ALT)) + { + m_position += 2; + if(have_brace) + { + if(*m_position == '}') + ++m_position; + else + { + m_position -= 2; + return false; + } + } + put(this->m_results.get_last_closed_paren()); + return true; + } + return false; +} + template void basic_regex_formatter::format_escape() { @@ -440,9 +581,35 @@ void basic_regex_formatter::format_conditional( put(static_cast('?')); return; } - std::ptrdiff_t len = ::boost::re_detail::distance(m_position, m_end); - len = (std::min)(static_cast(2), len); - int v = m_traits.toi(m_position, m_position + len, 10); + int v; + if(*m_position == '{') + { + const char_type* base = m_position; + ++m_position; + v = m_traits.toi(m_position, m_end, 10); + if(v < 0) + { + // Try a named subexpression: + while((m_position != m_end) && (*m_position != '}')) + ++m_position; + v = m_results.named_subexpression_index(base + 1, m_position); + } + if((v < 0) || (*m_position != '}')) + { + m_position = base; + // oops trailing '?': + put(static_cast('?')); + return; + } + // Skip trailing '}': + ++m_position; + } + else + { + std::ptrdiff_t len = ::boost::re_detail::distance(m_position, m_end); + len = (std::min)(static_cast(2), len); + v = m_traits.toi(m_position, m_position + len, 10); + } if(v < 0) { // oops not a number: diff --git a/include/boost/regex/v4/regex_traits_defaults.hpp b/include/boost/regex/v4/regex_traits_defaults.hpp index 42428dd8..5b2c6bc3 100644 --- a/include/boost/regex/v4/regex_traits_defaults.hpp +++ b/include/boost/regex/v4/regex_traits_defaults.hpp @@ -159,7 +159,7 @@ struct 
character_pointer_range template int get_default_class_id(const charT* p1, const charT* p2) { - static const charT data[72] = { + static const charT data[73] = { 'a', 'l', 'n', 'u', 'm', 'a', 'l', 'p', 'h', 'a', 'b', 'l', 'a', 'n', 'k', @@ -172,11 +172,12 @@ int get_default_class_id(const charT* p1, const charT* p2) 's', 'p', 'a', 'c', 'e', 'u', 'n', 'i', 'c', 'o', 'd', 'e', 'u', 'p', 'p', 'e', 'r', + 'v', 'w', 'o', 'r', 'd', 'x', 'd', 'i', 'g', 'i', 't', }; - static const character_pointer_range ranges[19] = + static const character_pointer_range ranges[21] = { {data+0, data+5,}, // alnum {data+5, data+10,}, // alpha @@ -185,6 +186,7 @@ int get_default_class_id(const charT* p1, const charT* p2) {data+20, data+21,}, // d {data+20, data+25,}, // digit {data+25, data+30,}, // graph + {data+29, data+30,}, // h {data+30, data+31,}, // l {data+30, data+35,}, // lower {data+35, data+40,}, // print @@ -194,9 +196,10 @@ int get_default_class_id(const charT* p1, const charT* p2) {data+57, data+58,}, // u {data+50, data+57,}, // unicode {data+57, data+62,}, // upper - {data+62, data+63,}, // w - {data+62, data+66,}, // word - {data+66, data+72,}, // xdigit + {data+62, data+63,}, // v + {data+63, data+64,}, // w + {data+63, data+67,}, // word + {data+67, data+73,}, // xdigit }; static const character_pointer_range* ranges_begin = ranges; static const character_pointer_range* ranges_end = ranges + (sizeof(ranges)/sizeof(ranges[0])); @@ -314,6 +317,43 @@ int global_toi(const charT*& p1, const charT* p2, int radix, const traits& t) return result; } +template +inline const charT* get_escape_R_string() +{ +#ifdef BOOST_MSVC +# pragma warning(push) +# pragma warning(disable:4309) +#endif + static const charT e1[] = { '(', '?', '>', '\x0D', '\x0A', '?', + '|', '[', '\x0A', '\x0B', '\x0C', '\x85', '\\', 'x', '{', '2', '0', '2', '8', '}', + '\\', 'x', '{', '2', '0', '2', '9', '}', ']', ')', '\0' }; + static const charT e2[] = { '(', '?', '>', '\x0D', '\x0A', '?', + '|', '[', '\x0A', 
'\x0B', '\x0C', '\x85', ']', ')', '\0' }; + + charT c = static_cast(0x2029u); + bool b = (static_cast(c) == 0x2029u); + + return (b ? e1 : e2); +#ifdef BOOST_MSVC +# pragma warning(pop) +#endif +} + +template <> +inline const char* get_escape_R_string() +{ +#ifdef BOOST_MSVC +# pragma warning(push) +# pragma warning(disable:4309) +#endif + static const char e2[] = { '(', '?', '>', '\x0D', '\x0A', '?', + '|', '[', '\x0A', '\x0B', '\x0C', '\x85', ']', ')', '\0' }; + return e2; +#ifdef BOOST_MSVC +# pragma warning(pop) +#endif +} + } // re_detail } // boost diff --git a/include/boost/regex/v4/regex_workaround.hpp b/include/boost/regex/v4/regex_workaround.hpp index fc3c2123..06527f1a 100644 --- a/include/boost/regex/v4/regex_workaround.hpp +++ b/include/boost/regex/v4/regex_workaround.hpp @@ -124,7 +124,7 @@ inline void pointer_construct(T* p, const T& t) #ifdef __cplusplus namespace boost{ namespace re_detail{ -#if BOOST_WORKAROUND(BOOST_MSVC,>=1400) && defined(_CPPLIB_VER) && defined(BOOST_DINKUMWARE_STDLIB) && !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) +#if BOOST_WORKAROUND(BOOST_MSVC,>=1400) && BOOST_WORKAROUND(BOOST_MSVC, <1600) && defined(_CPPLIB_VER) && defined(BOOST_DINKUMWARE_STDLIB) && !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) // // MSVC 8 will either emit warnings or else refuse to compile // code that makes perfectly legitimate use of std::copy, when diff --git a/include/boost/regex/v4/states.hpp b/include/boost/regex/v4/states.hpp index 44dd2b4a..efdebbe5 100644 --- a/include/boost/regex/v4/states.hpp +++ b/include/boost/regex/v4/states.hpp @@ -118,7 +118,9 @@ enum syntax_element_type syntax_element_backstep = syntax_element_long_set_rep + 1, // an assertion that a mark was matched: syntax_element_assert_backref = syntax_element_backstep + 1, - syntax_element_toggle_case = syntax_element_assert_backref + 1 + syntax_element_toggle_case = syntax_element_assert_backref + 1, + // a recursive expression: + syntax_element_recurse = 
syntax_element_toggle_case + 1 }; #ifdef BOOST_REGEX_DEBUG @@ -156,6 +158,7 @@ struct re_brace : public re_syntax_base // The index to match, can be zero (don't mark the sub-expression) // or negative (for perl style (?...) extentions): int index; + bool icase; }; /*** struct re_dot ************************************************** diff --git a/include/boost/regex/v4/syntax_type.hpp b/include/boost/regex/v4/syntax_type.hpp index 92c00d4c..3efdf0b0 100644 --- a/include/boost/regex/v4/syntax_type.hpp +++ b/include/boost/regex/v4/syntax_type.hpp @@ -92,8 +92,11 @@ static const escape_syntax_type escape_type_G = 52; / static const escape_syntax_type escape_type_property = 54; // for \p static const escape_syntax_type escape_type_not_property = 55; // for \P static const escape_syntax_type escape_type_named_char = 56; // for \N +static const escape_syntax_type escape_type_extended_backref = 57; // for \g +static const escape_syntax_type escape_type_reset_start_mark = 58; // for \K +static const escape_syntax_type escape_type_line_ending = 59; // for \R -static const escape_syntax_type syntax_max = 57; +static const escape_syntax_type syntax_max = 60; } } diff --git a/include/boost/regex/v4/w32_regex_traits.hpp b/include/boost/regex/v4/w32_regex_traits.hpp index 21a9694a..d5562072 100644 --- a/include/boost/regex/v4/w32_regex_traits.hpp +++ b/include/boost/regex/v4/w32_regex_traits.hpp @@ -294,6 +294,8 @@ public: typedef typename w32_regex_traits::char_class_type char_class_type; BOOST_STATIC_CONSTANT(char_class_type, mask_word = 0x0400); // must be C1_DEFINED << 1 BOOST_STATIC_CONSTANT(char_class_type, mask_unicode = 0x0800); // must be C1_DEFINED << 2 + BOOST_STATIC_CONSTANT(char_class_type, mask_horizontal = 0x1000); // must be C1_DEFINED << 3 + BOOST_STATIC_CONSTANT(char_class_type, mask_vertical = 0x2000); // must be C1_DEFINED << 4 BOOST_STATIC_CONSTANT(char_class_type, mask_base = 0x3ff); // all the masks used by the CT_CTYPE1 group typedef std::basic_string 
string_type; @@ -510,7 +512,7 @@ template typename w32_regex_traits_implementation::char_class_type w32_regex_traits_implementation::lookup_classname_imp(const charT* p1, const charT* p2) const { - static const char_class_type masks[20] = + static const char_class_type masks[22] = { 0, 0x0104u, // C1_ALPHA | C1_DIGIT @@ -520,6 +522,7 @@ typename w32_regex_traits_implementation::char_class_type 0x0004u, // C1_DIGIT 0x0004u, // C1_DIGIT (~(0x0020u|0x0008u|0x0040) & 0x01ffu) | 0x0400u, // not C1_CNTRL or C1_SPACE or C1_BLANK + w32_regex_traits_implementation::mask_horizontal, 0x0002u, // C1_LOWER 0x0002u, // C1_LOWER (~0x0020u & 0x01ffu) | 0x0400, // not C1_CNTRL @@ -529,6 +532,7 @@ typename w32_regex_traits_implementation::char_class_type 0x0001u, // C1_UPPER w32_regex_traits_implementation::mask_unicode, 0x0001u, // C1_UPPER + w32_regex_traits_implementation::mask_vertical, 0x0104u | w32_regex_traits_implementation::mask_word, 0x0104u | w32_regex_traits_implementation::mask_word, 0x0080u, // C1_XDIGIT @@ -628,6 +632,12 @@ public: return true; else if((f & re_detail::w32_regex_traits_implementation::mask_word) && (c == '_')) return true; + else if((f & re_detail::w32_regex_traits_implementation::mask_vertical) + && (::boost::re_detail::is_separator(c) || (c == '\v'))) + return true; + else if((f & re_detail::w32_regex_traits_implementation::mask_horizontal) + && this->isctype(c, 0x0008u) && !this->isctype(c, re_detail::w32_regex_traits_implementation::mask_vertical)) + return true; return false; } int toi(const charT*& p1, const charT* p2, int radix)const diff --git a/src/c_regex_traits.cpp b/src/c_regex_traits.cpp index da960eb0..6466bc42 100644 --- a/src/c_regex_traits.cpp +++ b/src/c_regex_traits.cpp @@ -122,7 +122,9 @@ enum char_class_graph=char_class_alnum|char_class_punct, char_class_blank=1<<9, char_class_word=1<<10, - char_class_unicode=1<<11 + char_class_unicode=1<<11, + char_class_horizontal=1<<12, + char_class_vertical=1<<13 }; 
c_regex_traits::char_class_type BOOST_REGEX_CALL c_regex_traits::lookup_classname(const char* p1, const char* p2) @@ -137,6 +139,7 @@ c_regex_traits::char_class_type BOOST_REGEX_CALL c_regex_traits::loo char_class_digit, char_class_digit, char_class_graph, + char_class_horizontal, char_class_lower, char_class_lower, char_class_print, @@ -146,6 +149,7 @@ c_regex_traits::char_class_type BOOST_REGEX_CALL c_regex_traits::loo char_class_upper, char_class_unicode, char_class_upper, + char_class_vertical, char_class_alnum | char_class_word, char_class_alnum | char_class_word, char_class_xdigit, @@ -176,7 +180,9 @@ bool BOOST_REGEX_CALL c_regex_traits::isctype(char c, char_class_type mask || ((mask & char_class_punct) && (std::ispunct)(static_cast(c))) || ((mask & char_class_xdigit) && (std::isxdigit)(static_cast(c))) || ((mask & char_class_blank) && (std::isspace)(static_cast(c)) && !::boost::re_detail::is_separator(c)) - || ((mask & char_class_word) && (c == '_')); + || ((mask & char_class_word) && (c == '_')) + || ((mask & char_class_vertical) && (::boost::re_detail::is_separator(c) || (c == '\v'))) + || ((mask & char_class_horizontal) && (std::isspace)(static_cast(c)) && !::boost::re_detail::is_separator(c) && (c != '\v')); } c_regex_traits::string_type BOOST_REGEX_CALL c_regex_traits::lookup_collatename(const char* p1, const char* p2) diff --git a/src/icu.cpp b/src/icu.cpp index e06c3176..a815e915 100644 --- a/src/icu.cpp +++ b/src/icu.cpp @@ -101,6 +101,8 @@ const icu_regex_traits::char_class_type icu_regex_traits::mask_underscore = icu_ const icu_regex_traits::char_class_type icu_regex_traits::mask_unicode = icu_regex_traits::char_class_type(1) << offset_unicode; const icu_regex_traits::char_class_type icu_regex_traits::mask_any = icu_regex_traits::char_class_type(1) << offset_any; const icu_regex_traits::char_class_type icu_regex_traits::mask_ascii = icu_regex_traits::char_class_type(1) << offset_ascii; +const icu_regex_traits::char_class_type 
icu_regex_traits::mask_horizontal = icu_regex_traits::char_class_type(1) << offset_horizontal; +const icu_regex_traits::char_class_type icu_regex_traits::mask_vertical = icu_regex_traits::char_class_type(1) << offset_vertical; icu_regex_traits::char_class_type icu_regex_traits::lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2) { @@ -370,6 +372,7 @@ icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_ U_GC_ND_MASK, U_GC_ND_MASK, (0x3FFFFFFFu) & ~(U_GC_CC_MASK | U_GC_CF_MASK | U_GC_CS_MASK | U_GC_CN_MASK | U_GC_Z_MASK), + mask_horizontal, U_GC_LL_MASK, U_GC_LL_MASK, ~(U_GC_C_MASK), @@ -379,6 +382,7 @@ icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_ U_GC_LU_MASK, mask_unicode, U_GC_LU_MASK, + mask_vertical, char_class_type(U_GC_L_MASK | U_GC_ND_MASK | U_GC_MN_MASK) | mask_underscore, char_class_type(U_GC_L_MASK | U_GC_ND_MASK | U_GC_MN_MASK) | mask_underscore, char_class_type(U_GC_ND_MASK) | mask_xdigit, @@ -487,6 +491,10 @@ bool icu_regex_traits::isctype(char_type c, char_class_type f) const return true; if(((f & mask_ascii) != 0) && (c <= 0x7F)) return true; + if(((f & mask_vertical) != 0) && (::boost::re_detail::is_separator(c) || (c == static_cast('\v')) || (m == U_GC_ZL_MASK) || (m == U_GC_ZP_MASK))) + return true; + if(((f & mask_horizontal) != 0) && !::boost::re_detail::is_separator(c) && u_isspace(c) && (c != static_cast('\v'))) + return true; return false; } diff --git a/src/regex_traits_defaults.cpp b/src/regex_traits_defaults.cpp index 8f76c09e..c9596a3d 100644 --- a/src/regex_traits_defaults.cpp +++ b/src/regex_traits_defaults.cpp @@ -100,6 +100,9 @@ BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants "p", "P", "N", + "g", + "K", + "R", }; return ((n >= (sizeof(messages) / sizeof(messages[1]))) ? 
"" : messages[n]); @@ -375,14 +378,14 @@ BOOST_REGEX_DECL regex_constants::escape_syntax_type BOOST_REGEX_CALL get_defaul regex_constants::escape_type_not_class, /*H*/ regex_constants::escape_type_not_class, /*I*/ regex_constants::escape_type_not_class, /*J*/ - regex_constants::escape_type_not_class, /*K*/ + regex_constants::escape_type_reset_start_mark, /*K*/ regex_constants::escape_type_not_class, /*L*/ regex_constants::escape_type_not_class, /*M*/ regex_constants::escape_type_named_char, /*N*/ regex_constants::escape_type_not_class, /*O*/ regex_constants::escape_type_not_property, /*P*/ regex_constants::escape_type_Q, /*Q*/ - regex_constants::escape_type_not_class, /*R*/ + regex_constants::escape_type_line_ending, /*R*/ regex_constants::escape_type_not_class, /*S*/ regex_constants::escape_type_not_class, /*T*/ regex_constants::escape_type_not_class, /*U*/ @@ -403,11 +406,11 @@ BOOST_REGEX_DECL regex_constants::escape_syntax_type BOOST_REGEX_CALL get_defaul regex_constants::escape_type_class, /*d*/ regex_constants::escape_type_e, /*e*/ regex_constants::escape_type_control_f, /*f*/ - regex_constants::escape_type_class, /*g*/ + regex_constants::escape_type_extended_backref, /*g*/ regex_constants::escape_type_class, /*h*/ regex_constants::escape_type_class, /*i*/ regex_constants::escape_type_class, /*j*/ - regex_constants::escape_type_class, /*k*/ + regex_constants::escape_type_extended_backref, /*k*/ regex_constants::escape_type_class, /*l*/ regex_constants::escape_type_class, /*m*/ regex_constants::escape_type_control_n, /*n*/ @@ -534,7 +537,7 @@ BOOST_REGEX_DECL regex_constants::syntax_type BOOST_REGEX_CALL get_default_synta regex_constants::syntax_dollar, /*$*/ regex_constants::syntax_char, /*%*/ regex_constants::syntax_char, /*&*/ - regex_constants::syntax_char, /*'*/ + regex_constants::escape_type_end_buffer, /*'*/ regex_constants::syntax_open_mark, /*(*/ regex_constants::syntax_close_mark, /*)*/ regex_constants::syntax_star, /***/ diff --git 
a/src/usinstances.cpp b/src/usinstances.cpp index 56653662..ca7b1dc9 100644 --- a/src/usinstances.cpp +++ b/src/usinstances.cpp @@ -23,7 +23,8 @@ #include #if defined(_DLL_CPPLIB) && !defined(_M_CEE_PURE) && defined(_NATIVE_WCHAR_T_DEFINED) \ - && !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION) || defined(__STD_RWCOMPILER_H__) || defined(_RWSTD_VER)) + && !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION) || defined(__STD_RWCOMPILER_H__) || defined(_RWSTD_VER))\ + && BOOST_WORKAROUND(BOOST_MSVC, <1600) // // This is a horrible workaround, but without declaring these symbols extern we get // duplicate symbol errors when linking if the application is built without diff --git a/src/wc_regex_traits.cpp b/src/wc_regex_traits.cpp index 3640f292..fb622b5a 100644 --- a/src/wc_regex_traits.cpp +++ b/src/wc_regex_traits.cpp @@ -24,7 +24,8 @@ #include #if defined(_DLL_CPPLIB) && !defined(_M_CEE_PURE) && defined(_NATIVE_WCHAR_T_DEFINED) \ - && !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION) || defined(__STD_RWCOMPILER_H__) || defined(_RWSTD_VER)) + && !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION) || defined(__STD_RWCOMPILER_H__) || defined(_RWSTD_VER))\ + && BOOST_WORKAROUND(BOOST_MSVC, <1600) // // This is a horrible workaround, but without declaring these symbols extern we get // duplicate symbol errors when linking if the application is built without @@ -161,7 +162,9 @@ enum char_class_graph=char_class_alnum|char_class_punct, char_class_blank=1<<9, char_class_word=1<<10, - char_class_unicode=1<<11 + char_class_unicode=1<<11, + char_class_horizontal=1<<12, + char_class_vertical=1<<13 }; c_regex_traits::char_class_type BOOST_REGEX_CALL c_regex_traits::lookup_classname(const wchar_t* p1, const wchar_t* p2) @@ -176,6 +179,7 @@ c_regex_traits::char_class_type BOOST_REGEX_CALL c_regex_traits::char_class_type BOOST_REGEX_CALL c_regex_traits::isctype(wchar_t c, char_class_typ || ((mask & char_class_xdigit) && (std::iswxdigit)(c)) || ((mask & 
char_class_blank) && (std::iswspace)(c) && !::boost::re_detail::is_separator(c)) || ((mask & char_class_word) && (c == '_')) - || ((mask & char_class_unicode) && (c & ~static_cast(0xff))); + || ((mask & char_class_unicode) && (c & ~static_cast(0xff))) + || ((mask & char_class_vertical) && (::boost::re_detail::is_separator(c) || (c == L'\v'))) + || ((mask & char_class_horizontal) && (std::iswspace)(c) && !::boost::re_detail::is_separator(c) && (c != L'\v')); } c_regex_traits::string_type BOOST_REGEX_CALL c_regex_traits::lookup_collatename(const wchar_t* p1, const wchar_t* p2) diff --git a/test/Jamfile.v2 b/test/Jamfile.v2 index 4d1a2967..40847731 100644 --- a/test/Jamfile.v2 +++ b/test/Jamfile.v2 @@ -87,6 +87,10 @@ test-suite regex ../build//boost_regex ] + [ run named_subexpressions/named_subexpressions_test.cpp + ../build//boost_regex + ] + [ run unicode/unicode_iterator_test.cpp ../build//boost_regex ] [ run static_mutex/static_mutex_test.cpp ../../thread/build//boost_thread ../build//boost_regex diff --git a/test/named_subexpressions/named_subexpressions_test.cpp b/test/named_subexpressions/named_subexpressions_test.cpp new file mode 100644 index 00000000..41011415 --- /dev/null +++ b/test/named_subexpressions/named_subexpressions_test.cpp @@ -0,0 +1,109 @@ +/* + * + * Copyright (c) 2009 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. 
(See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + +#include +#include + + +template +void test_named_subexpressions(charT) +{ + // + // Really this is just a test that the overloaded access functions work correctly: + // + static const charT e[] = + { + '(', '?', '\'', 'o', 'n', 'e', '\'', 'a', '+', ')', '(', '?', '<', 't', 'w', 'o', '>', 'b', '+', ')', '\0' + }; + static const charT t[] = + { + 'm', 'm', 'a', 'a', 'a', 'b', 'b', 'n', 'n', '\0' + }; + static const charT one[] = + { + 'o', 'n', 'e', '\0' + }; + static const charT two[] = + { + 't', 'w', 'o', '\0' + }; + static const std::basic_string s_one(one); + static const std::basic_string s_two(two); + static const charT result1[] = { 'a', 'a', 'a', '\0' }; + static const charT result2[] = { 'b', 'b', '\0' }; + static const std::basic_string s_result1(result1); + static const std::basic_string s_result2(result2); + + static const char* c_one = "one"; + static const char* c_two = "two"; + static const std::string cs_one(c_one); + static const std::string cs_two(c_two); + + boost::basic_regex expression(e); + boost::match_results what; + if(regex_search(t, what, expression)) + { + BOOST_CHECK(what.length(1) == 3); + BOOST_CHECK(what.length(one) == 3); + BOOST_CHECK(what.length(s_one) == 3); + BOOST_CHECK(what.length(c_one) == 3); + BOOST_CHECK(what.length(cs_one) == 3); + BOOST_CHECK(what.position(1) == 2); + BOOST_CHECK(what.position(one) == 2); + BOOST_CHECK(what.position(s_one) == 2); + BOOST_CHECK(what.position(c_one) == 2); + BOOST_CHECK(what.position(cs_one) == 2); + BOOST_CHECK(what.str(1) == s_result1); + BOOST_CHECK(what.str(one) == s_result1); + BOOST_CHECK(what.str(s_one) == s_result1); + BOOST_CHECK(what.str(c_one) == s_result1); + BOOST_CHECK(what.str(cs_one) == s_result1); + BOOST_CHECK(what[1] == s_result1); + BOOST_CHECK(what[one] == s_result1); + BOOST_CHECK(what[s_one] == s_result1); + BOOST_CHECK(what[c_one] == s_result1); + 
BOOST_CHECK(what[cs_one] == s_result1); + + BOOST_CHECK(what.length(2) == 2); + BOOST_CHECK(what.length(two) == 2); + BOOST_CHECK(what.length(s_two) == 2); + BOOST_CHECK(what.length(c_two) == 2); + BOOST_CHECK(what.length(cs_two) == 2); + BOOST_CHECK(what.position(2) == 5); + BOOST_CHECK(what.position(two) == 5); + BOOST_CHECK(what.position(s_two) == 5); + BOOST_CHECK(what.position(c_two) == 5); + BOOST_CHECK(what.position(cs_two) == 5); + BOOST_CHECK(what.str(2) == s_result2); + BOOST_CHECK(what.str(two) == s_result2); + BOOST_CHECK(what.str(s_two) == s_result2); + BOOST_CHECK(what.str(c_two) == s_result2); + BOOST_CHECK(what.str(cs_two) == s_result2); + BOOST_CHECK(what[2] == s_result2); + BOOST_CHECK(what[two] == s_result2); + BOOST_CHECK(what[s_two] == s_result2); + BOOST_CHECK(what[c_two] == s_result2); + BOOST_CHECK(what[cs_two] == s_result2); + } + else + { + BOOST_ERROR("Expected match not found"); + } +} + +int test_main( int , char* [] ) +{ + test_named_subexpressions(char(0)); + test_named_subexpressions(wchar_t(0)); + return 0; +} + +#include diff --git a/test/regress/main.cpp b/test/regress/main.cpp index ffbe5efe..e741db50 100644 --- a/test/regress/main.cpp +++ b/test/regress/main.cpp @@ -70,6 +70,9 @@ void run_tests() RUN_TESTS(test_operators); RUN_TESTS(test_overloads); RUN_TESTS(test_unicode); + RUN_TESTS(test_pocessive_repeats); + RUN_TESTS(test_mark_resets); + RUN_TESTS(test_recursion); } int cpp_main(int /*argc*/, char * /*argv*/[]) diff --git a/test/regress/test.hpp b/test/regress/test.hpp index 046b2d56..9e271288 100644 --- a/test/regress/test.hpp +++ b/test/regress/test.hpp @@ -258,6 +258,8 @@ void test_emacs(); void test_operators(); void test_overloads(); void test_unicode(); - +void test_pocessive_repeats(); +void test_mark_resets(); +void test_recursion(); #endif diff --git a/test/regress/test_backrefs.cpp b/test/regress/test_backrefs.cpp index 3702b486..58f4dedb 100644 --- a/test/regress/test_backrefs.cpp +++ 
b/test/regress/test_backrefs.cpp @@ -55,5 +55,53 @@ void test_backrefs() TEST_REGEX_SEARCH("\\(a\\)\\1bc*[ce]d", basic, "aabcccd", match_default, make_array(0, 7, 0, 1, -2, -2)); TEST_REGEX_SEARCH("^\\(a\\)\\1b\\(c\\)*cd$", basic, "aabcccd", match_default, make_array(0, 7, 0, 1, 4, 5, -2, -2)); TEST_REGEX_SEARCH("\\(ab*\\)[ab]*\\1", basic, "ababaaa", match_default, make_array(0, 7, 0, 1, -2, -2)); + // + // Now test the \g version: + // + TEST_INVALID_REGEX("a(b)\\g2c", perl); + TEST_INVALID_REGEX("a(b\\g1)c", perl); + TEST_INVALID_REGEX("a(b\\g0)c", perl); + TEST_REGEX_SEARCH("a(b*)c\\g1d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b*)c\\g1d", perl, "abbcbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a(b*)c\\g1d", perl, "abbcbbbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(.)\\g1", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a([bc])\\g1d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); + TEST_INVALID_REGEX("a(b)\\g{2}c", perl); + TEST_INVALID_REGEX("a(b\\g{1})c", perl); + TEST_INVALID_REGEX("a(b\\g{0})c", perl); + TEST_REGEX_SEARCH("a(b*)c\\g{1}d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b*)c\\g{1}d", perl, "abbcbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a(b*)c\\g{1}d", perl, "abbcbbbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(.)\\g{1}", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a([bc])\\g{1}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); + // And again but with negative indexes: + TEST_INVALID_REGEX("a(b)\\g-2c", perl); + TEST_INVALID_REGEX("a(b\\g-1)c", perl); + TEST_INVALID_REGEX("a(b\\g-0)c", perl); + TEST_REGEX_SEARCH("a(b*)c\\g-1d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b*)c\\g-1d", perl, "abbcbd", match_default, make_array(-2, -2)); + 
TEST_REGEX_SEARCH("a(b*)c\\g-1d", perl, "abbcbbbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(.)\\g1", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a([bc])\\g1d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); + TEST_INVALID_REGEX("a(b)\\g{-2}c", perl); + TEST_INVALID_REGEX("a(b\\g{-1})c", perl); + TEST_REGEX_SEARCH("a(b*)c\\g{-1}d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b*)c\\g{-1}d", perl, "abbcbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a(b*)c\\g{-1}d", perl, "abbcbbbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(.)\\g{-1}", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a([bc])\\g{-1}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); + + // And again but with named subexpressions: + TEST_REGEX_SEARCH("a(?(?(?(?b*))))c\\g{foo}d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, 1, 3, 1, 3, 1, 3, -2, -2)); + TEST_REGEX_SEARCH("a(?(?(?(?b*))))c\\g{foo}d", perl, "abbcbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a(?(?(?(?b*))))c\\g{foo}d", perl, "abbcbbbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?.)\\g{foo}", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a(?[bc])\\g{foo}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, -2)); + + TEST_REGEX_SEARCH("a(?'foo'(?'bar'(?'bb'(?'aa'b*))))c\\g{foo}d", perl, "abbcbbd", match_default, make_array(0, 7, 1, 3, 1, 3, 1, 3, 1, 3, -2, -2)); + TEST_REGEX_SEARCH("a(?'foo'(?'bar'(?'bb'(?'aa'b*))))c\\g{foo}d", perl, "abbcbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a(?'foo'(?'bar'(?'bb'(?'aa'b*))))c\\g{foo}d", perl, "abbcbbbd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?'foo'.)\\g{foo}", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a(?'foo'[bc])\\g{foo}d", perl, "abcdabbd", match_default, make_array(4, 8, 5, 6, -2, 
-2)); } diff --git a/test/regress/test_deprecated.cpp b/test/regress/test_deprecated.cpp index af50ec76..84eefb9f 100644 --- a/test/regress/test_deprecated.cpp +++ b/test/regress/test_deprecated.cpp @@ -107,7 +107,7 @@ void test_deprecated(const char&, const test_regex_search_tag&) int i = 0; while(results[2*i] != -2) { - if(max_subs > i) + if((int)max_subs > i) { if(results[2*i] != matches[i].rm_so) { @@ -231,7 +231,7 @@ void test_deprecated(const wchar_t&, const test_regex_search_tag&) int i = 0; while(results[2*i] != -2) { - if(max_subs > i) + if((int)max_subs > i) { if(results[2*i] != matches[i].rm_so) { diff --git a/test/regress/test_escapes.cpp b/test/regress/test_escapes.cpp index ba78c454..d2dbbe43 100644 --- a/test/regress/test_escapes.cpp +++ b/test/regress/test_escapes.cpp @@ -144,5 +144,23 @@ void test_assertion_escapes() TEST_REGEX_SEARCH("a\\Gbc", perl, "abc", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a\\Aab", perl, "abc", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("abc(?:\\Z|$)", perl, "abc\n\n", match_default, make_array(0, 3, -2, -2)); + + // Buffer reset \K: + TEST_REGEX_SEARCH("(foo)\\Kbar", perl, "foobar", match_default, make_array(3, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(foo)(\\Kbar|baz)", perl, "foobar", match_default, make_array(3, 6, 0, 3, 3, 6, -2, -2)); + TEST_REGEX_SEARCH("(foo)(\\Kbar|baz)", perl, "foobaz", match_default, make_array(0, 6, 0, 3, 3, 6, -2, -2)); + TEST_REGEX_SEARCH("(foo\\Kbar)baz", perl, "foobarbaz", match_default, make_array(3, 9, 0, 6, -2, -2)); + + // Line ending \R: + TEST_REGEX_SEARCH("\\R", perl, "foo\nbar", match_default, make_array(3, 4, -2, -2)); + TEST_REGEX_SEARCH("\\R", perl, "foo\rbar", match_default, make_array(3, 4, -2, -2)); + TEST_REGEX_SEARCH("\\R", perl, "foo\r\nbar", match_default, make_array(3, 5, -2, -2)); + // see if \u works: + const wchar_t* w = L"\u2028"; + if(*w == 0x2028u) + { + TEST_REGEX_SEARCH_W(L"\\R", perl, L"foo\u2028bar", match_default, make_array(3, 4, -2, 
-2)); + TEST_REGEX_SEARCH_W(L"\\R", perl, L"foo\u2029bar", match_default, make_array(3, 4, -2, -2)); + } } diff --git a/test/regress/test_non_greedy_repeats.cpp b/test/regress/test_non_greedy_repeats.cpp index 5cf507d2..3196f5ac 100644 --- a/test/regress/test_non_greedy_repeats.cpp +++ b/test/regress/test_non_greedy_repeats.cpp @@ -41,6 +41,5 @@ void test_non_greedy_repeats() TEST_REGEX_SEARCH("xx[/-]{0,2}?(?:[+-][0-9])??\\z", perl, "xx--", match_default, make_array(0, 4, -2, -2)); TEST_INVALID_REGEX("a{1,3}{1}", perl); TEST_INVALID_REGEX("a**", perl); - TEST_INVALID_REGEX("a++", perl); } diff --git a/test/regress/test_perl_ex.cpp b/test/regress/test_perl_ex.cpp index 6aa31684..3c58fa42 100644 --- a/test/regress/test_perl_ex.cpp +++ b/test/regress/test_perl_ex.cpp @@ -651,3 +651,246 @@ void test_options3() #endif } +void test_mark_resets() +{ + using namespace boost::regex_constants; + + TEST_REGEX_SEARCH("(?|(abc)|(xyz))", perl, "abc", match_default, make_array(0, 3, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))", perl, "xyz", match_default, make_array(0, 3, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(x)(?|(abc)|(xyz))(x)", perl, "xabcx", match_default, make_array(0, 5, 0, 1, 1, 4, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("(x)(?|(abc)|(xyz))(x)", perl, "xxyzx", match_default, make_array(0, 5, 0, 1, 1, 4, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("(x)(?|(abc)(pqr)|(xyz))(x)", perl, "xabcpqrx", match_default, make_array(0, 8, 0, 1, 1, 4, 4, 7, 7, 8, -2, -2)); + TEST_REGEX_SEARCH("(x)(?|(abc)(pqr)|(xyz))(x)", perl, "xxyzx", match_default, make_array(0, 5, 0, 1, 1, 4, -1, -1, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))\\1", perl, "abcabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))\\1", perl, "xyzxyz", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))\\1", perl, "abcxyz", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))\\1", perl, "xyzabc", match_default, make_array(-2, 
-2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "abcabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzxyz", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^X(?5)(a)(?|(b)|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_INVALID_REGEX("^X(?5)(a)(?|(b)|(q))(c)(d)Y", perl); + TEST_REGEX_SEARCH("^X(?&N)(a)(?|(b)|(q))(c)(d)(?Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b)|(q)(r)(s))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(r)(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(?|(r)|(t))(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); +} + +void test_recursion() +{ + using namespace boost::regex_constants; + + TEST_INVALID_REGEX("(a(?2)b)", perl); + TEST_INVALID_REGEX("(a(?1b))", perl); + TEST_REGEX_SEARCH("(a(?1)b)", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(a(?1)+b)", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^([^()]|\\((?1)*\\))*$", perl, "abc", match_default, make_array(0, 3, 2, 3, -2, -2)); + TEST_REGEX_SEARCH("^([^()]|\\((?1)*\\))*$", perl, "a(b)c", match_default, make_array(0, 5, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("^([^()]|\\((?1)*\\))*$", perl, "a(b(c))d", match_default, make_array(0, 8, 7, 8, -2, -2)); + TEST_REGEX_SEARCH("^([^()]|\\((?1)*\\))*$", perl, "a(b(c)d", match_default, make_array(-2, -2)); + 
TEST_REGEX_SEARCH("^>abc>([^()]|\\((?1)*\\))*abc>123abc>([^()]|\\((?1)*\\))*abc>1(2)3abc>([^()]|\\((?1)*\\))*abc>(1(2)3)P)", perl|icase, "abcPXP123", match_default, make_array(3, 6, 5, 6, -2, -2)); + TEST_REGEX_SEARCH("(abc)(?i:(?1))", perl|icase, "defabcabcxyz", match_default, make_array(3, 9, 3, 6, -2, -2)); + TEST_REGEX_SEARCH("(abc)(?i:(?1))", perl, "DEFabcABCXYZ", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(abc)(?i:(?1)abc)", perl, "DEFabcABCABCXYZ", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(abc)(?:(?i)(?1))", perl, "defabcabcxyz", match_default, make_array(3, 9, 3, 6, -2, -2)); + TEST_REGEX_SEARCH("(abc)(?:(?i)(?1))", perl, "DEFabcABCXYZ", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "abcabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzabc", match_default, make_array(0, 6, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?|(abc)|(xyz))(?1)", perl, "xyzxyz", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?1)[]a()b](abc)", perl, "abcbabc", match_default, make_array(0, 7, 4, 7, -2, -2)); + TEST_REGEX_SEARCH("(?1)[]a()b](abc)", perl, "abcXabc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?1)[^]a()b](abc)", perl, "abcXabc", match_default, make_array(0, 7, 4, 7, -2, -2)); + TEST_REGEX_SEARCH("(?1)[^]a()b](abc)", perl, "abcbabc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?2)[]a()b](abc)(xyz)", perl, "xyzbabcxyz", match_default, make_array(0, 10, 4, 7, 7, 10, -2, -2)); + TEST_REGEX_SEARCH("^X(?5)(a)(?|(b)|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_INVALID_REGEX("^X(?5)(a)(?|(b)|(q))(c)(d)Y", perl); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b)|(q)(r)(s))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(r)(s))|(q))(c)(d)(Y)", perl, "XYabcdY", 
match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH("^X(?7)(a)(?|(b|(?|(r)|(t))(s))|(q))(c)(d)(Y)", perl, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, -1, -1, -1, -1, 4, 5, 5, 6, 6, 7, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"1234", match_default, make_array(0, 4, 0, 4, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\"1234\"", match_default, make_array(0, 6, 0, 6, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\x100"L"1234", match_default, make_array(0, 5, 1, 5, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\"\x100"L"1234\"", match_default, make_array(1, 6, 2, 6, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\x100\x100"L"12ab", match_default, make_array(0, 4, 2, 4, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\x100\x100"L"\"12\"", match_default, make_array(0, 6, 2, 6, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{100}*(\\d+|\"(?1)\")", perl, L"\x100\x100"L"abcd", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("(ab|c)(?-1)", perl, "abc", match_default, make_array(0, 3, 0, 2, -2, -2)); + TEST_REGEX_SEARCH("xy(?+1)(abc)", perl, "xyabcabc", match_default, make_array(0, 8, 5, 8, -2, -2)); + TEST_REGEX_SEARCH("xy(?+1)(abc)", perl, "xyabc", match_default, make_array(-2, -2)); + TEST_INVALID_REGEX("x(?-0)y", perl); + TEST_INVALID_REGEX("x(?-1)y", perl); + TEST_INVALID_REGEX("x(?+0)y", perl); + TEST_INVALID_REGEX("x(?+1)y", perl); + TEST_REGEX_SEARCH("^(?+1)(?x|y){0}z", perl, "xzxx", match_default, make_array(0, 2, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("^(?+1)(?x|y){0}z", perl, "yzyy", match_default, make_array(0, 2, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("^(?+1)(?x|y){0}z", perl, "xxz", match_default, make_array(-2, -2)); + + // Now recurse to sub-expression zero: + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(abcd)", match_default, make_array(0, 6, -2, 
-2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(abcd)xyz", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "xyz(abcd)", match_default, make_array(3, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(ab(xy)cd)pqr", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(ab(xycd)pqr", match_default, make_array(3, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "() abc ()", match_default, make_array(0, 2, -2, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "12(abcde(fsh)xyz(foo(bar))lmno)89", match_default, make_array(2, 31, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "abcd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "abcd)", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?0))*\\)", perl, "(abcd", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?0) )* \\) ", perl|mod_x, "(ab(xy)cd)pqr", match_default, make_array(0, 10, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?0) )* \\) ", perl|mod_x, "1(abcd)(x(y)z)pqr", match_default, make_array(1, 7, 2, 6, -2, 7, 14, 12, 13, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "(abcd)", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(3, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "(a(b(c)d)e)", match_default, make_array(4, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "((ab))", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) ) \\) ", perl|mod_x, "()", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) )? 
\\) ", perl|mod_x, "()", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?0) )? \\) ", perl|mod_x, "12(abcde(fsh)xyz(foo(bar))lmno)89", match_default, make_array(8, 13, -2, 20, 25, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?0) )* \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( (123)? ( ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, -1, -1, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( (123)? ( ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(123ab(xy)cd)", match_default, make_array(0, 13, 1, 4, 4, 12, 10, 12, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (123)? ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, -1, -1, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (123)? ( (?>[^()]+) | (?0) )* ) \\) ", perl|mod_x, "(123ab(xy)cd)", match_default, make_array(0, 13, 1, 12, 1, 4, 10, 12, -2, -2)); + TEST_REGEX_SEARCH("\\( (((((((((( ( (?>[^()]+) | (?0) )* )))))))))) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()<>]+) | ((?>[^()]+)) | (?0) )* ) \\) ", perl|mod_x, "(abcd(xyz

    qrs)123)", match_default, make_array(0, 20, 1, 19, 16, 19, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | ((?0)) )* ) \\) ", perl|mod_x, "(ab(cd)ef)", match_default, make_array(0, 10, 1, 9, 7, 9, 3, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | ((?0)) )* ) \\) ", perl|mod_x, "(ab(cd(ef)gh)ij)", match_default, make_array(0, 16, 1, 15, 13, 15, 3, 13, -2, -2)); + // Again with (?R): + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(abcd)", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(abcd)xyz", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "xyz(abcd)", match_default, make_array(3, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(ab(xy)cd)pqr", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(ab(xycd)pqr", match_default, make_array(3, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "() abc ()", match_default, make_array(0, 2, -2, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "12(abcde(fsh)xyz(foo(bar))lmno)89", match_default, make_array(2, 31, -2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "abcd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "abcd)", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\((?:(?>[^()]+)|(?R))*\\)", perl, "(abcd", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?R) )* \\) ", perl|mod_x, "(ab(xy)cd)pqr", match_default, make_array(0, 10, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?R) )* \\) ", perl|mod_x, "1(abcd)(x(y)z)pqr", match_default, make_array(1, 7, 2, 6, -2, 7, 14, 12, 13, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "(abcd)", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", 
perl|mod_x, "(ab(xy)cd)", match_default, make_array(3, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "(a(b(c)d)e)", match_default, make_array(4, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "((ab))", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) ) \\) ", perl|mod_x, "()", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) )? \\) ", perl|mod_x, "()", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("\\( (?: (?>[^()]+) | (?R) )? \\) ", perl|mod_x, "12(abcde(fsh)xyz(foo(bar))lmno)89", match_default, make_array(8, 13, -2, 20, 25, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (?>[^()]+) | (?R) )* \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( (123)? ( ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, -1, -1, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( (123)? ( ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(123ab(xy)cd)", match_default, make_array(0, 13, 1, 4, 4, 12, 10, 12, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (123)? ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, -1, -1, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( (123)? ( (?>[^()]+) | (?R) )* ) \\) ", perl|mod_x, "(123ab(xy)cd)", match_default, make_array(0, 13, 1, 12, 1, 4, 10, 12, -2, -2)); + TEST_REGEX_SEARCH("\\( (((((((((( ( (?>[^()]+) | (?R) )* )))))))))) \\) ", perl|mod_x, "(ab(xy)cd)", match_default, make_array(0, 10, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 7, 9, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()<>]+) | ((?>[^()]+)) | (?R) )* ) \\) ", perl|mod_x, "(abcd(xyz

    qrs)123)", match_default, make_array(0, 20, 1, 19, 16, 19, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | ((?R)) )* ) \\) ", perl|mod_x, "(ab(cd)ef)", match_default, make_array(0, 10, 1, 9, 7, 9, 3, 7, -2, -2)); + TEST_REGEX_SEARCH("\\( ( ( (?>[^()]+) | ((?R)) )* ) \\) ", perl|mod_x, "(ab(cd(ef)gh)ij)", match_default, make_array(0, 16, 1, 15, 13, 15, 3, 13, -2, -2)); + // And some extra cases: + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xab", match_default, make_array(0, 3, 1, 3, -1, -1, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xbc", match_default, make_array(0, 3, 1, 3, 1, 3, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xde", match_default, make_array(0, 3, 1, 3, 1, 3, 1, 3, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xxab", match_default, make_array(0, 4, 1, 4, 1, 4, 1, 4, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xxxab", match_default, make_array(0, 5, 1, 5, 1, 5, 1, 5, -2, -2)); + TEST_REGEX_SEARCH("x(ab|(bc|(de|(?R))))", perl|mod_x, "xyab", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("[^()]*(?:\\((?R)\\)[^()]*)*", perl|mod_x, "(this(and)that)", match_default, make_array(0, 15, -2, 15, 15, -2, -2)); + TEST_REGEX_SEARCH("[^()]*(?:\\((?R)\\)[^()]*)*", perl|mod_x, "(this(and)that)stuff", match_default, make_array(0, 20, -2, 20, 20, -2, -2)); + TEST_REGEX_SEARCH("[^()]*(?:\\((?>(?R))\\)[^()]*)*", perl|mod_x, "(this(and)that)", match_default, make_array(0, 15, -2, 15, 15, -2, -2)); + + // More complex cases involving (?(R): + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "<>", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, " hij>", match_default, make_array(0, 15, -2, -2)); + TEST_REGEX_SEARCH("< 
(?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, " hij>", match_default, make_array(5, 10, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "def>", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "", match_default, make_array(4, 6, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "(?.)\\W*(?&one)\\W*\\k|)|(?(?.)\\W*(?&three)\\W*\\k'four'|\\W*.\\W*))\\W*$", perl|mod_x|icase, "Satan, oscillate my metallic sonatas!", match_default, make_array(0, 37, -1, -1, -1, -1, 0, 36, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("(?'abc'a|b)(?d|e)(?&abc){2}", perl|mod_x, "bdaa", match_default, make_array(0, 4, 0, 1, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?'abc'a|b)(?d|e)(?&abc){2}", perl|mod_x, "bdab", match_default, make_array(0, 4, 0, 1, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?'abc'a|b)(?d|e)(?&abc){2}", perl|mod_x, "bddd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?&abc)X(?P)", perl|mod_x, "abcPXP123", match_default, make_array(3, 6, 5, 6, -2, -2)); + TEST_REGEX_SEARCH("(?:a(?&abc)b)*(?x)", perl|mod_x, "123axbaxbaxbx456", match_default, make_array(3, 13, 12, 13 , -2, -2)); + TEST_REGEX_SEARCH("(?:a(?&abc)b){1,5}(?x)", perl|mod_x, "123axbaxbaxbx456", match_default, make_array(3, 13, 12, 13 , -2, -2)); + TEST_REGEX_SEARCH("(?:a(?&abc)b){2,5}(?x)", perl|mod_x, "123axbaxbaxbx456", match_default, make_array(3, 13, 12, 13 , -2, -2)); + TEST_REGEX_SEARCH("(?:a(?&abc)b){2,}(?x)", perl|mod_x, "123axbaxbaxbx456", match_default, make_array(3, 13, 12, 13 , -2, -2)); + TEST_INVALID_REGEX("(?)(?&)", perl|mod_x); + TEST_INVALID_REGEX("(?)(?&a)", perl|mod_x); + TEST_INVALID_REGEX("(?)(?&aaaaaaaaaaaaaaaaaaaaaaa)", perl|mod_x); + TEST_INVALID_REGEX("(?&N)[]a(?)](?abc)", perl|mod_x); + TEST_INVALID_REGEX("(?&N)[]a(?)](abc)", perl|mod_x); + TEST_INVALID_REGEX("(?&N)[]a(?)](abc)", perl|mod_x); + TEST_REGEX_SEARCH("^X(?&N)(a)(?|(b)|(q))(c)(d)(?Y)", 
perl|mod_x, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); + // And again with (?P> : + //TEST_REGEX_SEARCH("^\\W*(?:(?(?.)\\W*(?&one)\\W*\\k|)|(?(?.)\\W*(?&three)\\W*\\k'four'|\\W*.\\W*))\\W*$", perl|mod_x|icase, "Satan, oscillate my metallic sonatas!", match_default, make_array(0, 37, -1, -1, -1, -1, 0, 36, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("(?'abc'a|b)(?d|e)(?P>abc){2}", perl|mod_x, "bdaa", match_default, make_array(0, 4, 0, 1, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?'abc'a|b)(?d|e)(?P>abc){2}", perl|mod_x, "bdab", match_default, make_array(0, 4, 0, 1, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("(?'abc'a|b)(?d|e)(?P>abc){2}", perl|mod_x, "bddd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?P>abc)X(?P)", perl|mod_x, "abcPXP123", match_default, make_array(3, 6, 5, 6, -2, -2)); + TEST_REGEX_SEARCH("(?:a(?P>abc)b)*(?x)", perl|mod_x, "123axbaxbaxbx456", match_default, make_array(3, 13, 12, 13 , -2, -2)); + TEST_REGEX_SEARCH("(?:a(?P>abc)b){1,5}(?x)", perl|mod_x, "123axbaxbaxbx456", match_default, make_array(3, 13, 12, 13 , -2, -2)); + TEST_REGEX_SEARCH("(?:a(?P>abc)b){2,5}(?x)", perl|mod_x, "123axbaxbaxbx456", match_default, make_array(3, 13, 12, 13 , -2, -2)); + TEST_REGEX_SEARCH("(?:a(?P>abc)b){2,}(?x)", perl|mod_x, "123axbaxbaxbx456", match_default, make_array(3, 13, 12, 13 , -2, -2)); + TEST_INVALID_REGEX("(?)(?P>)", perl|mod_x); + TEST_INVALID_REGEX("(?)(?P>a)", perl|mod_x); + TEST_INVALID_REGEX("(?)(?P>aaaaaaaaaaaaaaaaaaaaaaa)", perl|mod_x); + TEST_INVALID_REGEX("(?P>N)[]a(?)](?abc)", perl|mod_x); + TEST_INVALID_REGEX("(?P>N)[]a(?)](abc)", perl|mod_x); + TEST_INVALID_REGEX("(?P>N)[]a(?)](abc)", perl|mod_x); + TEST_REGEX_SEARCH("^X(?P>N)(a)(?|(b)|(q))(c)(d)(?Y)", perl|mod_x, "XYabcdY", match_default, make_array(0, 7, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, -2, -2)); + // Now check (?(R&NAME) : + TEST_REGEX_SEARCH("(? 
(?'B' abc (?(R) (?(R&A)1) (?(R&B)2) X | (?1) (?2) (?R) ))) ", perl|mod_x, "abcabc1Xabc2XabcXabcabc", match_default, make_array(0, 17, 0, 17, 0, 17, -2, -2)); + TEST_INVALID_REGEX("(? (?'B' abc (?(R) (?(R&1)1) (?(R&B)2) X | (?1) (?2) (?R) ))) ", perl|mod_x); + TEST_REGEX_SEARCH("(?<1> (?'B' abc (?(R) (?(R&1)1) (?(R&B)2) X | (?1) (?2) (?R) ))) ", perl|mod_x, "abcabc1Xabc2XabcXabcabc", match_default, make_array(0, 17, 0, 17, 0, 17, -2, -2)); + + // Now check for named conditionals: + TEST_REGEX_SEARCH("^(?a)? (?()b|c) (?('ab')d|e)", perl|mod_x, "abd", match_default, make_array(0, 3, 0, 1, -2, -2)); + TEST_REGEX_SEARCH("^(?a)? (?()b|c) (?('ab')d|e)", perl|mod_x, "ce", match_default, make_array(0, 2, -1, -1, -2, -2)); + + // Recursions in combination with (DEFINE): + TEST_REGEX_SEARCH("^(?(DEFINE) (? a) (? b) ) (?&A) (?&B) ", perl|mod_x, "abcd", match_default, make_array(0, 2, -1, -1, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("(?(?&NAME_PAT))\\s+(?(?&ADDRESS_PAT)) (?(DEFINE) (?[a-z]+) (?\\d+))", perl|mod_x, "metcalfe 33", match_default, make_array(0, 11, 0, 8, 9, 11, -1, -1, -1, -1, -2, -2)); + TEST_INVALID_REGEX("^(?(DEFINE) abc | xyz ) ", perl|mod_x); + //TEST_INVALID_REGEX("(?(DEFINE) abc){3} xyz", perl|mod_x); + TEST_REGEX_SEARCH("(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))\\b(?&byte)(\\.(?&byte)){3}", perl|mod_x, "1.2.3.4", match_default, make_array(0, 7, -1, -1, 5, 7, -2, -2)); + TEST_REGEX_SEARCH("(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))\\b(?&byte)(\\.(?&byte)){3}", perl|mod_x, "131.111.10.206", match_default, make_array(0, 14, -1, -1, 10, 14, -2, -2)); + TEST_REGEX_SEARCH("(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))\\b(?&byte)(\\.(?&byte)){3}", perl|mod_x, "10.0.0.0", match_default, make_array(0, 8, -1, -1, 6, 8, -2, -2)); + TEST_REGEX_SEARCH("(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))\\b(?&byte)(\\.(?&byte)){3}", perl|mod_x, "10.6", match_default, make_array(-2, -2)); + 
TEST_REGEX_SEARCH("(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))\\b(?&byte)(\\.(?&byte)){3}", perl|mod_x, "455.3.4.5", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\b(?&byte)(\\.(?&byte)){3}(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))", perl|mod_x, "1.2.3.4", match_default, make_array(0, 7, 5, 7, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("\\b(?&byte)(\\.(?&byte)){3}(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))", perl|mod_x, "131.111.10.206", match_default, make_array(0, 14, 10, 14, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("\\b(?&byte)(\\.(?&byte)){3}(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))", perl|mod_x, "10.0.0.0", match_default, make_array(0, 8, 6, 8, -1, -1, -2, -2)); + TEST_REGEX_SEARCH("\\b(?&byte)(\\.(?&byte)){3}(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))", perl|mod_x, "10.6", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("\\b(?&byte)(\\.(?&byte)){3}(?(DEFINE)(?2[0-4]\\d|25[0-5]|1\\d\\d|[1-9]?\\d))", perl|mod_x, "455.3.4.5", match_default, make_array(-2, -2)); +} + diff --git a/test/regress/test_replace.cpp b/test/regress/test_replace.cpp index e2acf380..06c137bf 100644 --- a/test/regress/test_replace.cpp +++ b/test/regress/test_replace.cpp @@ -88,6 +88,16 @@ void test_replace() TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "?1A:B", "...AB,,,AB*AB?"); TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "(?1A:B)C", "...ACBC,,,ACBC*ACBC?"); TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "?1:B", "...B,,,B*B?"); + + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "(?{1}A)(?{2}B)", "...AB,,,AB*AB?"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "?{1}A:B", "...AB,,,AB*AB?"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "(?{1}A:B)C", "...ACBC,,,ACBC*ACBC?"); + 
TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "?{1}:B", "...B,,,B*B?"); + TEST_REGEX_REPLACE("(?a+)|(?b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "(?{one}A)(?{two}B)", "...AB,,,AB*AB?"); + TEST_REGEX_REPLACE("(?a+)|(?b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "?{one}A:B", "...AB,,,AB*AB?"); + TEST_REGEX_REPLACE("(?a+)|(?b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "(?{one}A:B)C", "...ACBC,,,ACBC*ACBC?"); + TEST_REGEX_REPLACE("(?a+)|(?b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "?{one}:B", "...B,,,B*B?"); + // move to copying unmatched data, but replace first occurance only: TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_all|format_first_only, "bbb", "...bbb,,,"); TEST_REGEX_REPLACE("a+(b+)", perl, "...aaabb,,,", match_default|format_all|format_first_only, "$1", "...bb,,,"); @@ -126,5 +136,53 @@ void test_replace() TEST_REGEX_REPLACE("(a+)", perl, "...aaa,,,", match_default, "/${10}/", "...//,,,"); TEST_REGEX_REPLACE("((((((((((a+))))))))))", perl, "...aaa,,,", match_default, "/${10}/", ".../aaa/,,,"); TEST_REGEX_REPLACE("(a+)", perl, "...aaa,,,", match_default, "/${1}0/", ".../aaa0/,,,"); + + // New Perl style operators: + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$MATCH", "aaa"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "${MATCH}", "aaa"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "${^MATCH}", "aaa"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$MATC", "$MATC"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "${MATCH", "${MATCH"); + + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$PREMATCH", "..."); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "${PREMATCH}", "..."); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", 
match_default|format_no_copy, "${^PREMATCH}", "..."); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$PREMATC", "$PREMATC"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "${PREMATCH", "${PREMATCH"); + + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$POSTMATCH", ",,,"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "${POSTMATCH}", ",,,"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "${^POSTMATCH}", ",,,"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$POSTMATC", "$POSTMATC"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "${POSTMATCH", "${POSTMATCH"); + + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$LAST_PAREN_MATCH", ""); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$LAST_PAREN_MATC", "$LAST_PAREN_MATC"); + TEST_REGEX_REPLACE("(a+)", perl, "...aaa,,,", match_default|format_no_copy, "$LAST_PAREN_MATCH", "aaa"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, "...aaabb,,,", match_default|format_no_copy, "$LAST_PAREN_MATCH", "bb"); + + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$+", ""); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$+foo", "foo"); + TEST_REGEX_REPLACE("(a+)", perl, "...aaa,,,", match_default|format_no_copy, "$+", "aaa"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, "...aaabb,,,", match_default|format_no_copy, "$+foo", "bbfoo"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, "...aaabb,,,", match_default|format_no_copy, "$+{", "bb{"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, "...aaabb,,,", match_default|format_no_copy, "$+{foo", "bb{foo"); + + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$LAST_SUBMATCH_RESULT", ""); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$LAST_SUBMATCH_RESUL", 
"$LAST_SUBMATCH_RESUL"); + TEST_REGEX_REPLACE("(a+)", perl, "...aaa,,,", match_default|format_no_copy, "$LAST_SUBMATCH_RESULT", "aaa"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, "...aaabb,,,", match_default|format_no_copy, "$LAST_SUBMATCH_RESULT", "bb"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaa,,,", match_default|format_no_copy, "$LAST_SUBMATCH_RESULT", "aaa"); + + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$^N", ""); + TEST_REGEX_REPLACE("(a+)", perl, "...aaa,,,", match_default|format_no_copy, "$^N", "aaa"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, "...aaabb,,,", match_default|format_no_copy, "$^N", "bb"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaa,,,", match_default|format_no_copy, "$^N", "aaa"); + + TEST_REGEX_REPLACE("(?a+)(?b+)", perl, " ...aabb,,", match_default|format_no_copy, "$&", "aabb"); + TEST_REGEX_REPLACE("(?a+)(?b+)", perl, " ...aabb,,", match_default|format_no_copy, "$1", "aa"); + TEST_REGEX_REPLACE("(?a+)(?b+)", perl, " ...aabb,,", match_default|format_no_copy, "$2", "bb"); + TEST_REGEX_REPLACE("(?a+)(?b+)", perl, " ...aabb,,", match_default|format_no_copy, "d$+{one}c", "daac"); + TEST_REGEX_REPLACE("(?a+)(?b+)", perl, " ...aabb,,", match_default|format_no_copy, "c$+{two}d", "cbbd"); } diff --git a/test/regress/test_sets.cpp b/test/regress/test_sets.cpp index 61374819..3cd7a520 100644 --- a/test/regress/test_sets.cpp +++ b/test/regress/test_sets.cpp @@ -266,6 +266,10 @@ void test_sets2() TEST_REGEX_SEARCH("[\\W]+", perl, "AB_ AB", match_default, make_array(3, 6, -2, -2)); TEST_REGEX_SEARCH("[[:^word:]]+", perl, "AB_ AB", match_default, make_array(3, 6, -2, -2)); TEST_REGEX_SEARCH("\\W+", perl, "AB_ AB", match_default, make_array(3, 6, -2, -2)); + TEST_REGEX_SEARCH("\\h+", perl, "\v\f\r\n \t\n", match_default, make_array(4, 6, -2, -2)); + TEST_REGEX_SEARCH("\\V+", perl, "\v\f\r\n \t\n", match_default, make_array(4, 6, -2, -2)); + TEST_REGEX_SEARCH("\\H+", perl, " \t\v\f\r\n ", match_default, make_array(2, 6, 
-2, -2)); + TEST_REGEX_SEARCH("\\v+", perl, " \t\v\f\r\n ", match_default, make_array(2, 6, -2, -2)); test_sets2c(); } diff --git a/test/regress/test_simple_repeats.cpp b/test/regress/test_simple_repeats.cpp index 6811e5f9..690c71eb 100644 --- a/test/regress/test_simple_repeats.cpp +++ b/test/regress/test_simple_repeats.cpp @@ -436,3 +436,44 @@ void test_fast_repeats2() } +void test_pocessive_repeats() +{ + using namespace boost::regex_constants; + // and again for sets: + TEST_REGEX_SEARCH("^(\\w++|\\s++)*$", perl, "now is the time for all good men to come to the aid of the party", match_default, make_array(0, 64, 59, 64, -2, -2)); + TEST_REGEX_SEARCH("^(\\w++|\\s++)*$", perl, "this is not a line with only words and spaces!", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(\\d++)(\\w)", perl, "12345a", match_default, make_array(0, 6, 0, 5, 5, 6, -2, -2)); + TEST_REGEX_SEARCH("(\\d++)(\\w)", perl, "12345+", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(\\d++)(\\w)", perl, "12345", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a++b", perl, "aaab", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("(a++b)", perl, "aaab", match_default, make_array(0, 4, 0, 4, -2, -2)); + TEST_REGEX_SEARCH("([^()]++|\\([^()]*\\))+", perl, "((abc(ade)ufh()()x", match_default, make_array(2, 18, 17, 18, -2, -2)); + TEST_REGEX_SEARCH("\\(([^()]++|\\([^()]+\\))+\\)", perl, "(abc)", match_default, make_array(0, 5, 1, 4, -2, -2)); + TEST_REGEX_SEARCH("\\(([^()]++|\\([^()]+\\))+\\)", perl, "(abc(def)xyz)", match_default, make_array(0, 13, 9, 12, -2, -2)); + TEST_REGEX_SEARCH("\\(([^()]++|\\([^()]+\\))+\\)", perl, "((()aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", match_default, make_array(-2, -2)); + /* + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "<>", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "", match_default, make_array(0, 6, -2, -2)); + 
TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, " hij>", match_default, make_array(0, 15, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, " hij>", match_default, make_array(5, 10, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "def>", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "", match_default, make_array(4, 6, -2, -2)); + TEST_REGEX_SEARCH("< (?: (?(R) \\d++ | [^<>]*+) | (?R)) * >", perl|mod_x, "]*+) | (?2)) * >))", perl|mod_x, "<>", match_default, make_array(0, 2, 0, 2, 0, 2, -2, -2)); + TEST_REGEX_SEARCH("((< (?: (?(R) \d++ | [^<>]*+) | (?2)) * >))", perl|mod_x, "", match_default, make_array(0, 6, 0, 6, 0, 6, -2, -2)); + TEST_REGEX_SEARCH("((< (?: (?(R) \d++ | [^<>]*+) | (?2)) * >))", perl|mod_x, " hij>", match_default, make_array(0, 15, 0, 15, 0, 15, -2, -2)); + TEST_REGEX_SEARCH("((< (?: (?(R) \d++ | [^<>]*+) | (?2)) * >))", perl|mod_x, " hij>", match_default, make_array(5, 10, 5, 10, 5, 10, -2, -2)); + TEST_REGEX_SEARCH("((< (?: (?(R) \d++ | [^<>]*+) | (?2)) * >))", perl|mod_x, "def>", match_default, make_array(0, 10, 0, 10, 0, 10, -2, -2)); + TEST_REGEX_SEARCH("((< (?: (?(R) \d++ | [^<>]*+) | (?2)) * >))", perl|mod_x, "", match_default, make_array(4, 6, 4, 6, 4, 6, -2, -2)); + TEST_REGEX_SEARCH("((< (?: (?(R) \d++ | [^<>]*+) | (?2)) * >))", perl|mod_x, "