From 641d60b05976d2412242d44cee2dc49f02cf81a8 Mon Sep 17 00:00:00 2001 From: John Maddock Date: Mon, 19 Apr 2004 12:23:41 +0000 Subject: [PATCH] Almost complete implementation... [SVN r22669] --- include/boost/regex/config.hpp | 6 +- include/boost/regex/static_mutex.hpp | 17 +- include/boost/regex/v4/basic_regex.hpp | 2 +- .../boost/regex/v4/basic_regex_creator.hpp | 136 +- include/boost/regex/v4/basic_regex_parser.hpp | 290 ++++- include/boost/regex/v4/cpp_regex_traits.hpp | 125 +- include/boost/regex/v4/cregex.hpp | 8 + include/boost/regex/v4/error_type.hpp | 6 +- .../regex/v4/perl_matcher_non_recursive.hpp | 2 +- .../boost/regex/v4/perl_matcher_recursive.hpp | 2 +- include/boost/regex/v4/regex_raw_buffer.hpp | 2 +- .../boost/regex/v4/regex_traits_defaults.hpp | 13 +- src/cpp_regex_traits.cpp | 1 + src/regex_raw_buffer.cpp | 1 + src/regex_traits_defaults.cpp | 84 +- src/static_mutex.cpp | 7 +- test/regress/basic_tests.cpp | 1108 +++++++++-------- test/regress/main.cpp | 8 + test/regress/test.hpp | 49 +- test/regress/test_regex_search.hpp | 103 +- 20 files changed, 1378 insertions(+), 592 deletions(-) diff --git a/include/boost/regex/config.hpp b/include/boost/regex/config.hpp index 8a7e081b..24683fea 100644 --- a/include/boost/regex/config.hpp +++ b/include/boost/regex/config.hpp @@ -236,9 +236,9 @@ namespace boost{ typedef wchar_t regex_wchar_type; } # if defined(BOOST_REGEX_DYN_LINK) || defined(BOOST_ALL_DYN_LINK) # define BOOST_DYN_LINK # endif -#ifdef BOOST_REGEX_DIAG -# define BOOST_LIB_DIAGNOSTIC -#endif +# ifdef BOOST_REGEX_DIAG +# define BOOST_LIB_DIAGNOSTIC +# endif # include #endif diff --git a/include/boost/regex/static_mutex.hpp b/include/boost/regex/static_mutex.hpp index 9ff5d600..9ea0e7ca 100644 --- a/include/boost/regex/static_mutex.hpp +++ b/include/boost/regex/static_mutex.hpp @@ -22,6 +22,7 @@ #define BOOST_REGEX_STATIC_MUTEX_HPP #include +#include // dll import/export options. 
#ifdef BOOST_HAS_PTHREADS #include @@ -35,7 +36,7 @@ // namespace boost{ -class scoped_static_mutex_lock; +class BOOST_REGEX_DECL scoped_static_mutex_lock; class static_mutex { @@ -46,7 +47,7 @@ public: #define BOOST_STATIC_MUTEX_INIT { PTHREAD_MUTEX_INITIALIZER, } -class scoped_static_mutex_lock +class BOOST_REGEX_DECL scoped_static_mutex_lock { public: scoped_static_mutex_lock(static_mutex& mut, bool lk = true); @@ -82,7 +83,7 @@ inline bool scoped_static_mutex_lock::locked()const namespace boost{ -class scoped_static_mutex_lock; +class BOOST_REGEX_DECL scoped_static_mutex_lock; class static_mutex { @@ -93,7 +94,7 @@ public: #define BOOST_STATIC_MUTEX_INIT { 0, } -class scoped_static_mutex_lock +class BOOST_REGEX_DECL scoped_static_mutex_lock { public: scoped_static_mutex_lock(static_mutex& mut, bool lk = true); @@ -134,10 +135,10 @@ inline bool scoped_static_mutex_lock::locked()const namespace boost{ -class scoped_static_mutex_lock; -extern "C" void free_static_mutex(); +class BOOST_REGEX_DECL scoped_static_mutex_lock; +extern "C" BOOST_REGEX_DECL void free_static_mutex(); -class static_mutex +class BOOST_REGEX_DECL static_mutex { public: typedef scoped_static_mutex_lock scoped_lock; @@ -148,7 +149,7 @@ public: #define BOOST_STATIC_MUTEX_INIT { } -class scoped_static_mutex_lock +class BOOST_REGEX_DECL scoped_static_mutex_lock { public: scoped_static_mutex_lock(static_mutex& mut, bool lk = true); diff --git a/include/boost/regex/v4/basic_regex.hpp b/include/boost/regex/v4/basic_regex.hpp index 9e7f6455..15a5fce6 100644 --- a/include/boost/regex/v4/basic_regex.hpp +++ b/include/boost/regex/v4/basic_regex.hpp @@ -68,7 +68,7 @@ struct regex_data // template class basic_regex_implementation - : protected regex_data + : public regex_data { public: typedef regex_constants::syntax_option_type flag_type; diff --git a/include/boost/regex/v4/basic_regex_creator.hpp b/include/boost/regex/v4/basic_regex_creator.hpp index 11e82416..649c7009 100644 --- 
a/include/boost/regex/v4/basic_regex_creator.hpp +++ b/include/boost/regex/v4/basic_regex_creator.hpp @@ -45,8 +45,8 @@ template class basic_char_set { public: - typedef digraph digraph_type; - typedef std::basic_string string_type; + typedef digraph digraph_type; + typedef typename traits::string_type string_type; typedef typename traits::char_class_type mask_type; basic_char_set() @@ -68,8 +68,16 @@ public: { m_ranges.push_back(first); m_ranges.push_back(end); - if(first.second || end.second) + if(first.second) + { m_has_digraphs = true; + add_single(first); + } + if(end.second) + { + m_has_digraphs = true; + add_single(end); + } m_empty = false; } void add_class(mask_type m) @@ -77,10 +85,20 @@ public: m_classes |= m; m_empty = false; } + void add_equivalent(const digraph_type& s) + { + m_equivalents.push_back(s); + if(s.second) + { + m_has_digraphs = true; + add_single(s); + } + m_empty = false; + } void negate() { m_negate = true; - m_empty = false; + //m_empty = false; } // @@ -111,6 +129,14 @@ public: { return m_ranges.end(); } + list_iterator equivalents_begin()const + { + return m_equivalents.begin(); + } + list_iterator equivalents_end()const + { + return m_equivalents.end(); + } mask_type classes()const { return m_classes; @@ -126,6 +152,7 @@ private: bool m_has_digraphs; // true if we have digraphs present mask_type m_classes; // character classes to match bool m_empty; // whether we've added anything yet + std::vector m_equivalents; // a list of equivalence classes }; template @@ -189,6 +216,7 @@ private: void set_all_masks(unsigned char* bits, unsigned char); bool is_bad_repeat(re_syntax_base* pt); void set_bad_repeat(re_syntax_base* pt); + syntax_element_type get_repeat_type(re_syntax_base* state); }; template @@ -297,7 +325,7 @@ re_syntax_base* basic_regex_creator::append_set( // result->csingles = static_cast(std::distance(char_set.singles_begin(), char_set.singles_end())); result->cranges = static_cast(std::distance(char_set.ranges_begin(), 
char_set.ranges_end())) / 2; - result->cequivalents = 0; + result->cequivalents = static_cast(std::distance(char_set.equivalents_begin(), char_set.equivalents_end())); result->cclasses = char_set.classes(); if(flags() & regbase::icase) { @@ -377,6 +405,27 @@ re_syntax_base* basic_regex_creator::append_set( std::memcpy(p, s2.c_str(), sizeof(charT) * (s2.size() + 1)); } // + // now process the equivalence classes: + // + first = char_set.equivalents_begin(); + last = char_set.equivalents_end(); + while(first != last) + { + string_type s; + if(first->second) + { + charT cs[2] = { first->first, first->second, }; + s = m_traits.transform_primary(cs, cs+2); + } + else + s = m_traits.transform_primary(&first->first, &first->first+1); + if(s.empty()) + return 0; // invalid or unsupported equivalence class + charT* p = static_cast(this->m_pdata->m_data.extend(sizeof(charT) * (s.size()+1) ) ); + std::memcpy(p, s.c_str(), sizeof(charT) * (s.size() + 1)); + ++first; + } + // // finally reset the address of our last state: // m_last_state = result = static_cast*>(getaddress(offset)); @@ -470,6 +519,32 @@ re_syntax_base* basic_regex_creator::append_set( result->_map[i] = true; } } + // + // now process the equivalence classes: + // + first = char_set.equivalents_begin(); + last = char_set.equivalents_end(); + while(first != last) + { + string_type s; + if(first->second) + { + charT cs[2] = { first->first, first->second, }; + s = m_traits.transform_primary(cs, cs+2); + } + else + s = m_traits.transform_primary(&first->first, &first->first+1); + if(s.empty()) + return 0; // invalid or unsupported equivalence class + for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) + { + charT c(i); + string_type s2 = this->m_traits.transform_primary(&c, &c+1); + if(s == s2) + result->_map[i] = true; + } + ++first; + } if(negate) { for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) @@ -567,6 +642,8 @@ void basic_regex_creator::create_startmaps(re_syntax_base* state) create_startmap(state->next.p, 
static_cast(state)->_map, &static_cast(state)->can_be_null, mask_take); m_bad_repeats = 0; create_startmap(static_cast(state)->alt.p, static_cast(state)->_map, &static_cast(state)->can_be_null, mask_skip); + // adjust the type of the state to allow for faster matching: + state->type = this->get_repeat_type(state); return; default: state = state->next.p; @@ -613,6 +690,10 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, return; } case syntax_element_backref: + // can be null, and any character can match: + if(pnull) + *pnull |= mask; + // fall through: case syntax_element_wild: { // can't be null, any character can match: @@ -668,13 +749,18 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, if(map) { typedef typename traits::char_class_type mask_type; - map[0] |= mask_init; - for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) + if(static_cast*>(state)->singleton) { - charT c = static_cast(i); - if(&c != re_is_set_member(&c, &c + 1, static_cast*>(state), *m_pdata)) - map[i] |= mask; + map[0] |= mask_init; + for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) + { + charT c = static_cast(i); + if(&c != re_is_set_member(&c, &c + 1, static_cast*>(state), *m_pdata)) + map[i] |= mask; + } } + else + set_all_masks(map, mask); } return; case syntax_element_set: @@ -772,7 +858,6 @@ unsigned basic_regex_creator::get_restart_type(re_syntax_base* st continue; case syntax_element_start_line: return regbase::restart_line; - case syntax_element_word_boundary: case syntax_element_word_start: return regbase::restart_word; case syntax_element_buffer_start: @@ -848,6 +933,35 @@ void basic_regex_creator::set_bad_repeat(re_syntax_base* pt) } } +template +syntax_element_type basic_regex_creator::get_repeat_type(re_syntax_base* state) +{ + typedef typename traits::char_class_type mask_type; + if(state->type == syntax_element_rep) + { + // check to see if we are repeating a single state: + if(state->next.p->next.p->next.p == static_cast(state)->alt.p) + { 
+ switch(state->next.p->type) + { + case re_detail::syntax_element_wild: + return re_detail::syntax_element_dot_rep; + case re_detail::syntax_element_literal: + return re_detail::syntax_element_char_rep; + case re_detail::syntax_element_set: + return re_detail::syntax_element_short_set_rep; + case re_detail::syntax_element_long_set: + if(static_cast*>(state->next.p)->singleton) + return re_detail::syntax_element_long_set_rep; + break; + default: + break; + } + } + } + return state->type; +} + } // namespace re_detail } // namespace boost diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index 4272314b..19f2efbd 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -49,11 +49,15 @@ public: bool parse_backref(); void parse_set_literal(basic_char_set& char_set); bool parse_inner_set(basic_char_set& char_set); - digraph get_next_set_literal(); + bool parse_QE(); + bool parse_perl_extension(); + digraph get_next_set_literal(basic_char_set& char_set); charT unescape_character(); private: typedef bool (basic_regex_parser::*parser_proc_type)(); + typedef typename traits::string_type string_type; + typedef typename traits::char_class_type char_class_type; parser_proc_type m_parser_proc; // the main parser to use const charT* m_base; // the start of the string being parsed const charT* m_end; // the end of the string being parsed @@ -235,13 +239,29 @@ bool basic_regex_parser::parse_literal() template bool basic_regex_parser::parse_open_paren() { + // + // skip the '(' and error check: + // + if(++m_position == m_end) + fail(REG_EPAREN, m_position - m_base); + // + // begin by checking for a perl-style (?...) 
extension: + // + if((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0) + { + if(m_traits.syntax_type(*m_position) == regex_constants::syntax_question) + return parse_perl_extension(); + } // // update our mark count, and append the required state: // - unsigned markid = ++m_mark_count; + unsigned markid; + if(this->flags() & regbase::nosubs) + markid = 0; + else + markid = ++m_mark_count; re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); pb->index = markid; - ++m_position; std::ptrdiff_t last_paren_start = this->getoffset(pb); // back up insertion point for alternations, and set new point: std::ptrdiff_t last_alt_point = m_alt_insert_point; @@ -392,6 +412,18 @@ bool basic_regex_parser::parse_extended_escape() ++m_position; this->append_state(syntax_element_soft_buffer_end); break; + case regex_constants::escape_type_Q: + return parse_QE(); + case regex_constants::escape_type_C: + return parse_match_any(); + case regex_constants::escape_type_X: + ++m_position; + this->append_state(syntax_element_combining); + break; + case regex_constants::escape_type_G: + ++m_position; + this->append_state(syntax_element_restart_continue); + break; default: this->append_literal(unescape_character()); break; @@ -465,6 +497,7 @@ bool basic_regex_parser::parse_repeat(std::size_t low, std::size_ case syntax_element_alt: case syntax_element_soft_buffer_end: case syntax_element_restart_continue: + case syntax_element_jump: // can't legally repeat any of the above: fail(REG_BADRPT, m_position - m_base); default: @@ -653,6 +686,38 @@ bool basic_regex_parser::parse_set() if(parse_inner_set(char_set)) break; return true; + case regex_constants::syntax_escape: + { + // + // look ahead and see if this is a character class shortcut + // \d \w \s etc... 
+ // + ++m_position; + if(this->m_traits.escape_syntax_type(*m_position) + == regex_constants::escape_type_class) + { + char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1); + if(m) + { + char_set.add_class(m); + break; + } + } + else if(this->m_traits.escape_syntax_type(*m_position) + == regex_constants::escape_type_not_class) + { + // negated character classes aren't supported: + char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1); + if(m) + { + fail(REG_EESCAPE, m_position - m_base); + } + } + // not a character class, just a regular escape: + --m_position; + parse_set_literal(char_set); + break; + } default: parse_set_literal(char_set); break; @@ -673,6 +738,13 @@ bool basic_regex_parser::parse_inner_set(basic_char_setm_traits.syntax_type(*m_position)) { + case regex_constants::syntax_dot: + // + // a collating element is treated as a literal: + // + --m_position; + parse_set_literal(char_set); + return true; case regex_constants::syntax_colon: { // check that character classes are actually enabled: @@ -733,6 +805,37 @@ bool basic_regex_parser::parse_inner_set(basic_char_setm_traits.syntax_type(*m_position) != regex_constants::syntax_equal)) + ++m_position; + const charT* name_last = m_position; + if(m_end == m_position) + fail(REG_EBRACK, m_position - m_base); + if((m_end == ++m_position) + || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)) + fail(REG_EBRACK, m_position - m_base); + string_type m = this->m_traits.lookup_collatename(name_first, name_last); + if((0 == m.size()) || (m.size() > 2)) + fail(REG_ECOLLATE, name_first - m_base); + digraph d; + d.first = m[0]; + if(m.size() > 1) + d.second = m[1]; + else + d.second = 0; + char_set.add_equivalent(d); + ++m_position; + break; + } default: --m_position; parse_set_literal(char_set); @@ -744,7 +847,7 @@ bool basic_regex_parser::parse_inner_set(basic_char_set void basic_regex_parser::parse_set_literal(basic_char_set& char_set) 
{ - digraph start_range = get_next_set_literal(); + digraph start_range = get_next_set_literal(char_set); if(m_end == m_position) fail(REG_EBRACK, m_position - m_base); if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash) @@ -754,7 +857,7 @@ void basic_regex_parser::parse_set_literal(basic_char_setm_traits.syntax_type(*m_position) != regex_constants::syntax_close_set) { - digraph end_range = get_next_set_literal(); + digraph end_range = get_next_set_literal(char_set); char_set.add_range(start_range, end_range); if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash) fail(REG_ERANGE, m_position - m_base); @@ -766,11 +869,22 @@ void basic_regex_parser::parse_set_literal(basic_char_set -digraph basic_regex_parser::get_next_set_literal() +digraph basic_regex_parser::get_next_set_literal(basic_char_set& char_set) { + typedef typename traits::string_type string_type; digraph result; switch(this->m_traits.syntax_type(*m_position)) { + case regex_constants::syntax_dash: + if(!char_set.empty()) + { + // see if we are at the end of the set: + if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)) + fail(REG_ERANGE, m_position - m_base); + --m_position; + } + result.first = *m_position++; + return result; case regex_constants::syntax_escape: // check to see if escapes are supported first: if(this->flags() & regex_constants::no_escape_in_lists) @@ -781,6 +895,43 @@ digraph basic_regex_parser::get_next_set_literal() ++m_position; result = unescape_character(); break; + case regex_constants::syntax_open_set: + { + if(m_end == ++m_position) + fail(REG_ECOLLATE, m_position - m_base); + if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot) + { + --m_position; + result.first = *m_position; + ++m_position; + return result; + } + if(m_end == ++m_position) + fail(REG_ECOLLATE, m_position - m_base); + const charT* name_first = m_position; + // skip at least one 
character, then find the matching ':]' + if(m_end == ++m_position) + fail(REG_ECOLLATE, name_first - m_base); + while((m_position != m_end) + && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)) + ++m_position; + const charT* name_last = m_position; + if(m_end == m_position) + fail(REG_ECOLLATE, name_first - m_base); + if((m_end == ++m_position) + || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)) + fail(REG_ECOLLATE, name_first - m_base); + ++m_position; + string_type s = this->m_traits.lookup_collatename(name_first, name_last); + if(s.empty() || (s.size() > 2)) + fail(REG_ECOLLATE, name_first - m_base); + result.first = s[0]; + if(s.size() > 1) + result.second = s[1]; + else + result.second = 0; + return result; + } default: result = *m_position++; } @@ -916,6 +1067,133 @@ bool basic_regex_parser::parse_backref() return true; } +template +bool basic_regex_parser::parse_QE() +{ + // + // parse a \Q...\E sequence: + // + ++m_position; // skip the Q + const charT* start = m_position; + const charT* end; + do + { + while((m_position != m_end) + && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape)) + ++m_position; + if((m_position == m_end) || (++m_position == m_end)) // skip the escape + { + fail(REG_EESCAPE, m_position - m_base); + return false; + } + // check to see if it's a \E: + if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_E) + { + ++m_position; + end = m_position - 2; + break; + } + // otherwise go round again: + }while(true); + // + // now add all the character between the two escapes as literals: + // + while(start != end) + { + this->append_literal(*start); + ++start; + } + return true; +} + +template +bool basic_regex_parser::parse_perl_extension() +{ + if(++m_position == m_end) + fail(REG_BADRPT, m_position - m_base); + // + // backup some state, and prepare the way: + // + int markid; + std::ptrdiff_t jump_offset = 0; + re_brace* pb = 
static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); + std::ptrdiff_t last_paren_start = this->getoffset(pb); + // back up insertion point for alternations, and set new point: + std::ptrdiff_t last_alt_point = m_alt_insert_point; + this->m_pdata->m_data.align(); + m_alt_insert_point = this->m_pdata->m_data.size(); + // + // select the actual extension used: + // + switch(this->m_traits.syntax_type(*m_position)) + { + case regex_constants::syntax_colon: + // + // a non-capturing mark: + // + pb->index = markid = 0; + ++m_position; + break; + case regex_constants::syntax_hash: + // + // a comment; this actually becomes an empty non-capturing mark: + // + pb->index = markid = 0; + while((m_position != m_end) + && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)) + ++m_position; + break; + case regex_constants::syntax_equal: + pb->index = markid = -1; + ++m_position; + jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); + this->m_pdata->m_data.align(); + m_alt_insert_point = this->m_pdata->m_data.size(); + break; + case regex_constants::syntax_not: + pb->index = markid = -2; + ++m_position; + jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); + this->m_pdata->m_data.align(); + m_alt_insert_point = this->m_pdata->m_data.size(); + break; + default: + fail(REG_BADRPT, m_position - m_base); + } + // + // now recursively add more states, this will terminate when we get to a + // matching ')' : + // + parse_all(); + // + // we either have a ')' or we have run out of characters prematurely: + // + if(m_position == m_end) + this->fail(REG_EPAREN, std::distance(m_base, m_end)); + BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark); + ++m_position; + // + // set up the jump pointer if we have one: + // + if(jump_offset) + { + this->m_pdata->m_data.align(); + re_jump* jmp = 
static_cast(this->getaddress(jump_offset)); + jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp); + } + // + // append closing parenthesis state: + // + pb = static_cast(this->append_state(syntax_element_endmark, sizeof(re_brace))); + pb->index = markid; + this->m_paren_start = last_paren_start; + // + // restore the alternate insertion point: + // + this->m_alt_insert_point = last_alt_point; + return true; +} + } // namespace re_detail } // namespace boost diff --git a/include/boost/regex/v4/cpp_regex_traits.hpp b/include/boost/regex/v4/cpp_regex_traits.hpp index 75ce8cd3..7975f96f 100644 --- a/include/boost/regex/v4/cpp_regex_traits.hpp +++ b/include/boost/regex/v4/cpp_regex_traits.hpp @@ -25,6 +25,9 @@ #ifdef BOOST_HAS_THREADS #include #endif +#ifndef BOOST_REGEX_PRIMARY_TRANSFORM +#include +#endif namespace boost{ @@ -272,7 +275,7 @@ typename cpp_regex_traits_char_layer::string_type // specialised version for narrow characters: // template <> -class cpp_regex_traits_char_layer : public cpp_regex_traits_base +class BOOST_REGEX_DECL cpp_regex_traits_char_layer : public cpp_regex_traits_base { typedef std::string string_type; public: @@ -326,6 +329,7 @@ public: typedef std::basic_string string_type; + typedef charT char_type; //cpp_regex_traits_implementation(); cpp_regex_traits_implementation(const std::locale& l); std::string error_string(regex_constants::error_type n) const @@ -348,16 +352,88 @@ public: } return result; } + string_type lookup_collatename(const charT* p1, const charT* p2) const; + string_type transform_primary(const charT* p1, const charT* p2) const; + string_type transform(const charT* p1, const charT* p2) const + { + return this->m_pcollate->transform(p1, p2); + } re_detail::parser_buf m_sbuf; // buffer for parsing numbers. std::basic_istream m_is; // stream for parsing numbers. 
private: std::map m_error_strings; // error messages indexed by numberic ID + std::map m_custom_class_names; // character class names + std::map m_custom_collate_names; // collating element names + unsigned m_collate_type; // the form of the collation string + charT m_collate_delim; // the collation group delimiter // // helpers: // char_class_type lookup_classname_imp(const charT* p1, const charT* p2) const; }; +template +typename cpp_regex_traits_implementation::string_type + cpp_regex_traits_implementation::transform_primary(const charT* p1, const charT* p2) const +{ + string_type result; + // + // What we do here depends upon the format of the sort key returned by + // sort key returned by this->transform: + // + switch(m_collate_type) + { + case sort_C: + case sort_unknown: + // the best we can do is translate to lower case, then get a regular sort key: + { + result.assign(p1, p2); + m_pctype->tolower(&*result.begin(), &*result.end()); + result = this->m_pcollate->transform(&*result.begin(), &*result.end()); + break; + } + case sort_fixed: + { + // get a regular sort key, and then truncate it: + result.assign(this->m_pcollate->transform(&*result.begin(), &*result.end())); + result.erase(this->m_collate_delim); + break; + } + case sort_delim: + // get a regular sort key, and then truncate everything after the delim: + result.assign(this->m_pcollate->transform(&*result.begin(), &*result.end())); + std::size_t i; + for(i = 0; i < result.size(); ++i) + { + if(result[i] == m_collate_delim) + break; + } + result.erase(i); + break; + } + return result; +} + +template +typename cpp_regex_traits_implementation::string_type + cpp_regex_traits_implementation::lookup_collatename(const charT* p1, const charT* p2) const +{ + typedef typename std::map::const_iterator iter_type; + if(m_custom_collate_names.size()) + { + iter_type pos = m_custom_collate_names.find(string_type(p1, p2)); + if(pos != m_custom_collate_names.end()) + return pos->second; + } + std::string name(p1, 
p2); + name = lookup_default_collate_name(name); + if(name.size()) + return string_type(name.begin(), name.end()); + if(p2 - p1 == 1) + return string_type(1, *p1); + return string_type(); +} + template cpp_regex_traits_implementation::cpp_regex_traits_implementation(const std::locale& l) : cpp_regex_traits_char_layer(l), m_is(&m_sbuf) @@ -385,6 +461,9 @@ cpp_regex_traits_implementation::cpp_regex_traits_implementation(const st // if((int)cat >= 0) { + // + // Error messages: + // for(boost::regex_constants::error_type i = 0; i <= boost::regex_constants::error_unknown; ++i) { const char* p = get_default_error_string(i); @@ -402,7 +481,38 @@ cpp_regex_traits_implementation::cpp_regex_traits_implementation(const st } m_error_strings[i] = result; } + // + // Custom class names: + // + static const char_class_type masks[] = + { + std::ctype::alnum, + std::ctype::alpha, + std::ctype::cntrl, + std::ctype::digit, + std::ctype::graph, + std::ctype::lower, + std::ctype::print, + std::ctype::punct, + std::ctype::space, + std::ctype::upper, + std::ctype::xdigit, + cpp_regex_traits_implementation::mask_blank, + cpp_regex_traits_implementation::mask_word, + cpp_regex_traits_implementation::mask_unicode, + }; + static const string_type null_string; + for(unsigned int j = 0; j <= 13; ++j) + { + string_type s(this->m_pmessages->get(cat, 0, j+300, null_string)); + if(s.size()) + this->m_custom_class_names[s] = masks[j]; + } } + // + // get the collation format used by m_pcollate: + // + m_collate_type = re_detail::find_sort_syntax(this, &m_collate_delim); } template @@ -432,6 +542,13 @@ typename cpp_regex_traits_implementation::char_class_type std::ctype::alnum | cpp_regex_traits_implementation::mask_word, std::ctype::xdigit, }; + if(m_custom_class_names.size()) + { + typedef typename std::map, char_class_type>::const_iterator map_iter; + map_iter pos = m_custom_class_names.find(string_type(p1, p2)); + if(pos != m_custom_class_names.end()) + return pos->second; + } std::size_t id = 
1 + re_detail::get_default_class_id(p1, p2); assert(id < sizeof(masks) / sizeof(masks[0])); return masks[id]; @@ -491,9 +608,9 @@ public: { return m_pimpl->m_pcollate->transform(p1, p2); } - string_type transform_primary(const charT* , const charT* ) const + string_type transform_primary(const charT* p1, const charT* p2) const { - return string_type(); + return m_pimpl->transform_primary(p1, p2); } char_class_type lookup_classname(const charT* p1, const charT* p2) const { @@ -501,7 +618,7 @@ public: } string_type lookup_collatename(const charT* p1, const charT* p2) const { - return string_type(); + return m_pimpl->lookup_collatename(p1, p2); } bool is_class(charT c, char_class_type f) const { diff --git a/include/boost/regex/v4/cregex.hpp b/include/boost/regex/v4/cregex.hpp index 28dc4796..c82856e7 100644 --- a/include/boost/regex/v4/cregex.hpp +++ b/include/boost/regex/v4/cregex.hpp @@ -47,7 +47,11 @@ typedef size_t regsize_t; typedef struct { unsigned int re_magic; +#ifdef __cplusplus std::size_t re_nsub; /* number of parenthesized subexpressions */ +#else + size_t re_nsub; +#endif const char* re_endp; /* end pointer for REG_PEND */ void* guts; /* none of your business :-) */ match_flag_type eflags; /* none of your business :-) */ @@ -57,7 +61,11 @@ typedef struct typedef struct { unsigned int re_magic; +#ifdef __cplusplus std::size_t re_nsub; /* number of parenthesized subexpressions */ +#else + size_t re_nsub; +#endif const wchar_t* re_endp; /* end pointer for REG_PEND */ void* guts; /* none of your business :-) */ match_flag_type eflags; /* none of your business :-) */ diff --git a/include/boost/regex/v4/error_type.hpp b/include/boost/regex/v4/error_type.hpp index 866c7d12..b89ff01c 100644 --- a/include/boost/regex/v4/error_type.hpp +++ b/include/boost/regex/v4/error_type.hpp @@ -52,13 +52,13 @@ static const reg_error_t REG_ESPACE = 12; /* Ran out of memory. */ static const reg_error_t REG_BADRPT = 13; /* No preceding re for repetition op. 
*/ static const reg_error_t REG_EEND = 14; /* unexpected end of expression */ static const reg_error_t REG_ESIZE = 15; /* expression too big */ -static const reg_error_t REG_ERPAREN = REG_EPAREN; /* unmatched right parenthesis */ +static const reg_error_t REG_ERPAREN = 8; /* = REG_EPAREN : unmatched right parenthesis */ static const reg_error_t REG_EMPTY = 17; /* empty expression */ -static const reg_error_t REG_E_MEMORY = REG_ESIZE; /* out of memory */ +static const reg_error_t REG_E_MEMORY = 15; /* = REG_ESIZE : out of memory */ static const reg_error_t REG_ECOMPLEXITY = 18; /* complexity too high */ static const reg_error_t REG_ESTACK = 19; /* out of stack space */ static const reg_error_t REG_E_UNKNOWN = 20; /* unknown error */ -static const reg_error_t REG_ENOSYS = REG_E_UNKNOWN; /* Reserved. */ +static const reg_error_t REG_ENOSYS = 20; /* = REG_E_UNKNOWN : Reserved. */ #ifdef __cplusplus namespace regex_constants{ diff --git a/include/boost/regex/v4/perl_matcher_non_recursive.hpp b/include/boost/regex/v4/perl_matcher_non_recursive.hpp index 0aa9d55e..ced7a8d8 100644 --- a/include/boost/regex/v4/perl_matcher_non_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_non_recursive.hpp @@ -153,7 +153,7 @@ bool perl_matcher::match_all_states() { if(state_count > max_state_count) raise_error(traits_inst, REG_ESPACE); - if((m_match_flags & match_partial) && (position == last)) + if((m_match_flags & match_partial) && (position == last) && (position != search_base)) m_has_partial_match = true; if(false == unwind(false)) return m_recursive_result; diff --git a/include/boost/regex/v4/perl_matcher_recursive.hpp b/include/boost/regex/v4/perl_matcher_recursive.hpp index 155dfec0..586bb973 100644 --- a/include/boost/regex/v4/perl_matcher_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_recursive.hpp @@ -86,7 +86,7 @@ bool perl_matcher::match_all_states() ++state_count; if(!(this->*proc)()) { - if((m_match_flags & match_partial) && (position == last)) + 
if((m_match_flags & match_partial) && (position == last) && (position != search_base)) m_has_partial_match = true; return 0; } diff --git a/include/boost/regex/v4/regex_raw_buffer.hpp b/include/boost/regex/v4/regex_raw_buffer.hpp index 06b7ef7b..d4740d05 100644 --- a/include/boost/regex/v4/regex_raw_buffer.hpp +++ b/include/boost/regex/v4/regex_raw_buffer.hpp @@ -96,7 +96,7 @@ enum{ // this is used by basic_regex for expression storage // -class raw_storage +class BOOST_REGEX_DECL raw_storage { public: typedef std::size_t size_type; diff --git a/include/boost/regex/v4/regex_traits_defaults.hpp b/include/boost/regex/v4/regex_traits_defaults.hpp index 724d5197..5e1d6cea 100644 --- a/include/boost/regex/v4/regex_traits_defaults.hpp +++ b/include/boost/regex/v4/regex_traits_defaults.hpp @@ -21,11 +21,11 @@ namespace boost{ namespace re_detail{ -const char* get_default_syntax(regex_constants::syntax_type n); -const char* get_default_error_string(regex_constants::error_type n); +BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants::syntax_type n); +BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_error_string(regex_constants::error_type n); // is charT c a combining character? 
-bool is_combining_implementation(uint_least16_t s); +BOOST_REGEX_DECL bool BOOST_REGEX_CALL is_combining_implementation(uint_least16_t s); template inline bool is_combining(charT c) @@ -75,9 +75,14 @@ inline bool is_combining(wchar_t c) template inline bool is_separator(charT c) { - return BOOST_REGEX_MAKE_BOOL((c == '\n') || (c == '\r')); + return BOOST_REGEX_MAKE_BOOL((c == '\n') || (c == '\r') || (static_cast(c) == 0x2028) || (static_cast(c) == 0x2029)); } +// +// get a default collating element: +// +BOOST_REGEX_DECL std::string BOOST_REGEX_CALL lookup_default_collate_name(const std::string& name); + // // get the id of a character clasification, the individual // traits classes then transform that id into a bitmask: diff --git a/src/cpp_regex_traits.cpp b/src/cpp_regex_traits.cpp index faeeab16..be8ce1b6 100644 --- a/src/cpp_regex_traits.cpp +++ b/src/cpp_regex_traits.cpp @@ -16,6 +16,7 @@ * DESCRIPTION: Implements cpp_regex_traits (and associated helper classes). */ +#define BOOST_REGEX_SOURCE #include namespace boost{ namespace re_detail{ diff --git a/src/regex_raw_buffer.cpp b/src/regex_raw_buffer.cpp index 28c03825..dc13cd78 100644 --- a/src/regex_raw_buffer.cpp +++ b/src/regex_raw_buffer.cpp @@ -1,5 +1,6 @@ +#define BOOST_REGEX_SOURCE #include namespace boost{ namespace re_detail{ diff --git a/src/regex_traits_defaults.cpp b/src/regex_traits_defaults.cpp index 44a239fe..d0d54e2c 100644 --- a/src/regex_traits_defaults.cpp +++ b/src/regex_traits_defaults.cpp @@ -16,11 +16,12 @@ * DESCRIPTION: Declares API's for access to regex_traits default properties. */ +#define BOOST_REGEX_SOURCE #include namespace boost{ namespace re_detail{ -const char* get_default_syntax(regex_constants::syntax_type n) +BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants::syntax_type n) { // if the user hasn't supplied a message catalog, then this supplies // default "messages" for us to load in the range 1-100. 
@@ -77,13 +78,13 @@ const char* get_default_syntax(regex_constants::syntax_type n) "X", "C", "Z", - "G" + "G", "!", }; return ((n >= (sizeof(messages) / sizeof(messages[1]))) ? "" : messages[n]); } -const char* get_default_error_string(regex_constants::error_type n) +BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_error_string(regex_constants::error_type n) { static const char* const s_default_error_messages[] = { "Success", /* REG_NOERROR */ @@ -115,7 +116,7 @@ const char* get_default_error_string(regex_constants::error_type n) return (n > REG_E_UNKNOWN) ? s_default_error_messages[REG_E_UNKNOWN] : s_default_error_messages[n]; } -bool is_combining_implementation(boost::uint_least16_t c) +BOOST_REGEX_DECL bool BOOST_REGEX_CALL is_combining_implementation(boost::uint_least16_t c) { const boost::uint_least16_t combining_ranges[] = { 0x0300, 0x0361, 0x0483, 0x0486, @@ -164,5 +165,80 @@ bool is_combining_implementation(boost::uint_least16_t c) return false; } +// +// these are the POSIX collating names: +// +BOOST_REGEX_DECL const char* def_coll_names[] = { +"NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "alert", "backspace", "tab", "newline", +"vertical-tab", "form-feed", "carriage-return", "SO", "SI", "DLE", "DC1", "DC2", "DC3", "DC4", "NAK", +"SYN", "ETB", "CAN", "EM", "SUB", "ESC", "IS4", "IS3", "IS2", "IS1", "space", "exclamation-mark", +"quotation-mark", "number-sign", "dollar-sign", "percent-sign", "ampersand", "apostrophe", +"left-parenthesis", "right-parenthesis", "asterisk", "plus-sign", "comma", "hyphen", +"period", "slash", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", +"colon", "semicolon", "less-than-sign", "equals-sign", "greater-than-sign", +"question-mark", "commercial-at", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", +"Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "left-square-bracket", "backslash", +"right-square-bracket", "circumflex", "underscore", "grave-accent", 
"a", "b", "c", "d", "e", "f", +"g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "left-curly-bracket", +"vertical-line", "right-curly-bracket", "tilde", "DEL", "", +}; + +// these multi-character collating elements +// should keep most Western-European locales +// happy - we should really localise these a +// little more - but this will have to do for +// now: + +BOOST_REGEX_DECL const char* def_multi_coll[] = { + "ae", + "Ae", + "AE", + "ch", + "Ch", + "CH", + "ll", + "Ll", + "LL", + "ss", + "Ss", + "SS", + "nj", + "Nj", + "NJ", + "dz", + "Dz", + "DZ", + "lj", + "Lj", + "LJ", + "", +}; + + + +BOOST_REGEX_DECL std::string BOOST_REGEX_CALL lookup_default_collate_name(const std::string& name) +{ + unsigned int i = 0; + while(*def_coll_names[i]) + { + if(def_coll_names[i] == name) + { + return std::string(1, char(i)); + } + ++i; + } + i = 0; + while(*def_multi_coll[i]) + { + if(def_multi_coll[i] == name) + { + return def_multi_coll[i]; + } + ++i; + } + return std::string(); +} + + } // re_detail } // boost diff --git a/src/static_mutex.cpp b/src/static_mutex.cpp index d1d683dd..e290ec27 100644 --- a/src/static_mutex.cpp +++ b/src/static_mutex.cpp @@ -16,6 +16,7 @@ * DESCRIPTION: Declares static_mutex lock type. 
*/ +#define BOOST_REGEX_SOURCE #include #ifdef BOOST_HAS_THREADS @@ -86,7 +87,7 @@ void scoped_static_mutex_lock::lock() { if(0 == m_have_lock) { -#if defined(BOOST_MSVC) && (BOOST_MSVC <=1200) +#if !defined(InterlockedCompareExchangePointer) while(0 != InterlockedCompareExchange(reinterpret_cast((boost::uint_least16_t*)&(m_mutex.m_mutex)), (void*)1, 0)) #else while(0 != InterlockedCompareExchange(reinterpret_cast(&(m_mutex.m_mutex)), 1, 0)) @@ -102,7 +103,7 @@ void scoped_static_mutex_lock::unlock() { if(m_have_lock) { -#if defined(BOOST_MSVC) && (BOOST_MSVC <=1200) +#if !defined(InterlockedCompareExchangePointer) InterlockedExchange((LONG*)&(m_mutex.m_mutex), 0); #else InterlockedExchange(reinterpret_cast(&(m_mutex.m_mutex)), 0); @@ -121,7 +122,7 @@ void scoped_static_mutex_lock::unlock() boost::recursive_mutex* static_mutex::m_pmutex = 0; boost::once_flag static_mutex::m_once = BOOST_ONCE_INIT; -extern "C" void free_static_mutex() +extern "C" BOOST_REGEX_DECL void free_static_mutex() { delete static_mutex::m_pmutex; static_mutex::m_pmutex = 0; diff --git a/test/regress/basic_tests.cpp b/test/regress/basic_tests.cpp index 91a0eb1c..78183294 100644 --- a/test/regress/basic_tests.cpp +++ b/test/regress/basic_tests.cpp @@ -10,8 +10,8 @@ void basic_tests() TEST_REGEX_SEARCH("Z", perl, "aaa", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("Z", perl, "xxxxZZxxx", match_default, make_array(4, 5, -2, 5, 6, -2, -2)); // and some simple brackets: - TEST_REGEX_SEARCH("(a)", perl, "zzzaazz", match_default, make_array(3, 4, 3, 4, 4, 5, 4, 5, -2, -2)); - TEST_REGEX_SEARCH("()", perl, "zzz", match_default, make_array(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, -2, -2)); + TEST_REGEX_SEARCH("(a)", perl, "zzzaazz", match_default, make_array(3, 4, 3, 4, -2, 4, 5, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("()", perl, "zzz", match_default, make_array(0, 0, 0, 0, -2, 1, 1, 1, 1, -2, 2, 2, 2, 2, -2, 3, 3, 3, 3, -2, -2)); TEST_REGEX_SEARCH("()", perl, "", match_default, 
make_array(0, 0, 0, 0, -2, -2)); TEST_INVALID_REGEX("(", perl); TEST_INVALID_REGEX(")", perl); @@ -23,11 +23,11 @@ void basic_tests() TEST_INVALID_REGEX("\\()", perl); TEST_INVALID_REGEX("(\\)", perl); TEST_REGEX_SEARCH("p(a)rameter", perl, "ABCparameterXYZ", match_default, make_array(3, 12, 4, 5, -2, -2)); - //TEST_REGEX_SEARCH("[pq](a)rameter", perl, "ABCparameterXYZ", match_default, make_array(3, 12, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("[pq](a)rameter", perl, "ABCparameterXYZ", match_default, make_array(3, 12, 4, 5, -2, -2)); // now try escaped brackets: - TEST_REGEX_SEARCH("\\(a\\)", basic, "zzzaazz", match_default, make_array(3, 4, 3, 4, 4, 5, 4, 5, -2, -2)); - TEST_REGEX_SEARCH("\\(\\)", basic, "zzz", match_default, make_array(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, -2, -2)); + TEST_REGEX_SEARCH("\\(a\\)", basic, "zzzaazz", match_default, make_array(3, 4, 3, 4, -2, 4, 5, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("\\(\\)", basic, "zzz", match_default, make_array(0, 0, 0, 0, -2, 1, 1, 1, 1, -2, 2, 2, 2, 2, -2, 3, 3, 3, 3, -2, -2)); TEST_REGEX_SEARCH("\\(\\)", basic, "", match_default, make_array(0, 0, 0, 0, -2, -2)); TEST_INVALID_REGEX("\\(", basic); TEST_INVALID_REGEX("\\)", basic); @@ -38,7 +38,7 @@ void basic_tests() TEST_INVALID_REGEX("\\()", basic); TEST_INVALID_REGEX("(\\)", basic); TEST_REGEX_SEARCH("p\\(a\\)rameter", basic, "ABCparameterXYZ", match_default, make_array(3, 12, 4, 5, -2, -2)); - //TEST_REGEX_SEARCH("[pq]\\(a\\)rameter", basic, "ABCparameterXYZ", match_default, make_array(3, 12, 4, 5, -2, -2)); + TEST_REGEX_SEARCH("[pq]\\(a\\)rameter", basic, "ABCparameterXYZ", match_default, make_array(3, 12, 4, 5, -2, -2)); // now move on to "." 
wildcards TEST_REGEX_SEARCH(".", perl, "a", match_default, make_array(0, 1, -2, -2)); @@ -65,7 +65,7 @@ void basic_tests() TEST_REGEX_SEARCH(".", basic, "\0", match_not_dot_null | match_not_dot_newline, make_array(-2, -2)); // simple repeats: - TEST_REGEX_SEARCH("a*", perl, "b", match_default, make_array(0, 0, -2, -2)); + TEST_REGEX_SEARCH("a*", perl, "b", match_default, make_array(0, 0, -2, 1, 1, -2, -2)); TEST_REGEX_SEARCH("ab*", perl, "ab", match_default, make_array(0, 2, -2, -2)); TEST_REGEX_SEARCH("ab*", perl, "sssabbbbbbsss", match_default, make_array(3, 10, -2, -2)); TEST_REGEX_SEARCH("ab*c*", perl, "a", match_default, make_array(0, 1, -2, -2)); @@ -75,10 +75,10 @@ void basic_tests() TEST_INVALID_REGEX("*a", perl); TEST_INVALID_REGEX("\\<*", perl); TEST_INVALID_REGEX("\\>*", perl); - TEST_REGEX_SEARCH("\n*", perl, "\n\n", match_default, make_array(0, 2, -2, -2)); - TEST_REGEX_SEARCH("\\**", perl, "**", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("\n*", perl, "\n\n", match_default, make_array(0, 2, -2, 2, 2, -2, -2)); + TEST_REGEX_SEARCH("\\**", perl, "**", match_default, make_array(0, 2, -2, 2, 2, -2, -2)); TEST_REGEX_SEARCH("\\*", perl, "*", match_default, make_array(0, 1, -2, -2)); - TEST_REGEX_SEARCH("(ab)*", perl, "abab", match_default, make_array(0, 4, 2, 4, -2, -2)); + TEST_REGEX_SEARCH("(ab)*", perl, "abab", match_default, make_array(0, 4, 2, 4, -2, 4, 4, -2, -2)); // now try operator + : TEST_REGEX_SEARCH("ab+", perl, "a", match_default, make_array(-2, -2)); @@ -100,7 +100,7 @@ void basic_tests() TEST_REGEX_SEARCH("a\\+", basic|bk_plus_qm, "aa", match_default, make_array(0, 2, -2, -2)); // now try operator ? 
- TEST_REGEX_SEARCH("a?", perl, "b", match_default, make_array(0, 0, -2, -2)); + TEST_REGEX_SEARCH("a?", perl, "b", match_default, make_array(0, 0, -2, 1, 1, -2, -2)); TEST_REGEX_SEARCH("ab?", perl, "a", match_default, make_array(0, 1, -2, -2)); TEST_REGEX_SEARCH("ab?", perl, "ab", match_default, make_array(0, 2, -2, -2)); TEST_REGEX_SEARCH("ab?", perl, "sssabbbbbbsss", match_default, make_array(3, 5, -2, -2)); @@ -111,14 +111,14 @@ void basic_tests() TEST_INVALID_REGEX("?a", perl); TEST_INVALID_REGEX("\\?", perl); - TEST_REGEX_SEARCH("\n?", perl, "\n\n", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("\n?", perl, "\n\n", match_default, make_array(0, 1, -2, 1, 2, -2, 2, 2, -2, -2)); TEST_REGEX_SEARCH("\\?", perl, "?", match_default, make_array(0, 1, -2, -2)); TEST_REGEX_SEARCH("\\?", perl, "?", match_default, make_array(0, 1, -2, -2)); - TEST_REGEX_SEARCH("\\??", perl, "??", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("\\??", perl, "??", match_default, make_array(0, 1, -2, 1, 2, -2, 2, 2, -2, -2)); TEST_REGEX_SEARCH("?", basic|bk_plus_qm, "?", match_default, make_array(0, 1, -2, -2)); TEST_INVALID_REGEX("\\?", basic|bk_plus_qm); - TEST_REGEX_SEARCH("a\\?", basic|bk_plus_qm, "aa", match_default, make_array(0, 1, -2, -2)); - TEST_REGEX_SEARCH("a\\?", basic|bk_plus_qm, "b", match_default, make_array(0, 0, -2, -2)); + TEST_REGEX_SEARCH("a\\?", basic|bk_plus_qm, "aa", match_default, make_array(0, 1, -2, 1, 2, -2, 2, 2, -2, -2)); + TEST_REGEX_SEARCH("a\\?", basic|bk_plus_qm, "b", match_default, make_array(0, 0, -2, 1, 1, -2, -2)); TEST_REGEX_SEARCH("a?", basic, "a?", match_default, make_array(0, 2, -2, -2)); TEST_REGEX_SEARCH("a+", basic, "a+", match_default, make_array(0, 2, -2, -2)); @@ -310,6 +310,75 @@ void test_sets() TEST_REGEX_SEARCH("\\W", perl, "`", match_default, make_array(0, 1, -2, -2)); TEST_REGEX_SEARCH("\\W", perl, "[", match_default, make_array(0, 1, -2, -2)); TEST_REGEX_SEARCH("\\W", perl, "@", match_default, 
make_array(0, 1, -2, -2)); + + // collating elements + TEST_REGEX_SEARCH("[[.zero.]]", perl, "0", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.one.]]", perl, "1", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.two.]]", perl, "2", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.three.]]", perl, "3", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.a.]]", perl, "bac", match_default, make_array(1, 2, -2, -2)); + TEST_REGEX_SEARCH("[[.right-curly-bracket.]]", perl, "}", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.NUL.]]", perl, "\0", match_default, make_array(0, 1, -2, -2)); + TEST_INVALID_REGEX("[[:<:]z]", perl); + TEST_INVALID_REGEX("[a[:>:]]", perl); + TEST_REGEX_SEARCH("[[.A.]]", extended|icase, "A", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.A.]]", extended|icase, "a", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.A.]-b]+", extended|icase, "AaBb", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("[A-[.b.]]+", extended|icase, "AaBb", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("[[.a.]-B]+", extended|icase, "AaBb", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("[a-[.B.]]+", extended|icase, "AaBb", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("[\x61]", extended, "a", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[\x61-c]+", extended, "abcd", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("[a-\x63]+", extended, "abcd", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("[[.a.]-c]+", extended, "abcd", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("[a-[.c.]]+", extended, "abcd", match_default, make_array(0, 3, -2, -2)); + TEST_INVALID_REGEX("[[:alpha:]-a]", extended); + TEST_INVALID_REGEX("[a-[:alpha:]]", extended); + TEST_REGEX_SEARCH("[[.ae.]]", basic, "ae", match_default, make_array(0, 2, -2, -2)); + 
TEST_REGEX_SEARCH("[[.ae.]]", basic, "aE", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("[[.AE.]]", basic, "AE", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[[.Ae.]]", basic, "Ae", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[[.ae.]-b]", basic, "a", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("[[.ae.]-b]", basic, "b", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.ae.]-b]", basic, "ae", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[a-[.ae.]]", basic, "a", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[a-[.ae.]]", basic, "b", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("[a-[.ae.]]", basic, "ae", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[[.ae.]]", basic|icase, "AE", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[[.ae.]]", basic|icase, "Ae", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[[.AE.]]", basic|icase, "Ae", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[[.Ae.]]", basic|icase, "aE", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[[.AE.]-B]", basic|icase, "a", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("[[.Ae.]-b]", basic|icase, "b", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.Ae.]-b]", basic|icase, "B", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[.ae.]-b]", basic|icase, "AE", match_default, make_array(0, 2, -2, -2)); + // + // try some equivalence classes: + // + TEST_REGEX_SEARCH("[[=a=]]", basic, "a", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[=a=]]", basic, "A", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("[[=ae=]]", basic, "ae", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("[[=right-curly-bracket=]]", basic, "}", match_default, make_array(0, 1, -2, -2)); + // + // now some perl style single character classes: + // + 
TEST_REGEX_SEARCH("\\l+", perl, "ABabcAB", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("[\\l]+", perl, "ABabcAB", match_default, make_array(2, 5, -2, -2)); + TEST_INVALID_REGEX("[\\l-a]", perl); + TEST_INVALID_REGEX("[\\L]", perl); + TEST_REGEX_SEARCH("\\L+", perl, "abABCab", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("\\u+", perl, "abABCab", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("[\\u]+", perl, "abABCab", match_default, make_array(2, 5, -2, -2)); + TEST_INVALID_REGEX("[\\U]", perl); + TEST_REGEX_SEARCH("\\U+", perl, "ABabcAB", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("\\d+", perl, "AB012AB", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("[\\d]+", perl, "AB012AB", match_default, make_array(2, 5, -2, -2)); + TEST_INVALID_REGEX("[\\D]", perl); + TEST_REGEX_SEARCH("\\D+", perl, "01abc01", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("\\s+", perl, "AB AB", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("[\\s]+", perl, "AB AB", match_default, make_array(2, 5, -2, -2)); + TEST_INVALID_REGEX("[\\S]", perl); + TEST_REGEX_SEARCH("\\S+", perl, " abc ", match_default, make_array(2, 5, -2, -2)); } void test_anchors() @@ -343,6 +412,13 @@ void test_anchors() TEST_REGEX_SEARCH("ab$", extended, "ab", match_default | match_not_bol | match_not_eol | match_single_line, make_array(-2, -2)); TEST_REGEX_SEARCH("ab$", extended, "abxx", match_default | match_not_bol | match_not_eol | match_single_line, make_array(-2, -2)); TEST_REGEX_SEARCH("ab$", extended, "ab\nzz", match_default | match_not_bol | match_not_eol | match_single_line, make_array(-2, -2)); + // + // changes to newline handling with 2.11: + // + TEST_REGEX_SEARCH("^.", extended, " \n \r\n ", match_default, make_array(0, 1, -2, 3, 4, -2, 7, 8, -2, -2)); + TEST_REGEX_SEARCH(".$", extended, " \n \r\n ", match_default, make_array(1, 2, -2, 4, 5, -2, 8, 9, -2, -2)); + TEST_REGEX_SEARCH_W(L"^.", 
extended, L"\u2028 \u2028", match_default, make_array(0, 1, -2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH_W(L".$", extended, L" \u2028 \u2028", match_default, make_array(0, 1, -2, 2, 3, -2, 3, 4, -2, -2)); } void test_backrefs() @@ -360,7 +436,7 @@ void test_backrefs() TEST_REGEX_SEARCH("a(([bc])\\2)*d", perl, "abbccd", match_default, make_array(0, 6, 3, 5, 3, 4, -2, -2)); TEST_REGEX_SEARCH("a(([bc])\\2)*d", perl, "abbcbd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("a((b)*\\2)*d", perl, "abbbd", match_default, make_array(0, 5, 1, 4, 2, 3, -2, -2)); - TEST_REGEX_SEARCH("(ab*)[ab]*\\1", perl, "ababaaa", match_default, make_array(0, 4, 0, 2, -2, -2)); + TEST_REGEX_SEARCH("(ab*)[ab]*\\1", perl, "ababaaa", match_default, make_array(0, 4, 0, 2, -2, 4, 7, 4, 5, -2, -2)); TEST_REGEX_SEARCH("(a)\\1bcd", perl, "aabcd", match_default, make_array(0, 5, 0, 1, -2, -2)); TEST_REGEX_SEARCH("(a)\\1bc*d", perl, "aabcd", match_default, make_array(0, 5, 0, 1, -2, -2)); TEST_REGEX_SEARCH("(a)\\1bc*d", perl, "aabd", match_default, make_array(0, 4, 0, 1, -2, -2)); @@ -396,6 +472,26 @@ void test_character_escapes() TEST_INVALID_REGEX("\\c=", extended); TEST_INVALID_REGEX("\\c?", extended); TEST_REGEX_SEARCH("=:", perl, "=:", match_default, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("\\e", perl, "\x1B", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("\\x1b", perl, "\x1B", match_default, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("\\x{1b}", perl, "\x1B", match_default, make_array(0, 1, -2, -2)); + TEST_INVALID_REGEX("\\x{}", perl); + TEST_INVALID_REGEX("\\x{", perl); + TEST_INVALID_REGEX("\\x}", perl); + TEST_INVALID_REGEX("\\x", perl); + TEST_INVALID_REGEX("\\x{yy", perl); + TEST_INVALID_REGEX("\\x{1b", perl); + // \Q...\E sequences: + TEST_INVALID_REGEX("\\Qabc", perl); + TEST_INVALID_REGEX("\\Qabc\\", perl); + TEST_REGEX_SEARCH("\\Qabc\\E", perl, "abcd", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("\\Qabc\\Ed", perl, "abcde", 
match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("\\Q+*?\\\\E", perl, "+*?\\", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("a\\Q+*?\\\\Eb", perl, "a+*?\\b", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("\\C+", perl, "abcde", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("\\X+", perl, "abcde", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\X", perl, L"a\u0300\u0301", match_default, make_array(0, 3, -2, -2)); } void test_assertion_escapes() @@ -444,12 +540,22 @@ void test_assertion_escapes() TEST_REGEX_SEARCH("abc[[:>:]]", perl, "abcd", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("abc[[:>:]]", perl, "abc\n", match_default, make_array(0, 3, -2, -2)); TEST_REGEX_SEARCH("abc[[:>:]]", perl, "abc::", match_default, make_array(0, 3, -2, -2)); + + TEST_REGEX_SEARCH("\\Aabc", perl, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("\\Aabc", perl, "aabc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("abc\\z", perl, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("abc\\z", perl, "abcd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("abc\\Z", perl, "abc\n\n", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("abc\\Z", perl, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("\\Gabc", perl, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("\\Gabc", perl, "dabcd", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a\\Gbc", perl, "abc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("a\\Aab", perl, "abc", match_default, make_array(-2, -2)); } void test_tricky_cases() { using namespace boost::regex_constants; - //TEST_REGEX_SEARCH("", perl, "", match_default, make_array(-2, -2)); // // now follows various complex expressions designed to try and bust the matcher: // @@ -481,7 +587,7 @@ void test_tricky_cases() TEST_REGEX_SEARCH("a(b)?c\\1d", perl, "acd", 
match_default, make_array(0, 3, -1, -1, -2, -2)); TEST_REGEX_SEARCH("a(b?c)+d", perl, "accd", match_default, make_array(0, 4, 2, 3, -2, -2)); TEST_REGEX_SEARCH("(wee|week)(knights|night)", perl, "weeknights", match_default, make_array(0, 10, 0, 3, 3, 10, -2, -2)); - TEST_REGEX_SEARCH(".*", perl, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH(".*", perl, "abc", match_default, make_array(0, 3, -2, 3, 3, -2, -2)); TEST_REGEX_SEARCH("a(b|(c))d", perl, "abd", match_default, make_array(0, 3, 1, 2, -1, -1, -2, -2)); TEST_REGEX_SEARCH("a(b|(c))d", perl, "acd", match_default, make_array(0, 3, 1, 2, 1, 2, -2, -2)); TEST_REGEX_SEARCH("a(b*|c|e)d", perl, "abbd", match_default, make_array(0, 4, 1, 3, -2, -2)); @@ -502,8 +608,8 @@ void test_tricky_cases() TEST_REGEX_SEARCH("a(bbb+|bb+|b)b", perl, "abb", match_default, make_array(0, 3, 1, 2, -2, -2)); TEST_REGEX_SEARCH("a(bbb+|bb+|b)b", perl, "abbb", match_default, make_array(0, 4, 1, 3, -2, -2)); TEST_REGEX_SEARCH("a(bbb+|bb+|b)bb", perl, "abbb", match_default, make_array(0, 4, 1, 2, -2, -2)); - TEST_REGEX_SEARCH("(.*).*", perl, "abcdef", match_default, make_array(0, 6, 0, 6, -2, -2)); - TEST_REGEX_SEARCH("(a*)*", perl, "bc", match_default, make_array(0, 0, 0, 0, -2, -2)); + TEST_REGEX_SEARCH("(.*).*", perl, "abcdef", match_default, make_array(0, 6, 0, 6, -2, 6, 6, 6, 6, -2, -2)); + TEST_REGEX_SEARCH("(a*)*", perl, "bc", match_default, make_array(0, 0, 0, 0, -2, 1, 1, 1, 1, -2, 2, 2, 2, 2, -2, -2)); TEST_REGEX_SEARCH("xyx*xz", perl, "xyxxxxyxxxz", match_default, make_array(5, 11, -2, -2)); // do we get the right subexpression when it is used more than once? 
TEST_REGEX_SEARCH("a(b|c)*d", perl, "ad", match_default, make_array(0, 2, -1, -1, -2, -2)); @@ -557,6 +663,10 @@ void test_tricky_cases() // perl only: TEST_REGEX_SEARCH("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*", perl, "#define some_symbol(x) \\ \r\n foo();\\\r\n printf(#x);", match_default, make_array(0, 53, 30, 42, -2, -2)); // literals: +} + +void test_tricky_cases2() +{ TEST_REGEX_SEARCH("((0x[[:xdigit:]]+)|([[:digit:]]+))u?((int(8|16|32|64))|L)?", perl, "0xFF", match_default, make_array(0, 4, 0, 4, 0, 4, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2)); TEST_REGEX_SEARCH("((0x[[:xdigit:]]+)|([[:digit:]]+))u?((int(8|16|32|64))|L)?", perl, "35", match_default, make_array(0, 2, 0, 2, -1, -1, 0, 2, -1, -1, -1, -1, -1, -1, -2, -2)); TEST_REGEX_SEARCH("((0x[[:xdigit:]]+)|([[:digit:]]+))u?((int(8|16|32|64))|L)?", perl, "0xFFu", match_default, make_array(0, 5, 0, 4, 0, 4, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2)); @@ -633,527 +743,445 @@ void test_tricky_cases() TEST_REGEX_SEARCH("x", perl, "x", match_default, make_array(0, 1, -2, -2)); TEST_REGEX_SEARCH(":", perl, ":", match_default, make_array(0, 1, -2, -2)); TEST_REGEX_SEARCH("(\\.[[:alnum:]]+){2}", perl, "w.a.b ", match_default, make_array(1, 5, 3, 5, -2, -2)); + + // new bugs detected in spring 2003: + TEST_REGEX_SEARCH("b", perl, "abc", match_default|match_continuous, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?!foo)bar", perl, "foobar", match_default, make_array(3, 6, -2, -2)); + TEST_REGEX_SEARCH("(?!foo)bar", perl, "??bar", match_default, make_array(2, 5, -2, -2)); + TEST_REGEX_SEARCH("(?!foo)bar", perl, "barfoo", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?!foo)bar", perl, "bar??", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?!foo)bar", perl, "bar", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a\\Z", perl, "a\nb", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("()", perl, "abc", match_default, make_array(0, 0, 0, 0, -2, 1, 1, 1, 1, -2, 2, 2, 
2, 2, -2, 3, 3, 3, 3, -2, -2)); + TEST_REGEX_SEARCH("^()", perl, "abc", match_default, make_array(0, 0, 0, 0, -2, -2)); + TEST_REGEX_SEARCH("^()+", perl, "abc", match_default, make_array(0, 0, 0, 0, -2, -2)); + TEST_REGEX_SEARCH("^(){1}", perl, "abc", match_default, make_array(0, 0, 0, 0, -2, -2)); + TEST_REGEX_SEARCH("^(){2}", perl, "abc", match_default, make_array(0, 0, 0, 0, -2, -2)); + TEST_REGEX_SEARCH("^((){2})", perl, "abc", match_default, make_array(0, 0, 0, 0, 0, 0, -2, -2)); + TEST_REGEX_SEARCH("()", perl, "", match_default, make_array(0, 0, 0, 0, -2, -2)); + TEST_REGEX_SEARCH("()\\1", perl, "", match_default, make_array(0, 0, 0, 0, -2, -2)); + TEST_REGEX_SEARCH("()\\1", perl, "a", match_default, make_array(0, 0, 0, 0, -2, 1, 1, 1, 1, -2, -2)); + TEST_REGEX_SEARCH("a()\\1b", perl, "ab", match_default, make_array(0, 2, 1, 1, -2, -2)); + TEST_REGEX_SEARCH("a()b\\1", perl, "ab", match_default, make_array(0, 2, 1, 1, -2, -2)); + + // + // the strings in the next test case are too long for most compilers to cope with, + // we have to break them up and call the testing procs directly rather than rely on the macros: + // + static const char* big_text = "00001 01 \r\n00002 02 1 2 3 4 5 6" + "7 8 9 0\r\n00003 03 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890\r\n" + "00004 04 \r\n00005 05 \r\n00006 06 " + "Seite: 0001\r\n00007 07 " + "StartSeitEEnde: 0001\r\n00008 08 " + "StartSeiTe Ende: 0001\r\n00009 09 " + "Start seiteEnde: 0001\r\n00010 10 " + "28.2.03\r\n00011 11 " + "Page: 0001\r\n00012 12 " + "Juhu die Erste: 0001\r\n00013 13 " + "Es war einmal! 
0001\r\n00014 14 ABCDEFGHIJKLMNOPQRSTUVWXYZ0001\r\n" + "00015 15 abcdefghijklmnopqrstuvwxyz0001\r\n" + "00016 16 lars.schmeiser@gft.com\r\n00017 17 \r\n" + "00018 18 \r\n00019 19 \r\n00020 20 \r\n00021 21 1 2 3 4 5 " + "6 7 8 9 0\r\n" + "00022 22 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890\r\n" + "00023 01 \r\n00024 02 1 2 3 4 5 6 7 8 9 0\r\n" + "00025 03 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890\r\n" + "00026 04 \r\n00027 05 \r\n00028 06 " + "Seite: 0002\r\n00029 07 StartSeitEEnde: 0002\r\n" + "00030 08 " + "StartSeiTe Ende: 0002\r\n00031 09 " + "Start seiteEnde: 0002\r\n00032 10 " + "28.02.2003\r\n00033 11 " + "Page: 0002\r\n00034 12 " + "Juhu die Erste: 0002\r\n00035 13 " + "Es war einmal! 0002\r\n00036 14 ABCDEFGHIJKLMNOPQRSTUVWXYZ0002\r\n00037 " + "15 abcdefghijklmnopqrstuvwxyz0002\r\n00038 16 " + "lars.schmeiser@194.1.12.111\r\n00039 17 \r\n00040 18 \r\n00041 19 \r\n" + "00042 20 \r\n00043 21 1 2 3 4 5 6 7 8 9 0\r\n"; + + do{ + test_info::set_info(__FILE__, __LINE__, + "(.*\\r\\n){3}.* abcdefghijklmnopqrstuvwxyz.*\\r\\n", + perl, big_text, match_default|match_not_dot_newline, + make_array(753, 1076, 934, 1005, -2, 2143, 2466, 2324, 2395, -2, -2)); + test(char(0), test_regex_search_tag()); + }while(0); +#ifndef BOOST_NO_WREGEX + do{ + std::string st(big_text); + test_info::set_info(__FILE__, __LINE__, + L"(.*\\r\\n){3}.* abcdefghijklmnopqrstuvwxyz.*\\r\\n", + perl, std::wstring(st.begin(), st.end()), match_default|match_not_dot_newline, + make_array(753, 1076, 934, 1005, -2, 2143, 2466, 2324, 2395, -2, -2)); + test(char(0), test_regex_search_tag()); + }while(0); +#endif +} + +void test_grep() +{ + // + // now test grep, + // basically check all our restart types - line, word, etc + // checking each one for null and non-null matches. 
+ // + using namespace boost::regex_constants; + TEST_REGEX_SEARCH("a", perl|nosubs, " a a a aa", match_default, make_array(1, 2, -2, 3, 4, -2, 5, 6, -2, 7, 8, -2, 8, 9, -2, -2)); + TEST_REGEX_SEARCH("a+b+", perl|nosubs, "aabaabbb ab", match_default, make_array(0, 3, -2, 3, 8, -2, 9, 11, -2, -2)); + TEST_REGEX_SEARCH("a(b*|c|e)d", perl|nosubs, "adabbdacd", match_default, make_array(0, 2, -2, 2, 6, -2, 6, 9, -2, -2)); + TEST_REGEX_SEARCH("a", perl|nosubs, "\na\na\na\naa", match_default, make_array(1, 2, -2, 3, 4, -2, 5, 6, -2, 7, 8, -2, 8, 9, -2, -2)); + TEST_REGEX_SEARCH("^", perl|nosubs, " \n\n \n\n\n", match_default, make_array(0, 0, -2, 4, 4, -2, 5, 5, -2, 8, 8, -2, 9, 9, -2, 10, 10, -2, -2)); + TEST_REGEX_SEARCH("^ab", perl|nosubs, "ab \nab ab\n", match_default, make_array(0, 2, -2, 5, 7, -2, -2)); + TEST_REGEX_SEARCH("^[^\\n]*\n", perl|nosubs, " \n \n\n \n", match_default, make_array(0, 4, -2, 4, 7, -2, 7, 8, -2, 8, 11, -2, -2)); + TEST_REGEX_SEARCH("\\", "<123><><><>"); + TEST_REGEX_REPLACE("[[:digit:]]*", perl, "123ab1", match_default|format_no_copy, "<$0>", "<123><><><1>"); + // and now escapes: + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "$x", "$x"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\a", "\a"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\f", "\f"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\n", "\n"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\r", "\r"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\t", "\t"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\x21", "!"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\x{21}", "!"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\c@", "\0"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", 
match_default|format_no_copy, "\\e", "\x1B"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_no_copy, "\\0101", "A"); + // sed format sequences: + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "\\0", "aabb"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "\\1", "aa"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "\\2", "bb"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "&", "aabb"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "$", "$"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "$1", "$1"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "()?:", "()?:"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "\\\\", "\\"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_sed|format_no_copy, "\\&", "&"); + + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "$0", "aabb"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "$1", "aa"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "$2", "bb"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "$&", "aabb"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "&", "&"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "\\0", "\0"); + TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "()?:", "()?:"); + TEST_REGEX_REPLACE("a+", perl, "...aaa,,", match_default|format_perl|format_no_copy, "\\0101", 
"A"); + + // move to copying unmatched data: + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_all, "bbb", "...bbb,,,"); + TEST_REGEX_REPLACE("a+(b+)", perl, "...aaabb,,,", match_default|format_all, "$1", "...bb,,,"); + TEST_REGEX_REPLACE("a+(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "$1", "...bb,,,b*bbb?"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "(?1A)(?2B)", "...AB,,,AB*AB?"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "?1A:B", "...AB,,,AB*AB?"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "(?1A:B)C", "...ACBC,,,ACBC*ACBC?"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all, "?1:B", "...B,,,B*B?"); + // move to copying unmatched data, but replace first occurance only: + TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_all|format_first_only, "bbb", "...bbb,,,"); + TEST_REGEX_REPLACE("a+(b+)", perl, "...aaabb,,,", match_default|format_all|format_first_only, "$1", "...bb,,,"); + TEST_REGEX_REPLACE("a+(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all|format_first_only, "$1", "...bb,,,ab*abbb?"); + TEST_REGEX_REPLACE("(a+)|(b+)", perl, "...aaabb,,,ab*abbb?", match_default|format_all|format_first_only, "(?1A)(?2B)", "...Abb,,,ab*abbb?"); +} + +void test_non_greedy_repeats() +{ + // + // non-greedy repeats added 21/04/00 + // + using namespace boost::regex_constants; + TEST_REGEX_SEARCH("a*?", perl, "aa", match_default, make_array(0, 0, -2, 0, 1, -2, 1, 1, -2, 1, 2, -2, 2, 2, -2, -2)); + TEST_REGEX_SEARCH("^a*?$", perl, "aa", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("^.*?$", perl, "aa", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("^(a)*?$", perl, "aa", match_default, make_array(0, 2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("^[ab]*?$", perl, "aa", match_default, make_array(0, 2, -2, -2)); + 
TEST_REGEX_SEARCH("a??", perl, "aa", match_default, make_array(0, 0, -2, 0, 1, -2, 1, 1, -2, 1, 2, -2, 2, 2, -2, -2)); + TEST_REGEX_SEARCH("a+?", perl, "aa", match_default, make_array(0, 1, -2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH("a{1,3}?", perl, "aaa", match_default, make_array(0, 1, -2, 1, 2, -2, 2, 3, -2, -2)); + TEST_REGEX_SEARCH("\\w+?w", perl, "...ccccccwcccccw", match_default, make_array(3, 10, -2, 10, 16, -2, -2)); + TEST_REGEX_SEARCH("\\W+\\w+?w", perl, "...ccccccwcccccw", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH("abc|\\w+?", perl, "abd", match_default, make_array(0, 1, -2, 1, 2, -2, 2, 3, -2, -2)); + TEST_REGEX_SEARCH("abc|\\w+?", perl, "abcd", match_default, make_array(0, 3, -2, 3, 4, -2, -2)); + TEST_REGEX_SEARCH("<\\s*tag[^>]*>(.*?)<\\s*/tag\\s*>", perl, " <tag>here is some text</tag> <tag></tag>", match_default, make_array(1, 29, 6, 23, -2, 30, 41, 35, 35, -2, -2)); + TEST_REGEX_SEARCH("<\\s*tag[^>]*>(.*?)<\\s*/tag\\s*>", perl, " < tag attr=\"something\">here is some text< /tag > <tag></tag>", match_default, make_array(1, 49, 24, 41, -2, 50, 61, 55, 55, -2, -2)); + TEST_INVALID_REGEX("a{1,3}{1}", perl); + TEST_INVALID_REGEX("a**", perl); + TEST_INVALID_REGEX("a++", perl); +} + +void test_non_marking_paren() +{ + using namespace boost::regex_constants; + // + // non-marking parenthesis added 25/04/00 + // + TEST_REGEX_SEARCH("(?:abc)+", perl, "xxabcabcxx", match_default, make_array(2, 8, -2, -2)); + TEST_REGEX_SEARCH("(?:a+)(b+)", perl, "xaaabbbx", match_default, make_array(1, 7, 4, 7, -2, -2)); + TEST_REGEX_SEARCH("(a+)(?:b+)", perl, "xaaabbba", match_default, make_array(1, 7, 1, 4, -2, -2)); + TEST_REGEX_SEARCH("(?:(a+)b+)", perl, "xaaabbba", match_default, make_array(1, 7, 1, 4, -2, -2)); + TEST_REGEX_SEARCH("(?:a+(b+))", perl, "xaaabbba", match_default, make_array(1, 7, 4, 7, -2, -2)); + TEST_REGEX_SEARCH("a+(?#b+)b+", perl, "xaaabbba", match_default, make_array(1, 7, -2, -2)); + TEST_REGEX_SEARCH("(a)(?:b|$)", perl, "ab", match_default, make_array(0, 2, 0, 
1, -2, -2)); + TEST_REGEX_SEARCH("(a)(?:b|$)", perl, "a", match_default, make_array(0, 1, 0, 1, -2, -2)); +} + +void test_partial_match() +{ + using namespace boost::regex_constants; + // + // try some partial matches: + // + TEST_REGEX_SEARCH("(xyz)(.*)abc", perl, "xyzaaab", match_default|match_partial, make_array(0, 7, -2, -2)); + TEST_REGEX_SEARCH("(xyz)(.*)abc", perl, "xyz", match_default|match_partial, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("(xyz)(.*)abc", perl, "xy", match_default|match_partial, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("(xyz)(.*)abc", perl, "x", match_default|match_partial, make_array(0, 1, -2, -2)); + TEST_REGEX_SEARCH("(xyz)(.*)abc", perl, "", match_default|match_partial, make_array(-2, -2)); + TEST_REGEX_SEARCH("(xyz)(.*)abc", perl, "aaaa", match_default|match_partial, make_array(-2, -2)); + TEST_REGEX_SEARCH(".abc", perl, "aaab", match_default|match_partial, make_array(1, 4, -2, -2)); + TEST_REGEX_SEARCH("a[_]", perl, "xxa", match_default|match_partial, make_array(2, 3, -2, -2)); +} + +void test_forward_lookahead_asserts() +{ + // + // forward lookahead asserts added 21/01/02 + // + using namespace boost::regex_constants; + TEST_REGEX_SEARCH("((?:(?!a|b)\\w)+)(\\w+)", perl, "  xxxabaxxx ", match_default, make_array(2, 11, 2, 5, 5, 11, -2, -2)); + TEST_REGEX_SEARCH("/\\*(?:(?!\\*/).)*\\*/", perl, "  /**/ ", match_default, make_array(2, 6, -2, -2)); + TEST_REGEX_SEARCH("/\\*(?:(?!\\*/).)*\\*/", perl, "  /***/ ", match_default, make_array(2, 7, -2, -2)); + TEST_REGEX_SEARCH("/\\*(?:(?!\\*/).)*\\*/", perl, "  /********/ ", match_default, make_array(2, 12, -2, -2)); + TEST_REGEX_SEARCH("/\\*(?:(?!\\*/).)*\\*/", perl, "  /* comment */ ", match_default, make_array(2, 15, -2, -2)); + TEST_REGEX_SEARCH("<\\s*a[^>]*>((?:(?!<\\s*/\\s*a\\s*>).)*)<\\s*/\\s*a\\s*>", perl, " <a href=\"here\">here</a> ", match_default, make_array(1, 24, 16, 20, -2, -2)); + TEST_REGEX_SEARCH("<\\s*a[^>]*>((?:(?!<\\s*/\\s*a\\s*>).)*)<\\s*/\\s*a\\s*>", perl, " <a href=\"here\">here< /  a > ", 
match_default, make_array(1, 28, 16, 20, -2, -2)); + TEST_REGEX_SEARCH("<\\s*a[^>]*>((?:(?!<\\s*/\\s*a\\s*>).)*)(?=<\\s*/\\s*a\\s*>)", perl, " <a href=\"here\">here</a> ", match_default, make_array(1, 20, 16, 20, -2, -2)); + TEST_REGEX_SEARCH("<\\s*a[^>]*>((?:(?!<\\s*/\\s*a\\s*>).)*)(?=<\\s*/\\s*a\\s*>)", perl, " <a href=\"here\">here< / a > ", match_default, make_array(1, 20, 16, 20, -2, -2)); + TEST_REGEX_SEARCH("^(?!^(?:PRN|AUX|CLOCK\\$|NUL|CON|COM\\d|LPT\\d|\\..*)(?:\\..+)?$)[^\\x00-\\x1f\\\\?*:\"|/]+$", perl, "command.com", match_default, make_array(0, 11, -2, -2)); + TEST_REGEX_SEARCH("^(?!^(?:PRN|AUX|CLOCK\\$|NUL|CON|COM\\d|LPT\\d|\\..*)(?:\\..+)?$)[^\\x00-\\x1f\\\\?*:\"|/]+$", perl, "PRN", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?!^(?:PRN|AUX|CLOCK\\$|NUL|CON|COM\\d|LPT\\d|\\..*)(?:\\..+)?$)[^\\x00-\\x1f\\\\?*:\"|/]+$", perl, "COM2", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?=.*\\d).{4,8}$", perl, "abc3", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("^(?=.*\\d).{4,8}$", perl, "abc3def4", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("^(?=.*\\d).{4,8}$", perl, "ab2", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?=.*\\d).{4,8}$", perl, "abcdefg", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?=.*\\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$", perl, "abc3", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^(?=.*\\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$", perl, "abC3", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("^(?=.*\\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$", perl, "ABCD3", match_default, make_array(-2, -2)); +} + +void test_fast_repeats() +{ + using namespace boost::regex_constants; + // extended repeat checking to exercise new algorithms: + TEST_REGEX_SEARCH("ab.*xy", perl, "abxy_", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.*xy", perl, "ab_xy_", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab.*xy", perl, "abxy", match_default, make_array(0, 4, -2, -2)); + 
TEST_REGEX_SEARCH("ab.*xy", perl, "ab_xy", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab.*", perl, "ab", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab.*", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab__xy_", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab____xy_", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab_____xy_", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab__xy", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab_____xy", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}", perl, "ab_______", match_default, make_array(0, 7, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab______xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab_xy", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("ab.*?xy", perl, "abxy_", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.*?xy", perl, "ab_xy_", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab.*?xy", perl, "abxy", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.*?xy", perl, "ab_xy", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab.*?", perl, "ab", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab.*?", perl, "ab__", match_default, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab__xy_", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab____xy_", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab_____xy_", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab__xy", match_default, make_array(0, 6, -2, -2)); + 
TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab_____xy", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?", perl, "ab_______", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab______xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab_xy", match_default, make_array(-2, -2)); + + // again but with slower algorithm variant: + TEST_REGEX_SEARCH("ab.*xy", perl, "abxy_", match_not_dot_newline|match_not_dot_null, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.*xy", perl, "ab_xy_", match_not_dot_newline|match_not_dot_null, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab.*xy", perl, "abxy", match_not_dot_newline|match_not_dot_null, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.*xy", perl, "ab_xy", match_not_dot_newline|match_not_dot_null, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab.*", perl, "ab", match_not_dot_newline|match_not_dot_null, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab.*", perl, "ab__", match_not_dot_newline|match_not_dot_null, make_array(0, 4, -2, -2)); + + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab__xy_", match_not_dot_newline|match_not_dot_null, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab____xy_", match_not_dot_newline|match_not_dot_null, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab_____xy_", match_not_dot_newline|match_not_dot_null, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab__xy", match_not_dot_newline|match_not_dot_null, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab_____xy", match_not_dot_newline|match_not_dot_null, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}", perl, "ab__", match_not_dot_newline|match_not_dot_null, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}", perl, "ab_______", match_not_dot_newline|match_not_dot_null, 
make_array(0, 7, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab______xy", match_not_dot_newline|match_not_dot_null, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab_xy", match_not_dot_newline|match_not_dot_null, make_array(-2, -2)); + + TEST_REGEX_SEARCH("ab.*?xy", perl, "abxy_", match_not_dot_newline|match_not_dot_null, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.*?xy", perl, "ab_xy_", match_not_dot_newline|match_not_dot_null, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab.*?xy", perl, "abxy", match_not_dot_newline|match_not_dot_null, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.*?xy", perl, "ab_xy", match_not_dot_newline|match_not_dot_null, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab.*?", perl, "ab", match_not_dot_newline|match_not_dot_null, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab.*?", perl, "ab__", match_not_dot_newline|match_not_dot_null, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab__xy_", match_not_dot_newline|match_not_dot_null, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab____xy_", match_not_dot_newline|match_not_dot_null, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab_____xy_", match_not_dot_newline|match_not_dot_null, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab__xy", match_not_dot_newline|match_not_dot_null, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab_____xy", match_not_dot_newline|match_not_dot_null, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?", perl, "ab__", match_not_dot_newline|match_not_dot_null, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?", perl, "ab_______", match_not_dot_newline|match_not_dot_null, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}?xy", perl, "ab______xy", match_not_dot_newline|match_not_dot_null, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab.{2,5}xy", perl, "ab_xy", match_not_dot_newline|match_not_dot_null, 
make_array(-2, -2)); + + // now again for single character repeats: + TEST_REGEX_SEARCH("ab_*xy", perl, "abxy_", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab_*xy", perl, "ab_xy_", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab_*xy", perl, "abxy", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab_*xy", perl, "ab_xy", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab_*", perl, "ab", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab_*", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + + TEST_REGEX_SEARCH("ab_{2,5}xy", perl, "ab__xy_", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}xy", perl, "ab____xy_", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}xy", perl, "ab_____xy_", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}xy", perl, "ab__xy", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}xy", perl, "ab_____xy", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}", perl, "ab_______", match_default, make_array(0, 7, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}xy", perl, "ab______xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}xy", perl, "ab_xy", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("ab_*?xy", perl, "abxy_", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab_*?xy", perl, "ab_xy_", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab_*?xy", perl, "abxy", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab_*?xy", perl, "ab_xy", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab_*?", perl, "ab", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab_*?", perl, "ab__", match_default, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("ab_{2,5}?xy", perl, "ab__xy_", 
match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}?xy", perl, "ab____xy_", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}?xy", perl, "ab_____xy_", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}?xy", perl, "ab__xy", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}?xy", perl, "ab_____xy", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}?", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}?", perl, "ab_______", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}?xy", perl, "ab______xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab_{2,5}xy", perl, "ab_xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(5*?).somesite", perl, "//555.somesite", match_default, make_array(2, 14, 2, 5, -2, -2)); + + // and again for sets: + TEST_REGEX_SEARCH("ab[_,;]*xy", perl, "abxy_", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*xy", perl, "ab_xy_", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*xy", perl, "abxy", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*xy", perl, "ab_xy", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*", perl, "ab", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + + TEST_REGEX_SEARCH("ab[_,;]{2,5}xy", perl, "ab__xy_", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}xy", perl, "ab____xy_", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}xy", perl, "ab_____xy_", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}xy", perl, "ab__xy", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}xy", perl, "ab_____xy", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}", perl, 
"ab__", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}", perl, "ab_______", match_default, make_array(0, 7, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}xy", perl, "ab______xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}xy", perl, "ab_xy", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("ab[_,;]*?xy", perl, "abxy_", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*?xy", perl, "ab_xy_", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*?xy", perl, "abxy", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*?xy", perl, "ab_xy", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*?", perl, "ab", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]*?", perl, "ab__", match_default, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("ab[_,;]{2,5}?xy", perl, "ab__xy_", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}?xy", perl, "ab____xy_", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}?xy", perl, "ab_____xy_", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}?xy", perl, "ab__xy", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}?xy", perl, "ab_____xy", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}?", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}?", perl, "ab_______", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}?xy", perl, "ab______xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab[_,;]{2,5}xy", perl, "ab_xy", match_default, make_array(-2, -2)); + + // and again for tricky sets with digraphs: + TEST_REGEX_SEARCH("ab[_[.ae.]]*xy", perl, "abxy_", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*xy", perl, "ab_xy_", match_default, make_array(0, 5, -2, -2)); + 
TEST_REGEX_SEARCH("ab[_[.ae.]]*xy", perl, "abxy", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*xy", perl, "ab_xy", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*", perl, "ab", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}xy", perl, "ab__xy_", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}xy", perl, "ab____xy_", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}xy", perl, "ab_____xy_", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}xy", perl, "ab__xy", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}xy", perl, "ab_____xy", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}", perl, "ab_______", match_default, make_array(0, 7, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}xy", perl, "ab______xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}xy", perl, "ab_xy", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("ab[_[.ae.]]*?xy", perl, "abxy_", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*?xy", perl, "ab_xy_", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*?xy", perl, "abxy", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*?xy", perl, "ab_xy", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*?", perl, "ab", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]*?", perl, "ab__", match_default, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}?xy", perl, "ab__xy_", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}?xy", perl, 
"ab____xy_", match_default, make_array(0, 8, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}?xy", perl, "ab_____xy_", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}?xy", perl, "ab__xy", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}?xy", perl, "ab_____xy", match_default, make_array(0, 9, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}?", perl, "ab__", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}?", perl, "ab_______", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}?xy", perl, "ab______xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("ab[_[.ae.]]{2,5}xy", perl, "ab_xy", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("([5[.ae.]]*?).somesite", perl, "//555.somesite", match_default, make_array(2, 14, 2, 5, -2, -2)); + + //TEST_REGEX_SEARCH("", perl, "", match_default, make_array(-2, -2)); #if 0 -; collating elements and rewritten set code: -- match_default normal REG_EXTENDED REG_STARTEND -[[.zero.]] 0 0 1 -[[.one.]] 1 0 1 -[[.two.]] 2 0 1 -[[.three.]] 3 0 1 -[[.a.]] baa 1 2 -[[.right-curly-bracket.]] } 0 1 -[[.NUL.]] \0 0 1 -[[:<:]z] ! -[a[:>:]] ! -[[=a=]] a 0 1 -[[=right-curly-bracket=]] } 0 1 -- match_default normal REG_EXTENDED REG_STARTEND REG_ICASE -[[.A.]] A 0 1 -[[.A.]] a 0 1 -[[.A.]-b]+ AaBb 0 4 -[A-[.b.]]+ AaBb 0 4 -[[.a.]-B]+ AaBb 0 4 -[a-[.B.]]+ AaBb 0 4 -- match_default normal REG_EXTENDED REG_NO_POSIX_TEST -[\x61] a 0 1 -[\x61-c]+ abcd 0 3 -[a-\x63]+ abcd 0 3 -- match_default normal REG_EXTENDED REG_STARTEND -[[.a.]-c]+ abcd 0 3 -[a-[.c.]]+ abcd 0 3 -[[:alpha:]-a] ! -[a-[:alpha:]] ! 
- -; try mutli-character ligatures: -[[.ae.]] ae 0 2 -[[.ae.]] aE -1 -1 -[[.AE.]] AE 0 2 -[[.Ae.]] Ae 0 2 -[[.ae.]-b] a -1 -1 -[[.ae.]-b] b 0 1 -[[.ae.]-b] ae 0 2 -[a-[.ae.]] a 0 1 -[a-[.ae.]] b -1 -1 -[a-[.ae.]] ae 0 2 -- match_default normal REG_EXTENDED REG_STARTEND REG_ICASE -[[.ae.]] AE 0 2 -[[.ae.]] Ae 0 2 -[[.AE.]] Ae 0 2 -[[.Ae.]] aE 0 2 -[[.AE.]-B] a -1 -1 -[[.Ae.]-b] b 0 1 -[[.Ae.]-b] B 0 1 -[[.ae.]-b] AE 0 2 - -- match_default normal REG_EXTENDED REG_STARTEND -;extended perl style escape sequences: -\e \27 0 1 -\x1b \27 0 1 -\x{1b} \27 0 1 -\x{} ! -\x{ ! -\x} ! -\x ! -\x{yy ! -\x{1b ! - -- match_default normal REG_EXTENDED REG_STARTEND REG_NO_POSIX_TEST -\l+ ABabcAB 2 5 -[\l]+ ABabcAB 2 5 -[a-\l] ! -[\l-a] ! -[\L] ! -\L+ abABCab 2 5 -\u+ abABCab 2 5 -[\u]+ abABCab 2 5 -[\U] ! -\U+ ABabcAB 2 5 -\d+ ab012ab 2 5 -[\d]+ ab012ab 2 5 -[\D] ! -\D+ 01abc01 2 5 -\s+ "ab ab" 2 5 -[\s]+ "ab ab" 2 5 -[\S] ! -\S+ " abc " 2 5 -- match_default normal REG_EXTENDED REG_STARTEND -\Qabc ! -\Qabc\E abcd 0 3 -\Qabc\Ed abcde 0 4 -\Q+*?\\E +*?\\ 0 4 - -\C+ abcde 0 5 -\X+ abcde 0 5 - -- match_default normal REG_EXTENDED REG_STARTEND REG_UNICODE_ONLY -\X+ a\768\769 0 3 -\X+ \2309\2307 0 2 ;DEVANAGARI script -\X+ \2489\2494 0 2 ;BENGALI script - -- match_default normal REG_EXTENDED REG_STARTEND -\Aabc abc 0 3 -\Aabc aabc -1 -1 -abc\z abc 0 3 -abc\z abcd -1 -1 -abc\Z abc\n\n 0 3 -abc\Z abc 0 3 - - -\Gabc abc 0 3 -\Gabc dabcd -1 -1 -a\Gbc abc -1 -1 -a\Aab abc -1 -1 - -; -; now test grep, -; basically check all our restart types - line, word, etc -; checking each one for null and non-null matches. 
-; -- match_default normal REG_EXTENDED REG_STARTEND REG_GREP -a " a a a aa" 1 2 3 4 5 6 7 8 8 9 -a+b+ "aabaabbb ab" 0 3 3 8 9 11 -a(b*|c|e)d adabbdacd 0 2 2 6 6 9 -a "\na\na\na\naa" 1 2 3 4 5 6 7 8 8 9 - -^ " \n\n \n\n\n" 0 0 4 4 5 5 8 8 9 9 10 10 -^ab "ab \nab ab\n" 0 2 5 7 -^[^\n]*\n " \n \n\n \n" 0 4 4 7 7 8 8 11 -\ <123><><><> -[[:digit:]]* 123ab1 <$0> <123><><><1> - -; and now escapes: -a+ "...aaa,,," $x "$x" -a+ "...aaa,,," \a "\a" -a+ "...aaa,,," \f "\f" -a+ "...aaa,,," \n "\n" -a+ "...aaa,,," \r "\r" -a+ "...aaa,,," \t "\t" -a+ "...aaa,,," \v "\v" - -a+ "...aaa,,," \x21 "!" -a+ "...aaa,,," \x{21} "!" -a+ "...aaa,,," \c@ \0 -a+ "...aaa,,," \e \27 -a+ "...aaa,,," \0101 A - -- match_default normal REG_EXTENDED REG_STARTEND REG_MERGE format_sed format_no_copy -(a+)(b+) ...aabb,, \0 aabb -(a+)(b+) ...aabb,, \1 aa -(a+)(b+) ...aabb,, \2 bb -(a+)(b+) ...aabb,, & aabb -(a+)(b+) ...aabb,, $ $ -(a+)(b+) ...aabb,, $1 $1 -(a+)(b+) ...aabb,, ()?: ()?: -(a+)(b+) ...aabb,, \\ \\ -(a+)(b+) ...aabb,, \& & - - -- match_default normal REG_EXTENDED REG_STARTEND REG_MERGE format_perl format_no_copy -(a+)(b+) ...aabb,, $0 aabb -(a+)(b+) ...aabb,, $1 aa -(a+)(b+) ...aabb,, $2 bb -(a+)(b+) ...aabb,, $& aabb -(a+)(b+) ...aabb,, & & -(a+)(b+) ...aabb,, \0 \0 -(a+)(b+) ...aabb,, ()?: ()?: -a+ "...aaa,,," \0101 A - -- match_default format_all normal REG_EXTENDED REG_STARTEND REG_MERGE -; move to copying unmatched data: -a+ "...aaa,,," bbb "...bbb,,," -a+(b+) "...aaabb,,," $1 "...bb,,," -a+(b+) "...aaabb,,,ab*abbb?" $1 "...bb,,,b*bbb?" - -(a+)|(b+) "...aaabb,,,ab*abbb?" (?1A)(?2B) "...AB,,,AB*AB?" -(a+)|(b+) "...aaabb,,,ab*abbb?" ?1A:B "...AB,,,AB*AB?" -(a+)|(b+) "...aaabb,,,ab*abbb?" (?1A:B)C "...ACBC,,,ACBC*ACBC?" -(a+)|(b+) "...aaabb,,,ab*abbb?" ?1:B "...B,,,B*B?" 
- -- match_default format_all normal REG_EXTENDED REG_STARTEND REG_MERGE format_first_only -; move to copying unmatched data, but replace first occurance only: -a+ "...aaa,,," bbb "...bbb,,," -a+(b+) "...aaabb,,," $1 "...bb,,," -a+(b+) "...aaabb,,,ab*abbb?" $1 "...bb,,,ab*abbb?" -(a+)|(b+) "...aaabb,,,ab*abbb?" (?1A)(?2B) "...Abb,,,ab*abbb?" - -; -; changes to newline handling with 2.11: -; - -- match_default normal REG_EXTENDED REG_STARTEND REG_GREP - -^. " \n \r\n " 0 1 3 4 7 8 -.$ " \n \r\n " 1 2 4 5 8 9 - -- match_default normal REG_EXTENDED REG_STARTEND REG_GREP REG_UNICODE_ONLY -^. " \8232 \8233 " 0 1 3 4 5 6 -.$ " \8232 \8233 " 1 2 3 4 6 7 - -; -; non-greedy repeats added 21/04/00 -- match_default normal REG_EXTENDED REG_PERL -a** ! -a*? aa 0 0 -^a*?$ aa 0 2 -^.*?$ aa 0 2 -^(?:a)*?$ aa 0 2 -^[ab]*?$ aa 0 2 -a?? aa 0 0 -a++ ! -a+? aa 0 1 -a{1,3}{1} ! -a{1,3}? aaa 0 1 -\w+?w ...ccccccwcccccw 3 10 -\W+\w+?w ...ccccccwcccccw 0 10 -abc|\w+? abd 0 1 -abc|\w+? abcd 0 3 -<\s*tag[^>]*>(.*?)<\s*/tag\s*> " here is some text " 1 29 6 23 -<\s*tag[^>]*>(.*?)<\s*/tag\s*> " < tag attr=\"something\">here is some text< /tag > " 1 49 24 41 - -; -; non-marking parenthesis added 25/04/00 -- match_default normal REG_EXTENDED REG_PERL -(?:abc)+ xxabcabcxx 2 8 -(?:a+)(b+) xaaabbbx 1 7 4 7 -(a+)(?:b+) xaaabbba 1 7 1 4 -(?:(a+)b+) xaaabbba 1 7 1 4 -(?:a+(b+)) xaaabbba 1 7 4 7 -a+(?#b+)b+ xaaabbba 1 7 -(a)(?:b|$) ab 0 2 0 1 -(a)(?:b|$) a 0 1 0 1 - - -; -; try some partial matches: -- match_partial match_default normal REG_EXTENDED REG_NO_POSIX_TEST REG_PARTIAL_MATCH REG_PERL -(xyz)(.*)abc xyzaaab 0 7 -(xyz)(.*)abc xyz 0 3 -(xyz)(.*)abc xy 0 2 -(xyz)(.*)abc x 0 1 -(xyz)(.*)abc "" -1 -1 -(xyz)(.*)abc aaaa -1 -1 -.abc aaab 1 4 -a[_] xxa 2 3 - -; -; forward lookahead asserts added 21/01/02 -- match_default normal REG_EXTENDED REG_NO_POSIX_TEST REG_PERL -((?:(?!a|b)\w)+)(\w+) " xxxabaxxx " 2 11 2 5 5 11 - -/\*(?:(?!\*/).)*\*/ " /**/ " 2 6 -/\*(?:(?!\*/).)*\*/ " /***/ " 2 7 
-/\*(?:(?!\*/).)*\*/ " /********/ " 2 12 -/\*(?:(?!\*/).)*\*/ " /* comment */ " 2 15 - -<\s*a[^>]*>((?:(?!<\s*/\s*a\s*>).)*)<\s*/\s*a\s*> " here " 1 24 16 20 -<\s*a[^>]*>((?:(?!<\s*/\s*a\s*>).)*)<\s*/\s*a\s*> " here< / a > " 1 28 16 20 - -<\s*a[^>]*>((?:(?!<\s*/\s*a\s*>).)*)(?=<\s*/\s*a\s*>) " here " 1 20 16 20 -<\s*a[^>]*>((?:(?!<\s*/\s*a\s*>).)*)(?=<\s*/\s*a\s*>) " here< / a > " 1 20 16 20 - -; filename matching: -^(?!^(?:PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(?:\..+)?$)[^\x00-\x1f\\?*:\"|/]+$ command.com 0 11 -^(?!^(?:PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(?:\..+)?$)[^\x00-\x1f\\?*:\"|/]+$ PRN -1 -1 -^(?!^(?:PRN|AUX|CLOCK\$|NUL|CON|COM\d|LPT\d|\..*)(?:\..+)?$)[^\x00-\x1f\\?*:\"|/]+$ COM2 -1 -1 - -; password checking: -^(?=.*\d).{4,8}$ abc3 0 4 -^(?=.*\d).{4,8}$ abc3def4 0 8 -^(?=.*\d).{4,8}$ ab2 -1 -1 -^(?=.*\d).{4,8}$ abcdefg -1 -1 -^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$ abc3 -1 -1 -^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$ abC3 0 4 -^(?=.*\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$ ABCD3 -1 -1 - -; extended repeat checking to exercise new algorithms: -ab.*xy abxy_ 0 4 -ab.*xy ab_xy_ 0 5 -ab.*xy abxy 0 4 -ab.*xy ab_xy 0 5 -ab.* ab 0 2 -ab.* ab__ 0 4 - -ab.{2,5}xy ab__xy_ 0 6 -ab.{2,5}xy ab____xy_ 0 8 -ab.{2,5}xy ab_____xy_ 0 9 -ab.{2,5}xy ab__xy 0 6 -ab.{2,5}xy ab_____xy 0 9 -ab.{2,5} ab__ 0 4 -ab.{2,5} ab_______ 0 7 -ab.{2,5}xy ab______xy -1 -1 -ab.{2,5}xy ab_xy -1 -1 - -ab.*?xy abxy_ 0 4 -ab.*?xy ab_xy_ 0 5 -ab.*?xy abxy 0 4 -ab.*?xy ab_xy 0 5 -ab.*? ab 0 2 -ab.*? ab__ 0 2 - -ab.{2,5}?xy ab__xy_ 0 6 -ab.{2,5}?xy ab____xy_ 0 8 -ab.{2,5}?xy ab_____xy_ 0 9 -ab.{2,5}?xy ab__xy 0 6 -ab.{2,5}?xy ab_____xy 0 9 -ab.{2,5}? ab__ 0 4 -ab.{2,5}? 
ab_______ 0 4 -ab.{2,5}?xy ab______xy -1 -1 -ab.{2,5}xy ab_xy -1 -1 - -; again but with slower algorithm variant: -- match_default normal REG_PERL REG_NO_POSIX_TEST match_not_dot_newline match_not_dot_null -ab.*xy abxy_ 0 4 -ab.*xy ab_xy_ 0 5 -ab.*xy abxy 0 4 -ab.*xy ab_xy 0 5 -ab.* ab 0 2 -ab.* ab__ 0 4 - -ab.{2,5}xy ab__xy_ 0 6 -ab.{2,5}xy ab____xy_ 0 8 -ab.{2,5}xy ab_____xy_ 0 9 -ab.{2,5}xy ab__xy 0 6 -ab.{2,5}xy ab_____xy 0 9 -ab.{2,5} ab__ 0 4 -ab.{2,5} ab_______ 0 7 -ab.{2,5}xy ab______xy -1 -1 -ab.{2,5}xy ab_xy -1 -1 - -ab.*?xy abxy_ 0 4 -ab.*?xy ab_xy_ 0 5 -ab.*?xy abxy 0 4 -ab.*?xy ab_xy 0 5 -ab.*? ab 0 2 -ab.*? ab__ 0 2 - -ab.{2,5}?xy ab__xy_ 0 6 -ab.{2,5}?xy ab____xy_ 0 8 -ab.{2,5}?xy ab_____xy_ 0 9 -ab.{2,5}?xy ab__xy 0 6 -ab.{2,5}?xy ab_____xy 0 9 -ab.{2,5}? ab__ 0 4 -ab.{2,5}? ab_______ 0 4 -ab.{2,5}?xy ab______xy -1 -1 -ab.{2,5}xy ab_xy -1 -1 -(.*?).somesite \n\n555.somesite 2 14 2 5 - -; now again for single character repeats: - -ab_*xy abxy_ 0 4 -ab_*xy ab_xy_ 0 5 -ab_*xy abxy 0 4 -ab_*xy ab_xy 0 5 -ab_* ab 0 2 -ab_* ab__ 0 4 - -ab_{2,5}xy ab__xy_ 0 6 -ab_{2,5}xy ab____xy_ 0 8 -ab_{2,5}xy ab_____xy_ 0 9 -ab_{2,5}xy ab__xy 0 6 -ab_{2,5}xy ab_____xy 0 9 -ab_{2,5} ab__ 0 4 -ab_{2,5} ab_______ 0 7 -ab_{2,5}xy ab______xy -1 -1 -ab_{2,5}xy ab_xy -1 -1 - -ab_*?xy abxy_ 0 4 -ab_*?xy ab_xy_ 0 5 -ab_*?xy abxy 0 4 -ab_*?xy ab_xy 0 5 -ab_*? ab 0 2 -ab_*? ab__ 0 2 - -ab_{2,5}?xy ab__xy_ 0 6 -ab_{2,5}?xy ab____xy_ 0 8 -ab_{2,5}?xy ab_____xy_ 0 9 -ab_{2,5}?xy ab__xy 0 6 -ab_{2,5}?xy ab_____xy 0 9 -ab_{2,5}? ab__ 0 4 -ab_{2,5}? 
ab_______ 0 4 -ab_{2,5}?xy ab______xy -1 -1 -ab_{2,5}xy ab_xy -1 -1 -(5*?).somesite //555.somesite 2 14 2 5 - -; and again for sets: -ab[_,;]*xy abxy_ 0 4 -ab[_,;]*xy ab_xy_ 0 5 -ab[_,;]*xy abxy 0 4 -ab[_,;]*xy ab_xy 0 5 -ab[_,;]* ab 0 2 -ab[_,;]* ab__ 0 4 - -ab[_,;]{2,5}xy ab__xy_ 0 6 -ab[_,;]{2,5}xy ab____xy_ 0 8 -ab[_,;]{2,5}xy ab_____xy_ 0 9 -ab[_,;]{2,5}xy ab__xy 0 6 -ab[_,;]{2,5}xy ab_____xy 0 9 -ab[_,;]{2,5} ab__ 0 4 -ab[_,;]{2,5} ab_______ 0 7 -ab[_,;]{2,5}xy ab______xy -1 -1 -ab[_,;]{2,5}xy ab_xy -1 -1 - -ab[_,;]*?xy abxy_ 0 4 -ab[_,;]*?xy ab_xy_ 0 5 -ab[_,;]*?xy abxy 0 4 -ab[_,;]*?xy ab_xy 0 5 -ab[_,;]*? ab 0 2 -ab[_,;]*? ab__ 0 2 - -ab[_,;]{2,5}?xy ab__xy_ 0 6 -ab[_,;]{2,5}?xy ab____xy_ 0 8 -ab[_,;]{2,5}?xy ab_____xy_ 0 9 -ab[_,;]{2,5}?xy ab__xy 0 6 -ab[_,;]{2,5}?xy ab_____xy 0 9 -ab[_,;]{2,5}? ab__ 0 4 -ab[_,;]{2,5}? ab_______ 0 4 -ab[_,;]{2,5}?xy ab______xy -1 -1 -ab[_,;]{2,5}xy ab_xy -1 -1 -(\d*?).somesite //555.somesite 2 14 2 5 - -; and again for tricky sets with digraphs: -ab[_[.ae.]]*xy abxy_ 0 4 -ab[_[.ae.]]*xy ab_xy_ 0 5 -ab[_[.ae.]]*xy abxy 0 4 -ab[_[.ae.]]*xy ab_xy 0 5 -ab[_[.ae.]]* ab 0 2 -ab[_[.ae.]]* ab__ 0 4 - -ab[_[.ae.]]{2,5}xy ab__xy_ 0 6 -ab[_[.ae.]]{2,5}xy ab____xy_ 0 8 -ab[_[.ae.]]{2,5}xy ab_____xy_ 0 9 -ab[_[.ae.]]{2,5}xy ab__xy 0 6 -ab[_[.ae.]]{2,5}xy ab_____xy 0 9 -ab[_[.ae.]]{2,5} ab__ 0 4 -ab[_[.ae.]]{2,5} ab_______ 0 7 -ab[_[.ae.]]{2,5}xy ab______xy -1 -1 -ab[_[.ae.]]{2,5}xy ab_xy -1 -1 - -ab[_[.ae.]]*?xy abxy_ 0 4 -ab[_[.ae.]]*?xy ab_xy_ 0 5 -ab[_[.ae.]]*?xy abxy 0 4 -ab[_[.ae.]]*?xy ab_xy 0 5 -ab[_[.ae.]]*? ab 0 2 -ab[_[.ae.]]*? ab__ 0 2 - -ab[_[.ae.]]{2,5}?xy ab__xy_ 0 6 -ab[_[.ae.]]{2,5}?xy ab____xy_ 0 8 -ab[_[.ae.]]{2,5}?xy ab_____xy_ 0 9 -ab[_[.ae.]]{2,5}?xy ab__xy 0 6 -ab[_[.ae.]]{2,5}?xy ab_____xy 0 9 -ab[_[.ae.]]{2,5}? ab__ 0 4 -ab[_[.ae.]]{2,5}? 
ab_______ 0 4 -ab[_[.ae.]]{2,5}?xy ab______xy -1 -1 -ab[_[.ae.]]{2,5}xy ab_xy -1 -1 -([5[.ae.]]*?).somesite //555.somesite 2 14 2 5 - -; new bugs detected in spring 2003: -- normal match_continuous REG_NO_POSIX_TEST -b abc -1 -1 - -- normal REG_PERL -(?!foo)bar foobar 3 6 -(?!foo)bar "??bar" 2 5 -(?!foo)bar "barfoo" 0 3 -(?!foo)bar "bar??" 0 3 -(?!foo)bar bar 0 3 - -a\Z a\nb -1 -1 -() abc 0 0 0 0 -^() abc 0 0 0 0 -^()+ abc 0 0 0 0 -^(){1} abc 0 0 0 0 -^(){2} abc 0 0 0 0 -^((){2}) abc 0 0 0 0 0 0 -() "" 0 0 0 0 -()\1 "" 0 0 0 0 -()\1 a 0 0 0 0 -a()\1b ab 0 2 1 1 -a()b\1 ab 0 2 1 1 - -- normal match_not_dot_newline REG_NO_POSIX_TEST - -"(.*\r\n){3}.* abcdefghijklmnopqrstuvwxyz.*\r\n" "00001 01 \r\n00002 02 1 2 3 4 5 6 7 8 9 0\r\n00003 03 0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890\r\n00004 04 \r\n00005 05 \r\n00006 06 Seite: 0001\r\n00007 07 StartSeitEEnde: 0001\r\n00008 08 StartSeiTe Ende: 0001\r\n00009 09 Start seiteEnde: 0001\r\n00010 10 28.2.03\r\n00011 11 Page: 0001\r\n00012 12 Juhu die Erste: 0001\r\n00013 13 Es war einmal! 
0001\r\n00014 14 - + - normal REG_PERL ; new (?: construct ) diff --git a/test/regress/main.cpp b/test/regress/main.cpp index 5620a01b..215e991a 100644 --- a/test/regress/main.cpp +++ b/test/regress/main.cpp @@ -14,6 +14,14 @@ int cpp_main(int argc, char * argv[]) test_character_escapes(); test_assertion_escapes(); test_tricky_cases(); + test_tricky_cases2(); + test_grep(); + test_replace(); + test_non_greedy_repeats(); + test_non_marking_paren(); + test_partial_match(); + test_forward_lookahead_asserts(); + test_fast_repeats(); return error_count; } diff --git a/test/regress/test.hpp b/test/regress/test.hpp index 2b270cb8..b2e91622 100644 --- a/test/regress/test.hpp +++ b/test/regress/test.hpp @@ -4,6 +4,7 @@ #define BOOST_REGEX_REGRESS_TEST_HPP #include "test_not_regex.hpp" #include "test_regex_search.hpp" +#include "test_regex_replace.hpp" // @@ -81,6 +82,45 @@ const int* make_array(int first, ...); TEST_REGEX_SEARCH_N(s, f, t, m, a);\ TEST_REGEX_SEARCH_W(BOOST_JOIN(L, s), f, BOOST_JOIN(L, t), m, a) +// +// define macros for testing regex replaces: +// +#define TEST_REGEX_REPLACE_N(s, f, t, m, fs, r)\ + do{\ + const char e[] = { s };\ + std::string se(e, sizeof(e) - 1);\ + const char st[] = { t };\ + std::string sst(st, sizeof(st) - 1);\ + const char ft[] = { fs };\ + std::string sft(ft, sizeof(ft) - 1);\ + const char rt[] = { r };\ + std::string srt(rt, sizeof(rt) - 1);\ + test_info::set_info(__FILE__, __LINE__, se, f, sst, m, 0, sft, srt);\ + test(char(0), test_regex_replace_tag());\ + }while(0) + +#ifndef BOOST_NO_WREGEX +#define TEST_REGEX_REPLACE_W(s, f, t, m, fs, r)\ + do{\ + const wchar_t e[] = { s };\ + std::wstring se(e, (sizeof(e) / sizeof(wchar_t)) - 1);\ + const wchar_t st[] = { t };\ + std::wstring sst(st, (sizeof(st) / sizeof(wchar_t)) - 1);\ + const wchar_t ft[] = { fs };\ + std::wstring sft(ft, (sizeof(ft) / sizeof(wchar_t)) - 1);\ + const wchar_t rt[] = { r };\ + std::wstring srt(rt, (sizeof(rt) / sizeof(wchar_t)) - 1);\ + 
test_info::set_info(__FILE__, __LINE__, se, f, sst, m, 0, sft, srt);\ + test(wchar_t(0), test_regex_replace_tag());\ + }while(0) +#else +#define TEST_REGEX_REPLACE_W(s, f, t, m, fs, r) +#endif + +#define TEST_REGEX_REPLACE(s, f, t, m, fs, r)\ + TEST_REGEX_REPLACE_N(s, f, t, m, fs, r);\ + TEST_REGEX_REPLACE_W(BOOST_JOIN(L, s), f, BOOST_JOIN(L, t), m, BOOST_JOIN(L, fs), BOOST_JOIN(L, r)) + // // define the test group proceedures: // @@ -92,6 +132,13 @@ void test_backrefs(); void test_character_escapes(); void test_assertion_escapes(); void test_tricky_cases(); - +void test_grep(); +void test_replace(); +void test_non_greedy_repeats(); +void test_non_marking_paren(); +void test_partial_match(); +void test_forward_lookahead_asserts(); +void test_fast_repeats(); +void test_tricky_cases2(); #endif diff --git a/test/regress/test_regex_search.hpp b/test/regress/test_regex_search.hpp index 7e70f011..4246c9c7 100644 --- a/test/regress/test_regex_search.hpp +++ b/test/regress/test_regex_search.hpp @@ -16,7 +16,10 @@ void test_sub_match(const boost::sub_match& sub, Bidirect #pragma warning(disable:4244) #endif typedef typename boost::sub_match::value_type charT; - if(sub.matched == 0) + if((sub.matched == 0) + && + !((i == 0) + && (test_info::match_options() & boost::match_partial)) ) { if(answer_table[2*i] >= 0) { @@ -80,6 +83,101 @@ void test_simple_search(boost::basic_regex& r) } } +template +void test_regex_iterator(boost::basic_regex& r) +{ + typedef typename std::basic_string::const_iterator const_iterator; + typedef boost::regex_iterator test_iterator; + const std::basic_string& search_text = test_info::search_text(); + boost::regex_constants::match_flag_type opts = test_info::match_options(); + const int* answer_table = test_info::answer_table(); + test_iterator start(search_text.begin(), search_text.end(), r, opts), end; + while(start != end) + { + test_result(*start, search_text.begin(), answer_table); + ++start; + // move on the answer table to next set of answers; 
+ while(*answer_table++ != -2){} + } + if(answer_table[0] >= 0) + { + // we should have had a match but didn't: + BOOST_REGEX_TEST_ERROR("Expected match was not found.", charT); + } +} + +template +struct grep_test_predicate +{ + typedef typename std::basic_string::const_iterator test_iter; + + grep_test_predicate(test_iter b, const int* a) + : m_base(b), m_table(a) + {} + bool operator()(const boost::match_results& what) + { + test_result(what, m_base, m_table); + // move on the answer table to next set of answers; + while(*m_table++ != -2){} + return true; + } +private: + test_iter m_base; + const int* m_table; +}; + +template +void test_regex_grep(boost::basic_regex& r) +{ + typedef typename std::basic_string::const_iterator const_iterator; + const std::basic_string& search_text = test_info::search_text(); + boost::regex_constants::match_flag_type opts = test_info::match_options(); + const int* answer_table = test_info::answer_table(); + grep_test_predicate pred(search_text.begin(), answer_table); + boost::regex_grep(pred, search_text, r, opts); +} + +template +void test_regex_match(boost::basic_regex& r) +{ + typedef typename std::basic_string::const_iterator const_iterator; + const std::basic_string& search_text = test_info::search_text(); + boost::regex_constants::match_flag_type opts = test_info::match_options(); + const int* answer_table = test_info::answer_table(); + boost::match_results what; + if(answer_table[0] < 0) + { + if(boost::regex_match(search_text, r, opts)) + { + BOOST_REGEX_TEST_ERROR("boost::regex_match found a match when it should not have done so.", charT); + } + } + else + { + if((answer_table[0] > 0) && boost::regex_match(search_text, r, opts)) + { + BOOST_REGEX_TEST_ERROR("boost::regex_match found a match when it should not have done so.", charT); + } + else if((answer_table[0] == 0) && (answer_table[1] == search_text.size())) + { + if(boost::regex_match( + search_text.begin(), + search_text.end(), + what, + r, + opts)) + { + 
test_result(what, search_text.begin(), answer_table); + } + else if(answer_table[0] >= 0) + { + // we should have had a match but didn't: + BOOST_REGEX_TEST_ERROR("Expected match was not found.", charT); + } + } + } +} + template void test(boost::basic_regex& r, const test_regex_search_tag&) { @@ -88,6 +186,9 @@ void test(boost::basic_regex& r, const test_regex_search_tag&) try{ r.assign(expression, syntax_options); test_simple_search(r); + test_regex_iterator(r); + test_regex_grep(r); + test_regex_match(r); } catch(const boost::bad_expression& e) {