From d2c3ec6d5767f855ae424f33fc31206b1f34eb3c Mon Sep 17 00:00:00 2001 From: John Maddock Date: Sat, 1 May 2004 11:23:02 +0000 Subject: [PATCH] Almost complete regex implementation now... [SVN r22718] --- example/jgrep/jgrep.cpp | 2 + example/jgrep/jgrep.h | 10 +- example/jgrep/main.cpp | 5 +- example/timer/regex_timer.cpp | 2 +- include/boost/regex/config.hpp | 5 +- include/boost/regex/static_mutex.hpp | 2 +- .../boost/regex/v4/basic_regex_creator.hpp | 217 ++++++++++++--- include/boost/regex/v4/basic_regex_parser.hpp | 51 +++- include/boost/regex/v4/cpp_regex_traits.hpp | 75 ++++- include/boost/regex/v4/perl_matcher.hpp | 1 + .../boost/regex/v4/perl_matcher_common.hpp | 13 +- .../regex/v4/perl_matcher_non_recursive.hpp | 9 +- .../boost/regex/v4/perl_matcher_recursive.hpp | 3 +- .../boost/regex/v4}/primary_transform.hpp | 39 ++- include/boost/regex/v4/regex.hpp | 3 - include/boost/regex/v4/regex_traits.hpp | 8 + .../boost/regex/v4/regex_traits_defaults.hpp | 15 +- include/boost/regex/v4/states.hpp | 4 +- src/regex.cpp | 4 + src/static_mutex.cpp | 4 +- test/regress/basic_tests.cpp | 261 ++++++++++-------- test/regress/info.hpp | 2 + test/regress/main.cpp | 2 + test/regress/test.hpp | 2 + test/regress/test_regex_replace.hpp | 54 ++++ test/regress/test_regex_search.hpp | 4 +- 26 files changed, 587 insertions(+), 210 deletions(-) rename {src => include/boost/regex/v4}/primary_transform.hpp (74%) create mode 100644 test/regress/test_regex_replace.hpp diff --git a/example/jgrep/jgrep.cpp b/example/jgrep/jgrep.cpp index a0bc19c2..9acf48e4 100644 --- a/example/jgrep/jgrep.cpp +++ b/example/jgrep/jgrep.cpp @@ -53,6 +53,8 @@ public: ogrep_predicate(unsigned int& i, const char* p, iterator start, iterator end) : lines(i), filename(p), last_line(-1), end_of_storage(end), last_line_start(start) {} ogrep_predicate(const ogrep_predicate& o) : lines(o.lines), filename(o.filename), last_line(o.last_line), end_of_storage(o.end_of_storage), last_line_start(o.last_line_start) {} bool operator () (const boost::match_results& i); +private: + void operator=(const ogrep_predicate&); }; // ideally we'd ignor the allocator type and use a template member function diff --git a/example/jgrep/jgrep.h b/example/jgrep/jgrep.h index c7919edc..c9e6ac3b 100644 --- a/example/jgrep/jgrep.h +++ b/example/jgrep/jgrep.h @@ -19,15 +19,7 @@ #include -// case sensitive reg_expression determines our allocator type: -typedef boost::reg_expression re_type; -typedef re_type::allocator_type allocator_type; - -// now declare static (global) data, including an allocator -// instance which we'll pass to all instances that require an allocator. - -extern allocator_type a; - +typedef boost::basic_regex re_type; extern re_type e; // flags for output: diff --git a/example/jgrep/main.cpp b/example/jgrep/main.cpp index 207ae140..b5d11f42 100644 --- a/example/jgrep/main.cpp +++ b/example/jgrep/main.cpp @@ -46,10 +46,7 @@ using std::endl; #include #endif -allocator_type a; - -re_type e(a); -//rei_type ei(a); +re_type e; // flags for output: diff --git a/example/timer/regex_timer.cpp b/example/timer/regex_timer.cpp index 5e92cf7f..67484858 100644 --- a/example/timer/regex_timer.cpp +++ b/example/timer/regex_timer.cpp @@ -145,7 +145,7 @@ int main(int argc, char**argv) double tim; bool result; int iters = 100; - double wait_time = std::min(t.elapsed_min() * 1000, 1.0); + double wait_time = (std::min)(t.elapsed_min() * 1000, 1.0); while(true) { diff --git a/include/boost/regex/config.hpp b/include/boost/regex/config.hpp index 24683fea..bcea0fb2 100644 --- a/include/boost/regex/config.hpp +++ b/include/boost/regex/config.hpp @@ -174,6 +174,7 @@ using std::distance; # ifdef BOOST_MSVC // warning suppression with VC6: # pragma warning(disable: 4800) +# pragma warning(disable: 4786) # endif # define BOOST_REGEX_MAKE_BOOL(x) static_cast(x) #endif @@ -367,12 +368,14 @@ BOOST_REGEX_DECL void BOOST_REGEX_CALL reset_stack_guard_page(); namespace boost{ namespace re_detail{ +BOOST_REGEX_DECL void BOOST_REGEX_CALL raise_runtime_error(const std::runtime_error& ex); + template void raise_error(const traits& t, unsigned code) { (void)t; // warning suppression std::runtime_error e(t.error_string(code)); - throw_exception(e); + ::boost::re_detail::raise_runtime_error(e); } } diff --git a/include/boost/regex/static_mutex.hpp b/include/boost/regex/static_mutex.hpp index 9ea0e7ca..109de9f2 100644 --- a/include/boost/regex/static_mutex.hpp +++ b/include/boost/regex/static_mutex.hpp @@ -89,7 +89,7 @@ class static_mutex { public: typedef scoped_static_mutex_lock scoped_lock; - volatile boost::int32_t m_mutex; + boost::int32_t m_mutex; }; #define BOOST_STATIC_MUTEX_INIT { 0, } diff --git a/include/boost/regex/v4/basic_regex_creator.hpp b/include/boost/regex/v4/basic_regex_creator.hpp index 649c7009..0a22b180 100644 --- a/include/boost/regex/v4/basic_regex_creator.hpp +++ b/include/boost/regex/v4/basic_regex_creator.hpp @@ -198,6 +198,7 @@ protected: re_syntax_base* m_last_state; // the last state we added bool m_icase; // true for case insensitive matches unsigned m_repeater_id; // the id of the next repeater + bool m_has_backrefs; // true if there are actually any backrefs unsigned m_backrefs; // bitmask of permitted backrefs boost::uintmax_t m_bad_repeats; // bitmask of repeats we can't deduce a startmap for; typename traits::char_class_type m_word_mask; // mask used to determine if a character is a word character @@ -211,17 +212,19 @@ private: void fixup_pointers(re_syntax_base* state); void create_startmaps(re_syntax_base* state); - void create_startmap(re_syntax_base* state, unsigned char* map, unsigned int* pnull, unsigned char mask); + int calculate_backstep(re_syntax_base* state); + void create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask); unsigned get_restart_type(re_syntax_base* state); void set_all_masks(unsigned char* bits, unsigned char); bool is_bad_repeat(re_syntax_base* pt); void set_bad_repeat(re_syntax_base* pt); syntax_element_type get_repeat_type(re_syntax_base* state); + void probe_leading_repeat(re_syntax_base* state); }; template basic_regex_creator::basic_regex_creator(regex_data* data) - : m_pdata(data), m_traits(data->m_traits), m_last_state(0), m_repeater_id(0), m_backrefs(0) + : m_pdata(data), m_traits(data->m_traits), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0) { m_pdata->m_data.clear(); static const charT w = 'w'; @@ -244,6 +247,9 @@ basic_regex_creator::basic_regex_creator(regex_data re_syntax_base* basic_regex_creator::append_state(syntax_element_type t, std::size_t s) { + // if the state is a backref then make a note of it: + if(t == syntax_element_backref) + this->m_has_backrefs = true; // append a new state, start by aligning our last one: m_pdata->m_data.align(); // set the offset to the next state in our last one: @@ -538,7 +544,7 @@ re_syntax_base* basic_regex_creator::append_set( return 0; // invalid or unsupported equivalence class for(unsigned i = 0; i < (1u << CHAR_BIT); ++i) { - charT c(i); + charT c(static_cast(i)); string_type s2 = this->m_traits.transform_primary(&c, &c+1); if(s == s2) result->_map[i] = true; @@ -585,6 +591,8 @@ void basic_regex_creator::finalize(const charT* p1, const charT* create_startmap(m_pdata->m_first_state, m_pdata->m_startmap, &(m_pdata->m_can_be_null), mask_all); // get the restart type: m_pdata->m_restart_type = get_restart_type(m_pdata->m_first_state); + // optimise a leading repeat if there is one: + probe_leading_repeat(m_pdata->m_first_state); } template @@ -645,6 +653,11 @@ void basic_regex_creator::create_startmaps(re_syntax_base* state) // adjust the type of the state to allow for faster matching: state->type = this->get_repeat_type(state); return; + case syntax_element_backstep: + // we need to calculate how big the backstep is: + static_cast(state)->index + = this->calculate_backstep(state->next.p); + // fall through: default: state = state->next.p; } @@ -652,7 +665,65 @@ void basic_regex_creator::create_startmaps(re_syntax_base* state) } template -void basic_regex_creator::create_startmap(re_syntax_base* state, unsigned char* map, unsigned int* pnull, unsigned char mask) +int basic_regex_creator::calculate_backstep(re_syntax_base* state) +{ + typedef typename traits::char_class_type mask_type; + int result = 0; + while(state) + { + switch(state->type) + { + case syntax_element_startmark: + if((static_cast(state)->index == -1) + || (static_cast(state)->index == -2)) + { + state = static_cast(state->next.p)->alt.p->next.p; + continue; + } + else if(static_cast(state)->index == -3) + { + state = state->next.p->next.p; + continue; + } + break; + case syntax_element_endmark: + if((static_cast(state)->index == -1) + || (static_cast(state)->index == -2)) + return result; + case syntax_element_literal: + result += static_cast(state)->length; + break; + case syntax_element_wild: + case syntax_element_set: + result += 1; + break; + case syntax_element_backref: + case syntax_element_rep: + case syntax_element_combining: + case syntax_element_dot_rep: + case syntax_element_char_rep: + case syntax_element_short_set_rep: + case syntax_element_long_set_rep: + case syntax_element_backstep: + return -1; + case syntax_element_long_set: + if(static_cast*>(state)->singleton == 0) + return -1; + result += 1; + break; + case syntax_element_jump: + state = static_cast(state)->alt.p; + continue; + default: + break; + } + state = state->next.p; + } + return -1; +} + +template +void basic_regex_creator::create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask) { int not_last_jump = 1; while(state) @@ -661,16 +732,16 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, { case syntax_element_literal: { - // don't set anything in *pnull, set each element in map + // don't set anything in *pnull, set each element in l_map // that could match the first character in the literal: - if(map) + if(l_map) { - map[0] |= mask_init; + l_map[0] |= mask_init; charT first_char = *static_cast(static_cast(static_cast(state) + 1)); for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) { if(m_traits.translate(static_cast(i), m_icase) == first_char) - map[i] |= mask; + l_map[i] |= mask; } } return; @@ -678,11 +749,11 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, case syntax_element_end_line: { // next character must be a line separator (if there is one): - if(map) + if(l_map) { - map[0] |= mask_init; - map['\n'] |= mask; - map['\r'] |= mask; + l_map[0] |= mask_init; + l_map['\n'] |= mask; + l_map['\r'] |= mask; } // now figure out if we can match a NULL string at this point: if(pnull) @@ -697,13 +768,13 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, case syntax_element_wild: { // can't be null, any character can match: - set_all_masks(map, mask); + set_all_masks(l_map, mask); return; } case syntax_element_match: { // must be null, any character can match: - set_all_masks(map, mask); + set_all_masks(l_map, mask); if(pnull) *pnull |= mask; return; @@ -711,14 +782,14 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, case syntax_element_word_start: { // recurse, then AND with all the word characters: - create_startmap(state->next.p, map, pnull, mask); - if(map) + create_startmap(state->next.p, l_map, pnull, mask); + if(l_map) { - map[0] |= mask_init; + l_map[0] |= mask_init; for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) { if(!m_traits.is_class(static_cast(i), m_word_mask)) - map[i] &= static_cast(~mask); + l_map[i] &= static_cast(~mask); } } return; @@ -726,14 +797,14 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, case syntax_element_word_end: { // recurse, then AND with all the word characters: - create_startmap(state->next.p, map, pnull, mask); - if(map) + create_startmap(state->next.p, l_map, pnull, mask); + if(l_map) { - map[0] |= mask_init; + l_map[0] |= mask_init; for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) { if(m_traits.is_class(static_cast(i), m_word_mask)) - map[i] &= static_cast(~mask); + l_map[i] &= static_cast(~mask); } } return; @@ -746,32 +817,32 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, return; } case syntax_element_long_set: - if(map) + if(l_map) { typedef typename traits::char_class_type mask_type; if(static_cast*>(state)->singleton) { - map[0] |= mask_init; + l_map[0] |= mask_init; for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) { charT c = static_cast(i); if(&c != re_is_set_member(&c, &c + 1, static_cast*>(state), *m_pdata)) - map[i] |= mask; + l_map[i] |= mask; } } else - set_all_masks(map, mask); + set_all_masks(l_map, mask); } return; case syntax_element_set: - if(map) + if(l_map) { - map[0] |= mask_init; + l_map[0] |= mask_init; for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i) { if(static_cast(state)->_map[ static_cast(m_traits.translate(static_cast(i), this->m_icase))]) - map[i] |= mask; + l_map[i] |= mask; } } return; @@ -790,14 +861,14 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, re_alt* rep = static_cast(state); if(rep->_map[0] & mask_init) { - if(map) + if(l_map) { // copy previous results: - map[0] |= mask_init; + l_map[0] |= mask_init; for(unsigned int i = 0; i <= UCHAR_MAX; ++i) { if(rep->_map[i] & mask_any) - map[i] |= mask; + l_map[i] |= mask; } } if(pnull) @@ -812,29 +883,53 @@ void basic_regex_creator::create_startmap(re_syntax_base* state, // so take the union of the two options: if(is_bad_repeat(state)) { - set_all_masks(map, mask); + set_all_masks(l_map, mask); return; } set_bad_repeat(state); - create_startmap(state->next.p, map, pnull, mask); - if((state->type == syntax_element_alt) + create_startmap(state->next.p, l_map, pnull, mask); + if((state->type == syntax_element_alt) || (static_cast(state)->min == 0) || (not_last_jump == 0)) - create_startmap(rep->alt.p, map, pnull, mask); + create_startmap(rep->alt.p, l_map, pnull, mask); } } return; case syntax_element_soft_buffer_end: // match newline or null: - if(map) + if(l_map) { - map[0] |= mask_init; - map['\n'] |= mask; - map['\r'] |= mask; + l_map[0] |= mask_init; + l_map['\n'] |= mask; + l_map['\r'] |= mask; } if(pnull) *pnull |= mask; return; + case syntax_element_endmark: + // need to handle independent subs as a special case: + if(static_cast(state)->index == -3) + { + // can be null, any character can match: + set_all_masks(l_map, mask); + if(pnull) + *pnull |= mask; + return; + } + else + { + state = state->next.p; + break; + } + + case syntax_element_startmark: + // need to handle independent subs as a special case: + if(static_cast(state)->index == -3) + { + state = state->next.p->next.p; + break; + } + // otherwise fall through: default: state = state->next.p; } @@ -962,6 +1057,48 @@ syntax_element_type basic_regex_creator::get_repeat_type(re_synta return state->type; } +template +void basic_regex_creator::probe_leading_repeat(re_syntax_base* state) +{ + // enumerate our states, and see if we have a leading repeat + // for which failed search restarts can be optimised; + do + { + switch(state->type) + { + case syntax_element_startmark: + if(static_cast(state)->index >= 0) + { + state = state->next.p; + continue; + } + return; + case syntax_element_endmark: + case syntax_element_start_line: + case syntax_element_end_line: + case syntax_element_word_boundary: + case syntax_element_within_word: + case syntax_element_word_start: + case syntax_element_word_end: + case syntax_element_buffer_start: + case syntax_element_buffer_end: + case syntax_element_restart_continue: + state = state->next.p; + break; + case syntax_element_dot_rep: + case syntax_element_char_rep: + case syntax_element_short_set_rep: + case syntax_element_long_set_rep: + if(this->m_has_backrefs == 0) + static_cast(state)->leading = true; + // fall through: + default: + return; + } + }while(state); +} + + } // namespace re_detail } // namespace boost diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index 19f2efbd..403facc2 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -249,16 +249,14 @@ bool basic_regex_parser::parse_open_paren() // if((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0) { - if(m_traits.syntax_type(*m_position) == regex_constants::syntax_question) + if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question) return parse_perl_extension(); } // // update our mark count, and append the required state: // - unsigned markid; - if(this->flags() & regbase::nosubs) - markid = 0; - else + unsigned markid = 0; + if(0 == (this->flags() & regbase::nosubs)) markid = ++m_mark_count; re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); pb->index = markid; @@ -1070,6 +1068,10 @@ bool basic_regex_parser::parse_backref() template bool basic_regex_parser::parse_QE() { +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable:4127) +#endif // // parse a \Q...\E sequence: // @@ -1104,6 +1106,9 @@ bool basic_regex_parser::parse_QE() ++start; } return true; +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif } template @@ -1114,7 +1119,7 @@ bool basic_regex_parser::parse_perl_extension() // // backup some state, and prepare the way: // - int markid; + int markid = 0; std::ptrdiff_t jump_offset = 0; re_brace* pb = static_cast(this->append_state(syntax_element_startmark, sizeof(re_brace))); std::ptrdiff_t last_paren_start = this->getoffset(pb); @@ -1157,6 +1162,35 @@ bool basic_regex_parser::parse_perl_extension() this->m_pdata->m_data.align(); m_alt_insert_point = this->m_pdata->m_data.size(); break; + case regex_constants::escape_type_left_word: + { + // a lookbehind assertion: + if(++m_position == m_end) + fail(REG_BADRPT, m_position - m_base); + regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position); + if(t == regex_constants::syntax_not) + pb->index = markid = -2; + else if(t == regex_constants::syntax_equal) + pb->index = markid = -1; + else + fail(REG_BADRPT, m_position - m_base); + ++m_position; + jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); + this->append_state(syntax_element_backstep, sizeof(re_brace)); + this->m_pdata->m_data.align(); + m_alt_insert_point = this->m_pdata->m_data.size(); + break; + } + case regex_constants::escape_type_right_word: + // + // an independent sub-expression: + // + pb->index = markid = -3; + ++m_position; + jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump))); + this->m_pdata->m_data.align(); + m_alt_insert_point = this->m_pdata->m_data.size(); + break; default: fail(REG_BADRPT, m_position - m_base); } @@ -1180,6 +1214,11 @@ bool basic_regex_parser::parse_perl_extension() this->m_pdata->m_data.align(); re_jump* jmp = static_cast(this->getaddress(jump_offset)); jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp); + if(this->m_last_state == jmp) + { + // Oops... we didn't have anything inside the assertion: + fail(REG_EMPTY, m_position - m_base); + } } // // append closing parenthesis state: diff --git a/include/boost/regex/v4/cpp_regex_traits.hpp b/include/boost/regex/v4/cpp_regex_traits.hpp index 7975f96f..8ba46de8 100644 --- a/include/boost/regex/v4/cpp_regex_traits.hpp +++ b/include/boost/regex/v4/cpp_regex_traits.hpp @@ -29,6 +29,15 @@ #include #endif +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_PREFIX +#endif + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable:4786) +#endif + namespace boost{ // @@ -58,8 +67,8 @@ public: const charT* getnext() { return this->gptr(); } protected: std::basic_streambuf* setbuf(char_type* s, streamsize n); - typename parser_buf::pos_type seekpos(pos_type sp, ::std::ios_base::openmode which); - typename parser_buf::pos_type seekoff(off_type off, ::std::ios_base::seekdir way, ::std::ios_base::openmode which); + //typename parser_buf::pos_type seekpos(pos_type sp, ::std::ios_base::openmode which); + //typename parser_buf::pos_type seekoff(off_type off, ::std::ios_base::seekdir way, ::std::ios_base::openmode which); private: parser_buf& operator=(const parser_buf&); parser_buf(const parser_buf&); @@ -73,6 +82,7 @@ parser_buf::setbuf(char_type* s, streamsize n) return this; } +#if 0 template typename parser_buf::pos_type parser_buf::seekoff(off_type off, ::std::ios_base::seekdir way, ::std::ios_base::openmode which) @@ -131,7 +141,7 @@ parser_buf::seekpos(pos_type sp, ::std::ios_base::openmode which) } return pos_type(off_type(-1)); } - +#endif // // class cpp_regex_traits_base: @@ -308,9 +318,25 @@ class cpp_regex_traits_implementation : public cpp_regex_traits_char_layer::char_class_type char_class_type; + typedef typename std::ctype::mask native_mask_type; BOOST_STATIC_CONSTANT(char_class_type, mask_blank = 1u << 16); BOOST_STATIC_CONSTANT(char_class_type, mask_word = 1u << 17); BOOST_STATIC_CONSTANT(char_class_type, mask_unicode = 1u << 18); +#ifdef __GNUC__ + BOOST_STATIC_CONSTANT(native_mask_type, + mask_base = + std::ctype::alnum + | std::ctype::alpha + | std::ctype::cntrl + | std::ctype::digit + | std::ctype::graph + | std::ctype::lower + | std::ctype::print + | std::ctype::punct + | std::ctype::space + | std::ctype::upper + | std::ctype::xdigit); +#else BOOST_STATIC_CONSTANT(char_class_type, mask_base = std::ctype::alnum @@ -324,6 +350,7 @@ public: | std::ctype::space | std::ctype::upper | std::ctype::xdigit); +#endif //BOOST_STATIC_ASSERT(0 == (mask_base & (mask_word | mask_unicode))); @@ -346,9 +373,9 @@ public: char_class_type result = lookup_classname_imp(p1, p2); if(result == 0) { - string_type s(p1, p2); - this->m_pctype->tolower(&*s.begin(), &*s.end()); - result = lookup_classname_imp(&*s.begin(), &*s.end()); + string_type temp(p1, p2); + this->m_pctype->tolower(&*temp.begin(), &*temp.begin() + temp.size()); + result = lookup_classname_imp(&*temp.begin(), &*temp.begin() + temp.size()); } return result; } @@ -388,20 +415,20 @@ typename cpp_regex_traits_implementation::string_type // the best we can do is translate to lower case, then get a regular sort key: { result.assign(p1, p2); - m_pctype->tolower(&*result.begin(), &*result.end()); - result = this->m_pcollate->transform(&*result.begin(), &*result.end()); + this->m_pctype->tolower(&*result.begin(), &*result.begin() + result.size()); + result = this->m_pcollate->transform(&*result.begin(), &*result.begin() + result.size()); break; } case sort_fixed: { // get a regular sort key, and then truncate it: - result.assign(this->m_pcollate->transform(&*result.begin(), &*result.end())); + result.assign(this->m_pcollate->transform(&*result.begin(), &*result.begin() + result.size())); result.erase(this->m_collate_delim); break; } case sort_delim: // get a regular sort key, and then truncate everything after the delim: - result.assign(this->m_pcollate->transform(&*result.begin(), &*result.end())); + result.assign(this->m_pcollate->transform(&*result.begin(), &*result.begin() + result.size())); std::size_t i; for(i = 0; i < result.size(); ++i) { @@ -425,10 +452,30 @@ typename cpp_regex_traits_implementation::string_type if(pos != m_custom_collate_names.end()) return pos->second; } +#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS std::string name(p1, p2); +#else + std::string name; + const charT* p0 = p1; + while(p0 != p2) + name.append(1, char(*p0++)); +#endif name = lookup_default_collate_name(name); +#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS if(name.size()) return string_type(name.begin(), name.end()); +#else + if(name.size()) + { + string_type result; + typedef std::string::const_iterator iter; + iter b = name.begin(); + iter e = name.end(); + while(b != e) + result.append(1, charT(*b++)); + return result; + } +#endif if(p2 - p1 == 1) return string_type(1, *p1); return string_type(); @@ -731,4 +778,12 @@ static_mutex& cpp_regex_traits::get_mutex_inst() } // boost +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_SUFFIX +#endif + #endif diff --git a/include/boost/regex/v4/perl_matcher.hpp b/include/boost/regex/v4/perl_matcher.hpp index 1576f037..41455a93 100644 --- a/include/boost/regex/v4/perl_matcher.hpp +++ b/include/boost/regex/v4/perl_matcher.hpp @@ -333,6 +333,7 @@ private: bool match_char_repeat(); bool match_dot_repeat_fast(); bool match_dot_repeat_slow(); + bool match_backstep(); bool backtrack_till_match(unsigned count); // find procs stored in s_find_vtable: diff --git a/include/boost/regex/v4/perl_matcher_common.hpp b/include/boost/regex/v4/perl_matcher_common.hpp index 48b8dc10..3e7f8eea 100644 --- a/include/boost/regex/v4/perl_matcher_common.hpp +++ b/include/boost/regex/v4/perl_matcher_common.hpp @@ -659,6 +659,17 @@ bool perl_matcher::match_restart_continue() return false; } +template +bool perl_matcher::match_backstep() +{ + std::ptrdiff_t maxlen = std::distance(search_base, position); + if(maxlen < static_cast(pstate)->index) + return false; + std::advance(position, -static_cast(pstate)->index); + pstate = pstate->next.p; + return true; +} + template bool perl_matcher::find_restart_any() { @@ -737,7 +748,7 @@ bool perl_matcher::find_restart_line() return true; while(position != last) { - while((position != last) && (*position != '\n')) + while((position != last) && !is_separator(*position)) ++position; if(position == last) return false; diff --git a/include/boost/regex/v4/perl_matcher_non_recursive.hpp b/include/boost/regex/v4/perl_matcher_non_recursive.hpp index ced7a8d8..4f6d45dc 100644 --- a/include/boost/regex/v4/perl_matcher_non_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_non_recursive.hpp @@ -113,7 +113,7 @@ struct saved_single_repeat : public saved_state template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[26] = + static matcher_proc_type const s_match_vtable[27] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -141,6 +141,7 @@ bool perl_matcher::match_all_states() &perl_matcher::match_char_repeat, &perl_matcher::match_set_repeat, &perl_matcher::match_long_set_repeat, + &perl_matcher::match_backstep, }; push_recursion_stopper(); @@ -717,8 +718,9 @@ bool perl_matcher::match_long_set_repeat() #ifdef __BORLANDC__ #pragma option push -w-8008 -w-8066 -w-8004 #endif + typedef typename traits::char_class_type mask_type; const re_repeat* rep = static_cast(pstate); - const re_set_long* set = static_cast*>(pstate->next.p); + const re_set_long* set = static_cast*>(pstate->next.p); std::size_t count = 0; // // start by working out how much we can skip: @@ -1207,6 +1209,7 @@ bool perl_matcher::unwind_short_set_repeat(bool template bool perl_matcher::unwind_long_set_repeat(bool r) { + typedef typename traits::char_class_type mask_type; saved_single_repeat* pmp = static_cast*>(m_backup_state); // if we have a match, just discard this state: @@ -1219,7 +1222,7 @@ bool perl_matcher::unwind_long_set_repeat(bool const re_repeat* rep = pmp->rep; std::size_t count = pmp->count; pstate = rep->next.p; - const re_set_long* set = static_cast*>(pstate); + const re_set_long* set = static_cast*>(pstate); position = pmp->last_position; assert(rep->type == syntax_element_long_set_rep); diff --git a/include/boost/regex/v4/perl_matcher_recursive.hpp b/include/boost/regex/v4/perl_matcher_recursive.hpp index 586bb973..4ce6a75b 100644 --- a/include/boost/regex/v4/perl_matcher_recursive.hpp +++ b/include/boost/regex/v4/perl_matcher_recursive.hpp @@ -48,7 +48,7 @@ public: template bool perl_matcher::match_all_states() { - static matcher_proc_type const s_match_vtable[26] = + static matcher_proc_type const s_match_vtable[27] = { (&perl_matcher::match_startmark), &perl_matcher::match_endmark, @@ -76,6 +76,7 @@ bool perl_matcher::match_all_states() &perl_matcher::match_char_repeat, &perl_matcher::match_set_repeat, &perl_matcher::match_long_set_repeat, + &perl_matcher::match_backstep, }; if(state_count > max_state_count) diff --git a/src/primary_transform.hpp b/include/boost/regex/v4/primary_transform.hpp similarity index 74% rename from src/primary_transform.hpp rename to include/boost/regex/v4/primary_transform.hpp index a7381b8b..f9a0bcb1 100644 --- a/src/primary_transform.hpp +++ b/include/boost/regex/v4/primary_transform.hpp @@ -17,6 +17,13 @@ * by the current locale. */ +#ifndef BOOST_REGEX_PRIMARY_TRANSFORM +#define BOOST_REGEX_PRIMARY_TRANSFORM + +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_PREFIX +#endif + namespace boost{ namespace re_detail{ @@ -31,6 +38,12 @@ enum{ template unsigned count_chars(const S& s, charT c) { + // + // Count how many occurances of character c occur + // in string s: if c is a delimeter between collation + // fields, then this should be the same value for all + // sort keys: + // unsigned int count = 0; for(unsigned pos = 0; pos < s.size(); ++pos) { @@ -53,20 +66,17 @@ unsigned find_sort_syntax(const traits* pt, charT* delim) // Suppress incorrect warning for MSVC (void)pt; - string_type a(1, (char_type)'a'); - string_type sa; - pt->transform(sa, a); + char_type a[2] = {'a', '\0', }; + string_type sa(pt->transform(a, a+1)); if(sa == a) { *delim = 0; return sort_C; } - string_type A(1, (char_type)'A'); - string_type sA; - pt->transform(sA, A); - string_type c(1, (char_type)';'); - string_type sc; - pt->transform(sc, c); + char_type A[2] = { 'A', '\0', }; + string_type sA(pt->transform(A, A+1)); + char_type c[2] = { ';', '\0', }; + string_type sc(pt->transform(c, c+1)); int pos = 0; while((pos <= static_cast(sa.size())) && (pos <= static_cast(sA.size())) && (sa[pos] == sA[pos])) ++pos; @@ -77,11 +87,11 @@ unsigned find_sort_syntax(const traits* pt, charT* delim) return sort_unknown; } // - // at this point sa[pos] is either the end of a fixed with field + // at this point sa[pos] is either the end of a fixed width field // or the character that acts as a delimiter: // charT maybe_delim = sa[pos]; - if((pos != 0) && (count_chars(sa, maybe_delim) == count_chars(sA, maybe_delim)) && (count_chars(sa, maybe_delim) == count_chars(c, maybe_delim))) + if((pos != 0) && (count_chars(sa, maybe_delim) == count_chars(sA, maybe_delim)) && (count_chars(sa, maybe_delim) == count_chars(sc, maybe_delim))) { *delim = maybe_delim; return sort_delim; @@ -89,7 +99,7 @@ unsigned find_sort_syntax(const traits* pt, charT* delim) // // OK doen't look like a delimiter, try for fixed width field: // - if((sa.size() == sA.size()) && (sa.size() == c.size())) + if((sa.size() == sA.size()) && (sa.size() == sc.size())) { // note assumes that the fixed width field is less than // numeric_limits::max(), should be true for all types @@ -108,6 +118,11 @@ unsigned find_sort_syntax(const traits* pt, charT* delim) } // namespace re_detail } // namespace boost +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_SUFFIX +#endif + +#endif diff --git a/include/boost/regex/v4/regex.hpp b/include/boost/regex/v4/regex.hpp index 7a9aeb45..3f19fed3 100644 --- a/include/boost/regex/v4/regex.hpp +++ b/include/boost/regex/v4/regex.hpp @@ -41,9 +41,6 @@ #ifndef BOOST_REGEX_FWD_HPP #include #endif -#ifndef BOOST_REGEX_STACK_HPP -#include -#endif #ifndef BOOST_REGEX_RAW_BUFFER_HPP #include #endif diff --git a/include/boost/regex/v4/regex_traits.hpp b/include/boost/regex/v4/regex_traits.hpp index 5643ef1e..93b7b678 100644 --- a/include/boost/regex/v4/regex_traits.hpp +++ b/include/boost/regex/v4/regex_traits.hpp @@ -35,6 +35,10 @@ #include #endif +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_PREFIX +#endif + namespace boost{ template @@ -45,5 +49,9 @@ struct regex_traits : public implementationT } // namespace boost +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_SUFFIX +#endif + #endif // include diff --git a/include/boost/regex/v4/regex_traits_defaults.hpp b/include/boost/regex/v4/regex_traits_defaults.hpp index 5e1d6cea..0409f6bd 100644 --- a/include/boost/regex/v4/regex_traits_defaults.hpp +++ b/include/boost/regex/v4/regex_traits_defaults.hpp @@ -19,6 +19,10 @@ #ifndef BOOST_REGEX_TRAITS_DEFAULTS_HPP_INCLUDED #define BOOST_REGEX_TRAITS_DEFAULTS_HPP_INCLUDED +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_PREFIX +#endif + namespace boost{ namespace re_detail{ BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants::syntax_type n); @@ -77,6 +81,11 @@ inline bool is_separator(charT c) { return BOOST_REGEX_MAKE_BOOL((c == '\n') || (c == '\r') || (static_cast(c) == 0x2028) || (static_cast(c) == 0x2029)); } +template <> +inline bool is_separator(char c) +{ + return BOOST_REGEX_MAKE_BOOL((c == '\n') || (c == '\r')); +} // // get a default collating element: @@ -99,7 +108,7 @@ struct character_pointer_range } bool operator == (const character_pointer_range& r)const { - return (std::distance(p1, p2) == std::distance(r.p1, r.p2)) && std::equal(p1, p2, r.p1); + return ((p2 - p1) == (r.p2 - r.p1)) && std::equal(p1, p2, r.p1); } }; template @@ -183,4 +192,8 @@ int parse_value(const charT*& p1, const charT* p2, const traits& traits_inst, in } // re_detail } // boost +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_SUFFIX +#endif + #endif diff --git a/include/boost/regex/v4/states.hpp b/include/boost/regex/v4/states.hpp index 09d89b32..92842a7e 100644 --- a/include/boost/regex/v4/states.hpp +++ b/include/boost/regex/v4/states.hpp @@ -106,7 +106,9 @@ enum syntax_element_type syntax_element_dot_rep = syntax_element_restart_continue + 1, syntax_element_char_rep = syntax_element_dot_rep + 1, syntax_element_short_set_rep = syntax_element_char_rep + 1, - syntax_element_long_set_rep = syntax_element_short_set_rep + 1 + syntax_element_long_set_rep = syntax_element_short_set_rep + 1, + // a backstep for lookbehind repeats: + syntax_element_backstep = syntax_element_long_set_rep + 1 }; #ifdef BOOST_REGEX_DEBUG diff --git a/src/regex.cpp b/src/regex.cpp index cc892389..63be80b1 100644 --- a/src/regex.cpp +++ b/src/regex.cpp @@ -49,6 +49,10 @@ bad_expression::~bad_expression() throw() {} namespace re_detail{ +BOOST_REGEX_DECL void BOOST_REGEX_CALL raise_runtime_error(const std::runtime_error& ex) +{ + ::boost::throw_exception(ex); +} // // error checking API: // diff --git a/src/static_mutex.cpp b/src/static_mutex.cpp index e290ec27..65fa20d5 100644 --- a/src/static_mutex.cpp +++ b/src/static_mutex.cpp @@ -90,7 +90,7 @@ void scoped_static_mutex_lock::lock() #if !defined(InterlockedCompareExchangePointer) while(0 != InterlockedCompareExchange(reinterpret_cast((boost::uint_least16_t*)&(m_mutex.m_mutex)), (void*)1, 0)) #else - while(0 != InterlockedCompareExchange(reinterpret_cast(&(m_mutex.m_mutex)), 1, 0)) + while(0 != InterlockedCompareExchange(reinterpret_cast(&(m_mutex.m_mutex)), 1, 0)) #endif { Sleep(0); @@ -106,7 +106,7 @@ void scoped_static_mutex_lock::unlock() #if !defined(InterlockedCompareExchangePointer) InterlockedExchange((LONG*)&(m_mutex.m_mutex), 0); #else - InterlockedExchange(reinterpret_cast(&(m_mutex.m_mutex)), 0); + InterlockedExchange(reinterpret_cast(&(m_mutex.m_mutex)), 0); #endif m_have_lock = false; } diff --git a/test/regress/basic_tests.cpp b/test/regress/basic_tests.cpp index 78183294..46dca15c 100644 --- a/test/regress/basic_tests.cpp +++ b/test/regress/basic_tests.cpp @@ -1,7 +1,18 @@ +#include + +#if BOOST_WORKAROUND(__BORLANDC__, < 0x560) +// we get unresolved externals from basic_string +// unless we do this, a well known Borland bug: +#define _RWSTD_COMPILE_INSTANTIATE +#endif #include "test.hpp" +#ifdef BOOST_MSVC +#pragma warning(disable:4127) +#endif + void basic_tests() { using namespace boost::regex_constants; @@ -417,8 +428,10 @@ void test_anchors() // TEST_REGEX_SEARCH("^.", extended, " \n \r\n ", match_default, make_array(0, 1, -2, 3, 4, -2, 7, 8, -2, -2)); TEST_REGEX_SEARCH(".$", extended, " \n \r\n ", match_default, make_array(1, 2, -2, 4, 5, -2, 8, 9, -2, -2)); - TEST_REGEX_SEARCH_W(L"^.", extended, L"\u2028 \u2028", match_default, make_array(0, 1, -2, 1, 2, -2, -2)); - TEST_REGEX_SEARCH_W(L".$", extended, L" \u2028 \u2028", match_default, make_array(0, 1, -2, 2, 3, -2, 3, 4, -2, -2)); +#if !BOOST_WORKAROUND(__BORLANDC__, < 0x560) + TEST_REGEX_SEARCH_W(L"^.", extended, L"\x2028 \x2028", match_default, make_array(0, 1, -2, 1, 2, -2, -2)); + TEST_REGEX_SEARCH_W(L".$", extended, L" \x2028 \x2028", match_default, make_array(0, 1, -2, 2, 3, -2, 3, 4, -2, -2)); +#endif } void test_backrefs() @@ -491,7 +504,9 @@ void test_character_escapes() TEST_REGEX_SEARCH("a\\Q+*?\\\\Eb", perl, "a+*?\\b", match_default, make_array(0, 6, -2, -2)); TEST_REGEX_SEARCH("\\C+", perl, "abcde", match_default, make_array(0, 5, -2, -2)); TEST_REGEX_SEARCH("\\X+", perl, "abcde", match_default, make_array(0, 5, -2, -2)); - TEST_REGEX_SEARCH_W(L"\\X", perl, L"a\u0300\u0301", match_default, make_array(0, 3, -2, -2)); +#if !BOOST_WORKAROUND(__BORLANDC__, < 0x560) + TEST_REGEX_SEARCH_W(L"\\X", perl, L"a\x0300\x0301", match_default, make_array(0, 3, -2, -2)); +#endif } void test_assertion_escapes() @@ -667,6 +682,7 @@ void test_tricky_cases() void test_tricky_cases2() { + using namespace boost::regex_constants; TEST_REGEX_SEARCH("((0x[[:xdigit:]]+)|([[:digit:]]+))u?((int(8|16|32|64))|L)?", perl, "0xFF", match_default, make_array(0, 4, 0, 4, 0, 4, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2)); TEST_REGEX_SEARCH("((0x[[:xdigit:]]+)|([[:digit:]]+))u?((int(8|16|32|64))|L)?", perl, "35", match_default, make_array(0, 2, 0, 2, -1, -1, 0, 2, -1, -1, -1, -1, -1, -1, -2, -2)); TEST_REGEX_SEARCH("((0x[[:xdigit:]]+)|([[:digit:]]+))u?((int(8|16|32|64))|L)?", perl, "0xFFu", match_default, make_array(0, 5, 0, 4, 0, 4, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2)); @@ -679,14 +695,16 @@ void test_tricky_cases2() // posix only: TEST_REGEX_SEARCH("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*", awk, "#define some_symbol(x) \\ \r\n foo();\\\r\n printf(#x);", match_default, make_array(0, 53, 28, 42, -2, -2)); // now try and test some unicode specific characters: - TEST_REGEX_SEARCH_W(L"[[:unicode:]]+", perl, L"a\u0300\u0400z", match_default, make_array(1, 3, -2, -2)); - TEST_REGEX_SEARCH_W(L"[\x10-\xff]", perl, L"\u0300\u0400", match_default, make_array(-2, -2)); - TEST_REGEX_SEARCH_W(L"[\01-\05]{5}", perl, L"\u0300\u0400\u0300\u0400\u0300\u0400", match_default, make_array(-2, -2)); - TEST_REGEX_SEARCH_W(L"[\x300-\x400]+", perl, L"\u0300\u0400\u0300\u0400\u0300\u0400", match_default, make_array(0, 6, -2, -2)); - TEST_REGEX_SEARCH_W(L"[\\x{300}-\\x{400}]+", perl, L"\u0300\u0400\u0300\u0400\u0300\u0400", match_default, make_array(0, 6, -2, -2)); - TEST_REGEX_SEARCH_W(L"\\x{300}\\x{400}+", perl, L"\u0300\u0400\u0400\u0400\u0400\u0400", match_default, make_array(0, 6, -2, -2)); +#if !BOOST_WORKAROUND(__BORLANDC__, < 0x560) + TEST_REGEX_SEARCH_W(L"[[:unicode:]]+", perl, L"a\x0300\x0400z", match_default, make_array(1, 3, -2, -2)); + TEST_REGEX_SEARCH_W(L"[\x10-\xff]", perl, L"\x0300\x0400", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH_W(L"[\01-\05]{5}", perl, L"\x0300\x0400\x0300\x0400\x0300\x0400", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH_W(L"[\x300-\x400]+", perl, L"\x0300\x0400\x0300\x0400\x0300\x0400", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH_W(L"[\\x{300}-\\x{400}]+", perl, L"\x0300\x0400\x0300\x0400\x0300\x0400", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH_W(L"\\x{300}\\x{400}+", perl, L"\x0300\x0400\x0400\x0400\x0400\x0400", match_default, make_array(0, 6, -2, -2)); +#endif // finally try some case insensitive matches: - TEST_REGEX_SEARCH("0123456789@abcdefghijklmnopqrstuvwxyz\\[\\\\\\]\\^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ\\{\\|\\}", perl|icase, "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}", match_default, make_array(0, 72, -2, -2)); + TEST_REGEX_SEARCH("0123456789@abcdefghijklmnopqrstuvwxyz\\[\\\\\\]\\^_`ABCDEFGHIJKLMNOPQRSTUVWXYZ\\{\\|\\}", perl|icase, "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}", match_default, make_array(0, 72, -2, -2)); TEST_REGEX_SEARCH("a", perl|icase, "A", match_default, make_array(0, 1, -2, -2)); TEST_REGEX_SEARCH("A", perl|icase, "a", match_default, make_array(0, 1, -2, -2)); TEST_REGEX_SEARCH("[abc]+", perl|icase, "abcABC", match_default, make_array(0, 6, -2, -2)); @@ -763,6 +781,9 @@ void test_tricky_cases2() TEST_REGEX_SEARCH("()\\1", perl, "a", match_default, make_array(0, 0, 0, 0, -2, 1, 1, 1, 1, -2, -2)); TEST_REGEX_SEARCH("a()\\1b", perl, "ab", match_default, make_array(0, 2, 1, 1, -2, -2)); TEST_REGEX_SEARCH("a()b\\1", perl, "ab", match_default, make_array(0, 2, 1, 1, -2, -2)); + TEST_REGEX_SEARCH("([a-c]+)\\1", perl, "abcbc", match_default, make_array(1, 5, 1, 3, -2, -2)); + TEST_REGEX_SEARCH(".+abc", perl, "xxxxxxxxyyyyyyyyab", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(.+)\\1", perl, "abcdxxxyyyxxxyyy", match_default, make_array(4, 16, 4, 10, -2, -2)); // // the strings in the next test case are too long for most compilers to cope with, @@ -806,7 +827,7 @@ void test_tricky_cases2() make_array(753, 1076, 934, 1005, -2, 2143, 2466, 2324, 2395, -2, -2)); test(char(0), test_regex_search_tag()); }while(0); -#ifndef BOOST_NO_WREGEX +#if !defined(BOOST_NO_WREGEX) && !defined(BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS) do{ std::string st(big_text); test_info::set_info(__FILE__, __LINE__, @@ -997,6 +1018,18 @@ void test_forward_lookahead_asserts() TEST_REGEX_SEARCH("^(?=.*\\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$", perl, "abc3", match_default, make_array(-2, -2)); TEST_REGEX_SEARCH("^(?=.*\\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$", perl, "abC3", match_default, make_array(0, 4, -2, -2)); TEST_REGEX_SEARCH("^(?=.*\\d)(?=.*[a-z])(?=.*[A-Z]).{4,8}$", perl, "ABCD3", match_default, make_array(-2, -2)); + + // lookbehind assertions, added 2004-04-30 + TEST_REGEX_SEARCH("/\\*.*(?<=\\*)/", perl, "/**/", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("/\\*.*(?<=\\*)/", perl, "/*****/ ", match_default, make_array(0, 7, -2, -2)); + TEST_REGEX_SEARCH("(?<=['\"]).*?(?=['\"])", perl, " 'ac' ", match_default, make_array(2, 4, -2, -2)); + TEST_REGEX_SEARCH("(?<=['\"]).*?(?=['\"])", perl, " \"ac\" ", match_default, make_array(2, 4, -2, -2)); + TEST_REGEX_SEARCH("(?<=['\"]).*?(?^abc) abc 0 3 -(?>^abc) def\nabc 4 7 -(?>^abc) defabc -1 -1 -(?>.*/)foo /this/is/a/very/long/line/in/deed/with/very/many/slashes/in/it/you/see/ -1 -1 -(?>.*/)foo /this/is/a/very/long/line/in/deed/with/very/many/slashes/in/and/foo 0 67 -(?>(\.\d\d[1-9]?))\d+ 1.230003938 1 11 1 4 -(?>(\.\d\d[1-9]?))\d+ 1.875000282 1 11 1 5 -(?>(\.\d\d[1-9]?))\d+ 1.235 -1 -1 -^((?>\w+)|(?>\s+))*$ "now is the time for all good men to come to the aid of the party" 0 64 59 64 -^((?>\w+)|(?>\s+))*$ "this is not a line with only words and spaces!" -1 -1 -((?>\d+))(\w) 12345a 0 6 0 5 5 6 -((?>\d+))(\w) 12345+ -1 -1 -((?>\d+))(\d) 12345 -1 -1 -(?>a+)b aaab 0 4 -((?>a+)b) aaab 0 4 0 4 -(?>(a+))b aaab 0 4 0 3 -(?>b)+ aaabbbccc 3 6 -(?>a+|b+|c+)*c aaabbbbccccd 0 8 -((?>[^()]+)|\([^()]*\))+ ((abc(ade)ufh()()x 2 18 17 18 -\(((?>[^()]+)|\([^()]+\))+\) (abc) 0 5 1 4 -\(((?>[^()]+)|\([^()]+\))+\) (abc(def)xyz) 0 13 9 12 -\(((?>[^()]+)|\([^()]+\))+\) ((()aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa -1 -1 -(?>a*)* a 0 1 -(?>a*)* aa 0 2 -(?>a*)* aaaa 0 4 -(?>a*)* a 0 1 -(?>a*)* aaabcde 0 3 -((?>a*))* aaaaa 0 5 5 5 -((?>a*))* aabbaa 0 2 2 2 -((?>a*?))* aaaaa 0 0 0 0 -((?>a*?))* aabbaa 0 0 0 0 -"word (?>(?:(?!otherword)[a-zA-Z0-9]+ ){0,30})otherword" "word cat dog elephant mussel cow horse canary baboon snake shark otherword" 0 74 -"word (?>(?:(?!otherword)[a-zA-Z0-9]+ ){0,30})otherword" "word cat dog elephant mussel cow horse canary baboon snake shark" -1 -1 -"word (?>[a-zA-Z0-9]+ ){0,30}otherword" "word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope" -1 -1 -"word (?>[a-zA-Z0-9]+ ){0,30}otherword" "word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I really really hope otherword" -1 -1 -((?>Z)+|A)* ZABCDEFG 0 2 1 2 -((?>)+|A)* ! - -; subtleties of matching with no sub-expressions marked -- normal match_nosubs REG_NO_POSIX_TEST -a(b?c)+d accd 0 4 -(wee|week)(knights|night) weeknights 0 10 -.* abc 0 3 -a(b|(c))d abd 0 3 -a(b|(c))d acd 0 3 -a(b*|c|e)d abbd 0 4 -a(b*|c|e)d acd 0 3 -a(b*|c|e)d ad 0 2 -a(b?)c abc 0 3 -a(b?)c ac 0 2 -a(b+)c abc 0 3 -a(b+)c abbbc 0 5 -a(b*)c ac 0 2 -(a|ab)(bc([de]+)f|cde) abcdef 0 6 -a([bc]?)c abc 0 3 -a([bc]?)c ac 0 2 -a([bc]+)c abc 0 3 -a([bc]+)c abcc 0 4 -a([bc]+)bc abcbc 0 5 -a(bb+|b)b abb 0 3 -a(bbb+|bb+|b)b abb 0 3 -a(bbb+|bb+|b)b abbb 0 4 -a(bbb+|bb+|b)bb abbb 0 4 -(.*).* abcdef 0 6 -(a*)* bc 0 0 - -- normal nosubs REG_NO_POSIX_TEST -a(b?c)+d accd 0 4 -(wee|week)(knights|night) weeknights 0 10 -.* abc 0 3 -a(b|(c))d abd 0 3 -a(b|(c))d acd 0 3 -a(b*|c|e)d abbd 0 4 -a(b*|c|e)d acd 0 3 -a(b*|c|e)d ad 0 2 -a(b?)c abc 0 3 -a(b?)c ac 0 2 -a(b+)c abc 0 3 -a(b+)c abbbc 0 5 -a(b*)c ac 0 2 -(a|ab)(bc([de]+)f|cde) abcdef 0 6 -a([bc]?)c abc 0 3 -a([bc]?)c ac 0 2 -a([bc]+)c abc 0 3 -a([bc]+)c abcc 0 4 -a([bc]+)bc abcbc 0 5 -a(bb+|b)b abb 0 3 -a(bbb+|bb+|b)b abb 0 3 -a(bbb+|bb+|b)b abbb 0 4 -a(bbb+|bb+|b)bb abbb 0 4 -(.*).* abcdef 0 6 -(a*)* bc 0 0 - -#endif +} + +void test_independent_subs() +{ + using namespace boost::regex_constants; + TEST_REGEX_SEARCH("(?>^abc)", perl, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?>^abc)", perl, "def\nabc", match_default, make_array(4, 7, -2, -2)); + TEST_REGEX_SEARCH("(?>^abc)", perl, "defabc", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?>.*/)foo", perl, "/this/is/a/very/long/line/in/deed/with/very/many/slashes/in/it/you/see/", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?>.*/)foo", perl, "/this/is/a/very/long/line/in/deed/with/very/many/slashes/in/and/foo", match_default, make_array(0, 67, -2, -2)); + TEST_REGEX_SEARCH("(?>(\\.\\d\\d[1-9]?))\\d+", perl, "1.230003938", match_default, make_array(1, 11, 1, 4, -2, -2)); + TEST_REGEX_SEARCH("(?>(\\.\\d\\d[1-9]?))\\d+", perl, "1.875000282", match_default, make_array(1, 11, 1, 5, -2, -2)); + TEST_REGEX_SEARCH("(?>(\\.\\d\\d[1-9]?))\\d+", perl, "1.235", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("^((?>\\w+)|(?>\\s+))*$", perl, "now is the time for all good men to come to the aid of the party", match_default, make_array(0, 64, 59, 64, -2, -2)); + TEST_REGEX_SEARCH("^((?>\\w+)|(?>\\s+))*$", perl, "this is not a line with only words and spaces!", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("((?>\\d+))(\\w)", perl, "12345a", match_default, make_array(0, 6, 0, 5, 5, 6, -2, -2)); + TEST_REGEX_SEARCH("((?>\\d+))(\\w)", perl, "12345+", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("((?>\\d+))(\\d)", perl, "12345", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("(?>a+)b", perl, "aaab", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("((?>a+)b)", perl, "aaab", match_default, make_array(0, 4, 0, 4, -2, -2)); + TEST_REGEX_SEARCH("(?>(a+))b", perl, "aaab", match_default, make_array(0, 4, 0, 3, -2, -2)); + TEST_REGEX_SEARCH("(?>b)+", perl, "aaabbbccc", match_default, make_array(3, 6, -2, -2)); + TEST_REGEX_SEARCH("(?>a+|b+|c+)*c", perl, "aaabbbbccccd", match_default, make_array(0, 8, -2, 8, 9, -2, 9, 10, -2, 10, 11, -2, -2)); + TEST_REGEX_SEARCH("((?>[^()]+)|\\([^()]*\\))+", perl, "((abc(ade)ufh()()x", match_default, make_array(2, 18, 17, 18, -2, -2)); + TEST_REGEX_SEARCH("\\(((?>[^()]+)|\\([^()]+\\))+\\)", perl, "(abc)", match_default, make_array(0, 5, 1, 4, -2, -2)); + TEST_REGEX_SEARCH("\\(((?>[^()]+)|\\([^()]+\\))+\\)", perl, "(abc(def)xyz)", match_default, make_array(0, 13, 9, 12, -2, -2)); + TEST_REGEX_SEARCH("\\(((?>[^()]+)|\\([^()]+\\))+\\)", perl, "((()aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", match_default, make_array(-2, -2)); + + TEST_REGEX_SEARCH("(?>a*)*", perl, "a", match_default, make_array(0, 1, -2, 1, 1, -2, -2)); + TEST_REGEX_SEARCH("(?>a*)*", perl, "aa", match_default, make_array(0, 2, -2, 2, 2, -2, -2)); + TEST_REGEX_SEARCH("(?>a*)*", perl, "aaaa", match_default, make_array(0, 4, -2, 4, 4, -2, -2)); + TEST_REGEX_SEARCH("(?>a*)*", perl, "a", match_default, make_array(0, 1, -2, 1, 1, -2, -2)); + TEST_REGEX_SEARCH("(?>a*)*", perl, "aaabcde", match_default, make_array(0, 3, -2, 3, 3, -2, 4, 4, -2, 5, 5, -2, 6, 6, -2, 7, 7, -2, -2)); + TEST_REGEX_SEARCH("((?>a*))*", perl, "aaaaa", match_default, make_array(0, 5, 5, 5, -2, 5, 5, 5, 5, -2, -2)); + TEST_REGEX_SEARCH("((?>a*))*", perl, "aabbaa", match_default, make_array(0, 2, 2, 2, -2, 2, 2, 2, 2, -2, 3, 3, 3, 3, -2, 4, 6, 6, 6, -2, 6, 6, 6, 6, -2, -2)); + TEST_REGEX_SEARCH("((?>a*?))*", perl, "aaaaa", match_default, make_array(0, 0, 0, 0, -2, 1, 1, 1, 1, -2, 2, 2, 2, 2, -2, 3, 3, 3, 3, -2, 4, 4, 4, 4, -2, 5, 5, 5, 5, -2, -2)); + TEST_REGEX_SEARCH("((?>a*?))*", perl, "aabbaa", match_default, make_array(0, 0, 0, 0, -2, 1, 1, 1, 1, -2, 2, 2, 2, 2, -2, 3, 3, 3, 3, -2, 4, 4, 4, 4, -2, 5, 5, 5, 5, -2, 6, 6, 6, 6, -2, -2)); + TEST_REGEX_SEARCH("word (?>(?:(?!otherword)[a-zA-Z0-9]+ ){0,30})otherword", perl, "word cat dog elephant mussel cow horse canary baboon snake shark otherword", match_default, make_array(0, 74, -2, -2)); + TEST_REGEX_SEARCH("word (?>(?:(?!otherword)[a-zA-Z0-9]+ ){0,30})otherword", perl, "word cat dog elephant mussel cow horse canary baboon snake shark", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("word (?>[a-zA-Z0-9]+ ){0,30}otherword", perl, "word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I hope", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("word (?>[a-zA-Z0-9]+ ){0,30}otherword", perl, "word cat dog elephant mussel cow horse canary baboon snake shark the quick brown fox and the lazy dog and several other words getting close to thirty by now I really really hope otherword", match_default, make_array(-2, -2)); + TEST_REGEX_SEARCH("((?>Z)+|A)+", perl, "ZABCDEFG", match_default, make_array(0, 2, 1, 2, -2, -2)); + TEST_INVALID_REGEX("((?>)+|A)+", perl); +} + +void test_nosubs() +{ + using namespace boost::regex_constants; + // subtleties of matching with no sub-expressions marked + TEST_REGEX_SEARCH("a(b?c)+d", perl, "accd", match_default|match_nosubs, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("(wee|week)(knights|night)", perl, "weeknights", match_default|match_nosubs, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH(".*", perl, "abc", match_default|match_nosubs, make_array(0, 3, -2, 3, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b|(c))d", perl, "abd", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b|(c))d", perl, "acd", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b*|c|e)d", perl, "abbd", match_default|match_nosubs, make_array(0, 4, -2, -2)); + + TEST_REGEX_SEARCH("a(b*|c|e)d", perl, "acd", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b*|c|e)d", perl, "ad", match_default|match_nosubs, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("a(b?)c", perl, "abc", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b?)c", perl, "ac", match_default|match_nosubs, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("a(b+)c", perl, "abc", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b+)c", perl, "abbbc", match_default|match_nosubs, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("a(b*)c", perl, "ac", match_default|match_nosubs, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("(a|ab)(bc([de]+)f|cde)", perl, "abcdef", match_default|match_nosubs, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("a([bc]?)c", perl, "abc", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a([bc]?)c", perl, "ac", match_default|match_nosubs, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("a([bc]+)c", perl, "abc", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a([bc]+)c", perl, "abcc", match_default|match_nosubs, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("a([bc]+)bc", perl, "abcbc", match_default|match_nosubs, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("a(bb+|b)b", perl, "abb", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(bbb+|bb+|b)b", perl, "abb", match_default|match_nosubs, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(bbb+|bb+|b)b", perl, "abbb", match_default|match_nosubs, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("a(bbb+|bb+|b)bb", perl, "abbb", match_default|match_nosubs, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("(.*).*", perl, "abcdef", match_default|match_nosubs, make_array(0, 6, -2, 6, 6, -2, -2)); + TEST_REGEX_SEARCH("(a*)*", perl, "bc", match_default|match_nosubs, make_array(0, 0, -2, 1, 1, -2, 2, 2, -2, -2)); + + TEST_REGEX_SEARCH("a(b?c)+d", perl|nosubs, "accd", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("(wee|week)(knights|night)", perl|nosubs, "weeknights", match_default, make_array(0, 10, -2, -2)); + TEST_REGEX_SEARCH(".*", perl|nosubs, "abc", match_default, make_array(0, 3, -2, 3, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b|(c))d", perl|nosubs, "abd", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b|(c))d", perl|nosubs, "acd", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b*|c|e)d", perl|nosubs, "abbd", match_default, make_array(0, 4, -2, -2)); + + TEST_REGEX_SEARCH("a(b*|c|e)d", perl|nosubs, "acd", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b*|c|e)d", perl|nosubs, "ad", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("a(b?)c", perl|nosubs, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b?)c", perl|nosubs, "ac", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("a(b+)c", perl|nosubs, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(b+)c", perl|nosubs, "abbbc", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("a(b*)c", perl|nosubs, "ac", match_default, make_array(0, 2, -2, -2)); + TEST_REGEX_SEARCH("(a|ab)(bc([de]+)f|cde)", perl|nosubs, "abcdef", match_default, make_array(0, 6, -2, -2)); + TEST_REGEX_SEARCH("a([bc]?)c", perl|nosubs, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a([bc]?)c", perl|nosubs, "ac", match_default, make_array(0, 2, -2, -2)); + + TEST_REGEX_SEARCH("a([bc]+)c", perl|nosubs, "abc", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a([bc]+)c", perl|nosubs, "abcc", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("a([bc]+)bc", perl|nosubs, "abcbc", match_default, make_array(0, 5, -2, -2)); + TEST_REGEX_SEARCH("a(bb+|b)b", perl|nosubs, "abb", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(bbb+|bb+|b)b", perl|nosubs, "abb", match_default, make_array(0, 3, -2, -2)); + TEST_REGEX_SEARCH("a(bbb+|bb+|b)b", perl|nosubs, "abbb", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("a(bbb+|bb+|b)bb", perl|nosubs, "abbb", match_default, make_array(0, 4, -2, -2)); + TEST_REGEX_SEARCH("(.*).*", perl|nosubs, "abcdef", match_default, make_array(0, 6, -2, 6, 6, -2, -2)); + TEST_REGEX_SEARCH("(a*)*", perl|nosubs, "bc", match_default, make_array(0, 0, -2, 1, 1, -2, 2, 2, -2, -2)); + } diff --git a/test/regress/info.hpp b/test/regress/info.hpp index 9926cc1d..23189574 100644 --- a/test/regress/info.hpp +++ b/test/regress/info.hpp @@ -11,7 +11,9 @@ template class test_info { +public: typedef std::basic_string string_type; +private: struct data_type { std::string file; diff --git a/test/regress/main.cpp b/test/regress/main.cpp index 215e991a..6963cdf9 100644 --- a/test/regress/main.cpp +++ b/test/regress/main.cpp @@ -22,6 +22,8 @@ int cpp_main(int argc, char * argv[]) test_partial_match(); test_forward_lookahead_asserts(); test_fast_repeats(); + test_independent_subs(); + test_nosubs(); return error_count; } diff --git a/test/regress/test.hpp b/test/regress/test.hpp index b2e91622..a790a516 100644 --- a/test/regress/test.hpp +++ b/test/regress/test.hpp @@ -140,5 +140,7 @@ void test_partial_match(); void test_forward_lookahead_asserts(); void test_fast_repeats(); void test_tricky_cases2(); +void test_independent_subs(); +void test_nosubs(); #endif diff --git a/test/regress/test_regex_replace.hpp b/test/regress/test_regex_replace.hpp new file mode 100644 index 00000000..b1eefc2e --- /dev/null +++ b/test/regress/test_regex_replace.hpp @@ -0,0 +1,54 @@ + +#ifndef BOOST_REGEX_REGRESS_REGEX_REPLACE_HPP +#define BOOST_REGEX_REGRESS_REGEX_REPLACE_HPP +#include "info.hpp" + +template +void test_regex_replace(boost::basic_regex& r) +{ + typedef std::basic_string string_type; + const string_type& search_text = test_info::search_text(); + boost::regex_constants::match_flag_type opts = test_info::match_options(); + const string_type& format_string = test_info::format_string(); + const string_type& result_string = test_info::result_string(); + + string_type result = boost::regex_replace(search_text, r, format_string, opts); + if(result != result_string) + { + BOOST_REGEX_TEST_ERROR("regex_replace generated an incorrect string result", charT); + } +} + + +struct test_regex_replace_tag{}; + +template +void test(boost::basic_regex& r, const test_regex_replace_tag&) +{ + const std::basic_string& expression = test_info::expression(); + boost::regex_constants::syntax_option_type syntax_options = test_info::syntax_options(); + try{ + r.assign(expression, syntax_options); + test_regex_replace(r); + } + catch(const boost::bad_expression& e) + { + BOOST_REGEX_TEST_ERROR("Expression did not compile when it should have done: " << e.what(), charT); + } + catch(const std::runtime_error& r) + { + BOOST_REGEX_TEST_ERROR("Received an unexpected std::runtime_error: " << r.what(), charT); + } + catch(const std::exception& r) + { + BOOST_REGEX_TEST_ERROR("Received an unexpected std::exception: " << r.what(), charT); + } + catch(...) + { + BOOST_REGEX_TEST_ERROR("Received an unexpected exception of unknown type", charT); + } + +} + +#endif + diff --git a/test/regress/test_regex_search.hpp b/test/regress/test_regex_search.hpp index 4246c9c7..99d97276 100644 --- a/test/regress/test_regex_search.hpp +++ b/test/regress/test_regex_search.hpp @@ -134,7 +134,7 @@ void test_regex_grep(boost::basic_regex& r) boost::regex_constants::match_flag_type opts = test_info::match_options(); const int* answer_table = test_info::answer_table(); grep_test_predicate pred(search_text.begin(), answer_table); - boost::regex_grep(pred, search_text, r, opts); + boost::regex_grep(pred, search_text.begin(), search_text.end(), r, opts); } template @@ -158,7 +158,7 @@ void test_regex_match(boost::basic_regex& r) { BOOST_REGEX_TEST_ERROR("boost::regex_match found a match when it should not have done so.", charT); } - else if((answer_table[0] == 0) && (answer_table[1] == search_text.size())) + else if((answer_table[0] == 0) && (answer_table[1] == static_cast(search_text.size()))) { if(boost::regex_match( search_text.begin(),