Almost complete implementation...

[SVN r22669]
This commit is contained in:
John Maddock
2004-04-19 12:23:41 +00:00
parent 50b5391c8f
commit 641d60b059
20 changed files with 1378 additions and 592 deletions

View File

@ -236,9 +236,9 @@ namespace boost{ typedef wchar_t regex_wchar_type; }
# if defined(BOOST_REGEX_DYN_LINK) || defined(BOOST_ALL_DYN_LINK)
# define BOOST_DYN_LINK
# endif
#ifdef BOOST_REGEX_DIAG
# define BOOST_LIB_DIAGNOSTIC
#endif
# ifdef BOOST_REGEX_DIAG
# define BOOST_LIB_DIAGNOSTIC
# endif
# include <boost/config/auto_link.hpp>
#endif

View File

@ -22,6 +22,7 @@
#define BOOST_REGEX_STATIC_MUTEX_HPP
#include <boost/config.hpp>
#include <boost/regex/config.hpp> // dll import/export options.
#ifdef BOOST_HAS_PTHREADS
#include <pthread.h>
@ -35,7 +36,7 @@
//
namespace boost{
class scoped_static_mutex_lock;
class BOOST_REGEX_DECL scoped_static_mutex_lock;
class static_mutex
{
@ -46,7 +47,7 @@ public:
#define BOOST_STATIC_MUTEX_INIT { PTHREAD_MUTEX_INITIALIZER, }
class scoped_static_mutex_lock
class BOOST_REGEX_DECL scoped_static_mutex_lock
{
public:
scoped_static_mutex_lock(static_mutex& mut, bool lk = true);
@ -82,7 +83,7 @@ inline bool scoped_static_mutex_lock::locked()const
namespace boost{
class scoped_static_mutex_lock;
class BOOST_REGEX_DECL scoped_static_mutex_lock;
class static_mutex
{
@ -93,7 +94,7 @@ public:
#define BOOST_STATIC_MUTEX_INIT { 0, }
class scoped_static_mutex_lock
class BOOST_REGEX_DECL scoped_static_mutex_lock
{
public:
scoped_static_mutex_lock(static_mutex& mut, bool lk = true);
@ -134,10 +135,10 @@ inline bool scoped_static_mutex_lock::locked()const
namespace boost{
class scoped_static_mutex_lock;
extern "C" void free_static_mutex();
class BOOST_REGEX_DECL scoped_static_mutex_lock;
extern "C" BOOST_REGEX_DECL void free_static_mutex();
class static_mutex
class BOOST_REGEX_DECL static_mutex
{
public:
typedef scoped_static_mutex_lock scoped_lock;
@ -148,7 +149,7 @@ public:
#define BOOST_STATIC_MUTEX_INIT { }
class scoped_static_mutex_lock
class BOOST_REGEX_DECL scoped_static_mutex_lock
{
public:
scoped_static_mutex_lock(static_mutex& mut, bool lk = true);

View File

@ -68,7 +68,7 @@ struct regex_data
//
template <class charT, class traits>
class basic_regex_implementation
: protected regex_data<charT, traits>
: public regex_data<charT, traits>
{
public:
typedef regex_constants::syntax_option_type flag_type;

View File

@ -45,8 +45,8 @@ template <class charT, class traits>
class basic_char_set
{
public:
typedef digraph<charT> digraph_type;
typedef std::basic_string<charT> string_type;
typedef digraph<charT> digraph_type;
typedef typename traits::string_type string_type;
typedef typename traits::char_class_type mask_type;
basic_char_set()
@ -68,8 +68,16 @@ public:
{
m_ranges.push_back(first);
m_ranges.push_back(end);
if(first.second || end.second)
if(first.second)
{
m_has_digraphs = true;
add_single(first);
}
if(end.second)
{
m_has_digraphs = true;
add_single(end);
}
m_empty = false;
}
void add_class(mask_type m)
@ -77,10 +85,20 @@ public:
m_classes |= m;
m_empty = false;
}
// Records the equivalence class element s in this set.  When s holds a
// second character (a digraph) the set is flagged as containing digraphs
// and s is also passed to add_single().  The set is no longer empty
// afterwards.
void add_equivalent(const digraph_type& s)
{
   m_equivalents.push_back(s);
   const bool two_characters = s.second ? true : false;
   if(two_characters)
   {
      m_has_digraphs = true;
      add_single(s);
   }
   m_empty = false;
}
void negate()
{
m_negate = true;
m_empty = false;
//m_empty = false;
}
//
@ -111,6 +129,14 @@ public:
{
return m_ranges.end();
}
// Returns an iterator to the first equivalence class element stored in
// m_equivalents (the elements recorded by add_equivalent()).
list_iterator equivalents_begin()const
{
return m_equivalents.begin();
}
// Returns the past-the-end iterator of m_equivalents, terminating the
// range started by equivalents_begin().
list_iterator equivalents_end()const
{
return m_equivalents.end();
}
mask_type classes()const
{
return m_classes;
@ -126,6 +152,7 @@ private:
bool m_has_digraphs; // true if we have digraphs present
mask_type m_classes; // character classes to match
bool m_empty; // whether we've added anything yet
std::vector<digraph_type> m_equivalents; // a list of equivalence classes
};
template <class charT, class traits>
@ -189,6 +216,7 @@ private:
void set_all_masks(unsigned char* bits, unsigned char);
bool is_bad_repeat(re_syntax_base* pt);
void set_bad_repeat(re_syntax_base* pt);
syntax_element_type get_repeat_type(re_syntax_base* state);
};
template <class charT, class traits>
@ -297,7 +325,7 @@ re_syntax_base* basic_regex_creator<charT, traits>::append_set(
//
result->csingles = static_cast<unsigned int>(std::distance(char_set.singles_begin(), char_set.singles_end()));
result->cranges = static_cast<unsigned int>(std::distance(char_set.ranges_begin(), char_set.ranges_end())) / 2;
result->cequivalents = 0;
result->cequivalents = static_cast<unsigned int>(std::distance(char_set.equivalents_begin(), char_set.equivalents_end()));
result->cclasses = char_set.classes();
if(flags() & regbase::icase)
{
@ -377,6 +405,27 @@ re_syntax_base* basic_regex_creator<charT, traits>::append_set(
std::memcpy(p, s2.c_str(), sizeof(charT) * (s2.size() + 1));
}
//
// now process the equivalence classes:
//
first = char_set.equivalents_begin();
last = char_set.equivalents_end();
while(first != last)
{
string_type s;
if(first->second)
{
charT cs[2] = { first->first, first->second, };
s = m_traits.transform_primary(cs, cs+2);
}
else
s = m_traits.transform_primary(&first->first, &first->first+1);
if(s.empty())
return 0; // invalid or unsupported equivalence class
charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s.size()+1) ) );
std::memcpy(p, s.c_str(), sizeof(charT) * (s.size() + 1));
++first;
}
//
// finally reset the address of our last state:
//
m_last_state = result = static_cast<re_set_long<mask_type>*>(getaddress(offset));
@ -470,6 +519,32 @@ re_syntax_base* basic_regex_creator<charT, traits>::append_set(
result->_map[i] = true;
}
}
//
// now process the equivalence classes:
//
first = char_set.equivalents_begin();
last = char_set.equivalents_end();
while(first != last)
{
string_type s;
if(first->second)
{
charT cs[2] = { first->first, first->second, };
s = m_traits.transform_primary(cs, cs+2);
}
else
s = m_traits.transform_primary(&first->first, &first->first+1);
if(s.empty())
return 0; // invalid or unsupported equivalence class
for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
{
charT c(i);
string_type s2 = this->m_traits.transform_primary(&c, &c+1);
if(s == s2)
result->_map[i] = true;
}
++first;
}
if(negate)
{
for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
@ -567,6 +642,8 @@ void basic_regex_creator<charT, traits>::create_startmaps(re_syntax_base* state)
create_startmap(state->next.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_take);
m_bad_repeats = 0;
create_startmap(static_cast<re_alt*>(state)->alt.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_skip);
// adjust the type of the state to allow for faster matching:
state->type = this->get_repeat_type(state);
return;
default:
state = state->next.p;
@ -613,6 +690,10 @@ void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state,
return;
}
case syntax_element_backref:
// can be null, and any character can match:
if(pnull)
*pnull |= mask;
// fall through:
case syntax_element_wild:
{
// can't be null, any character can match:
@ -668,13 +749,18 @@ void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state,
if(map)
{
typedef typename traits::char_class_type mask_type;
map[0] |= mask_init;
for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
if(static_cast<re_set_long<mask_type>*>(state)->singleton)
{
charT c = static_cast<charT>(i);
if(&c != re_is_set_member(&c, &c + 1, static_cast<re_set_long<mask_type>*>(state), *m_pdata))
map[i] |= mask;
map[0] |= mask_init;
for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
{
charT c = static_cast<charT>(i);
if(&c != re_is_set_member(&c, &c + 1, static_cast<re_set_long<mask_type>*>(state), *m_pdata))
map[i] |= mask;
}
}
else
set_all_masks(map, mask);
}
return;
case syntax_element_set:
@ -772,7 +858,6 @@ unsigned basic_regex_creator<charT, traits>::get_restart_type(re_syntax_base* st
continue;
case syntax_element_start_line:
return regbase::restart_line;
case syntax_element_word_boundary:
case syntax_element_word_start:
return regbase::restart_word;
case syntax_element_buffer_start:
@ -848,6 +933,35 @@ void basic_regex_creator<charT, traits>::set_bad_repeat(re_syntax_base* pt)
}
}
//
// Works out whether a repeat state can be replaced by a specialised
// repeat type.  Returns the specialised type when state is a
// syntax_element_rep wrapping exactly one wildcard, literal, short set or
// singleton long set; otherwise returns state->type unchanged.
//
template <class charT, class traits>
syntax_element_type basic_regex_creator<charT, traits>::get_repeat_type(re_syntax_base* state)
{
   typedef typename traits::char_class_type mask_type;
   // anything that isn't a plain repeat keeps its current type:
   if(state->type != syntax_element_rep)
      return state->type;
   // the repeat must enclose a single state, i.e. two links on from the
   // repeated state we must arrive at the repeat's alternative:
   re_syntax_base* repeated = state->next.p;
   if(repeated->next.p->next.p != static_cast<re_alt*>(state)->alt.p)
      return state->type;
   // pick the specialised repeat that matches the repeated state:
   if(repeated->type == re_detail::syntax_element_wild)
      return re_detail::syntax_element_dot_rep;
   if(repeated->type == re_detail::syntax_element_literal)
      return re_detail::syntax_element_char_rep;
   if(repeated->type == re_detail::syntax_element_set)
      return re_detail::syntax_element_short_set_rep;
   if(repeated->type == re_detail::syntax_element_long_set)
   {
      // only long sets flagged as singleton get the specialised repeat:
      if(static_cast<re_detail::re_set_long<mask_type>*>(repeated)->singleton)
         return re_detail::syntax_element_long_set_rep;
   }
   return state->type;
}
} // namespace re_detail
} // namespace boost

View File

@ -49,11 +49,15 @@ public:
bool parse_backref();
void parse_set_literal(basic_char_set<charT, traits>& char_set);
bool parse_inner_set(basic_char_set<charT, traits>& char_set);
digraph<charT> get_next_set_literal();
bool parse_QE();
bool parse_perl_extension();
digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
charT unescape_character();
private:
typedef bool (basic_regex_parser::*parser_proc_type)();
typedef typename traits::string_type string_type;
typedef typename traits::char_class_type char_class_type;
parser_proc_type m_parser_proc; // the main parser to use
const charT* m_base; // the start of the string being parsed
const charT* m_end; // the end of the string being parsed
@ -235,13 +239,29 @@ bool basic_regex_parser<charT, traits>::parse_literal()
template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_open_paren()
{
//
// skip the '(' and error check:
//
if(++m_position == m_end)
fail(REG_EPAREN, m_position - m_base);
//
// begin by checking for a perl-style (?...) extension:
//
if((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0)
{
if(m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
return parse_perl_extension();
}
//
// update our mark count, and append the required state:
//
unsigned markid = ++m_mark_count;
unsigned markid;
if(this->flags() & regbase::nosubs)
markid = 0;
else
markid = ++m_mark_count;
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
pb->index = markid;
++m_position;
std::ptrdiff_t last_paren_start = this->getoffset(pb);
// back up insertion point for alternations, and set new point:
std::ptrdiff_t last_alt_point = m_alt_insert_point;
@ -392,6 +412,18 @@ bool basic_regex_parser<charT, traits>::parse_extended_escape()
++m_position;
this->append_state(syntax_element_soft_buffer_end);
break;
case regex_constants::escape_type_Q:
return parse_QE();
case regex_constants::escape_type_C:
return parse_match_any();
case regex_constants::escape_type_X:
++m_position;
this->append_state(syntax_element_combining);
break;
case regex_constants::escape_type_G:
++m_position;
this->append_state(syntax_element_restart_continue);
break;
default:
this->append_literal(unescape_character());
break;
@ -465,6 +497,7 @@ bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_
case syntax_element_alt:
case syntax_element_soft_buffer_end:
case syntax_element_restart_continue:
case syntax_element_jump:
// can't legally repeat any of the above:
fail(REG_BADRPT, m_position - m_base);
default:
@ -653,6 +686,38 @@ bool basic_regex_parser<charT, traits>::parse_set()
if(parse_inner_set(char_set))
break;
return true;
case regex_constants::syntax_escape:
{
//
// look ahead and see if this is a character class shortcut
// \d \w \s etc...
//
++m_position;
if(this->m_traits.escape_syntax_type(*m_position)
== regex_constants::escape_type_class)
{
char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
if(m)
{
char_set.add_class(m);
break;
}
}
else if(this->m_traits.escape_syntax_type(*m_position)
== regex_constants::escape_type_not_class)
{
// negated character classes aren't supported:
char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
if(m)
{
fail(REG_EESCAPE, m_position - m_base);
}
}
// not a character class, just a regular escape:
--m_position;
parse_set_literal(char_set);
break;
}
default:
parse_set_literal(char_set);
break;
@ -673,6 +738,13 @@ bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, tr
fail(REG_EBRACK, m_position - m_base);
switch(this->m_traits.syntax_type(*m_position))
{
case regex_constants::syntax_dot:
//
// a collating element is treated as a literal:
//
--m_position;
parse_set_literal(char_set);
return true;
case regex_constants::syntax_colon:
{
// check that character classes are actually enabled:
@ -733,6 +805,37 @@ bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, tr
++m_position;
break;
}
case regex_constants::syntax_equal:
{
// skip the '='
if(m_end == ++m_position)
fail(REG_EBRACK, m_position - m_base);
const charT* name_first = m_position;
// skip at least one character, then find the matching '=]'
if(m_end == ++m_position)
fail(REG_EBRACK, m_position - m_base);
while((m_position != m_end)
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
++m_position;
const charT* name_last = m_position;
if(m_end == m_position)
fail(REG_EBRACK, m_position - m_base);
if((m_end == ++m_position)
|| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
fail(REG_EBRACK, m_position - m_base);
string_type m = this->m_traits.lookup_collatename(name_first, name_last);
if((0 == m.size()) || (m.size() > 2))
fail(REG_ECOLLATE, name_first - m_base);
digraph<charT> d;
d.first = m[0];
if(m.size() > 1)
d.second = m[1];
else
d.second = 0;
char_set.add_equivalent(d);
++m_position;
break;
}
default:
--m_position;
parse_set_literal(char_set);
@ -744,7 +847,7 @@ bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, tr
template <class charT, class traits>
void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
{
digraph<charT> start_range = get_next_set_literal();
digraph<charT> start_range = get_next_set_literal(char_set);
if(m_end == m_position)
fail(REG_EBRACK, m_position - m_base);
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
@ -754,7 +857,7 @@ void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT,
fail(REG_EBRACK, m_position - m_base);
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
{
digraph<charT> end_range = get_next_set_literal();
digraph<charT> end_range = get_next_set_literal(char_set);
char_set.add_range(start_range, end_range);
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
fail(REG_ERANGE, m_position - m_base);
@ -766,11 +869,22 @@ void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT,
}
template <class charT, class traits>
digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal()
digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
{
typedef typename traits::string_type string_type;
digraph<charT> result;
switch(this->m_traits.syntax_type(*m_position))
{
case regex_constants::syntax_dash:
if(!char_set.empty())
{
// see if we are at the end of the set:
if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
fail(REG_ERANGE, m_position - m_base);
--m_position;
}
result.first = *m_position++;
return result;
case regex_constants::syntax_escape:
// check to see if escapes are supported first:
if(this->flags() & regex_constants::no_escape_in_lists)
@ -781,6 +895,43 @@ digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal()
++m_position;
result = unescape_character();
break;
case regex_constants::syntax_open_set:
{
if(m_end == ++m_position)
fail(REG_ECOLLATE, m_position - m_base);
if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
{
--m_position;
result.first = *m_position;
++m_position;
return result;
}
if(m_end == ++m_position)
fail(REG_ECOLLATE, m_position - m_base);
const charT* name_first = m_position;
// skip at least one character, then find the matching '.]'
if(m_end == ++m_position)
fail(REG_ECOLLATE, name_first - m_base);
while((m_position != m_end)
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
++m_position;
const charT* name_last = m_position;
if(m_end == m_position)
fail(REG_ECOLLATE, name_first - m_base);
if((m_end == ++m_position)
|| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
fail(REG_ECOLLATE, name_first - m_base);
++m_position;
string_type s = this->m_traits.lookup_collatename(name_first, name_last);
if(s.empty() || (s.size() > 2))
fail(REG_ECOLLATE, name_first - m_base);
result.first = s[0];
if(s.size() > 1)
result.second = s[1];
else
result.second = 0;
return result;
}
default:
result = *m_position++;
}
@ -916,6 +1067,133 @@ bool basic_regex_parser<charT, traits>::parse_backref()
return true;
}
//
// Parses a \Q...\E sequence.  On entry m_position refers to the 'Q';
// every character between the \Q and the terminating \E is appended to
// the state machine as a literal.  Returns true on success.  If the
// expression ends before a \E is found, fail(REG_EESCAPE, ...) is called
// and false is returned.
//
template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_QE()
{
   ++m_position; // skip the Q
   const charT* start = m_position;
   const charT* end;
   do
   {
      // scan forward to the next escape character:
      while((m_position != m_end)
         && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
         ++m_position;
      if((m_position == m_end) || (++m_position == m_end)) // skip the escape
      {
         fail(REG_EESCAPE, m_position - m_base);
         return false;
      }
      // check to see if it's a \E.  We are looking at the character that
      // follows an escape, so it has to be classified with
      // escape_syntax_type() (as is done for escape_type_class elsewhere in
      // this parser): escape_type_E is not a syntax_type() value.
      if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
      {
         ++m_position;
         // the quoted text stops just before the "\E" we have consumed:
         end = m_position - 2;
         break;
      }
      // otherwise go round again:
   }while(true);
   //
   // now add all the characters between the two escapes as literals:
   //
   while(start != end)
   {
      this->append_literal(*start);
      ++start;
   }
   return true;
}
//
// Parses a "(?...)" group.  parse_open_paren() calls this when the
// character after '(' classifies as syntax_question, so on entry
// m_position refers to that character.  A startmark state is appended,
// the character after the '?' selects the extension, the group body is
// parsed with parse_all(), and a matching endmark state is appended.
// Both marks carry the same index: 0 for "(?:" and "(?#", -1 for "(?="
// and -2 for "(?!".
// NOTE(review): -1 / -2 presumably denote positive / negative lookahead
// assertions -- confirm against the matcher's handling of these indexes.
// Returns true; errors are reported through fail().
//
template <class charT, class traits>
bool basic_regex_parser<charT, traits>::parse_perl_extension()
{
// step past the current character; running out of input is an error:
if(++m_position == m_end)
fail(REG_BADRPT, m_position - m_base);
//
// backup some state, and prepare the way:
//
int markid;
// offset of the jump state, left at 0 when the extension appends none:
std::ptrdiff_t jump_offset = 0;
re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
// remember the opening mark by offset rather than by pointer:
std::ptrdiff_t last_paren_start = this->getoffset(pb);
// back up insertion point for alternations, and set new point:
std::ptrdiff_t last_alt_point = m_alt_insert_point;
this->m_pdata->m_data.align();
m_alt_insert_point = this->m_pdata->m_data.size();
//
// select the actual extension used:
//
switch(this->m_traits.syntax_type(*m_position))
{
case regex_constants::syntax_colon:
//
// a non-capturing mark:
//
pb->index = markid = 0;
++m_position;
break;
case regex_constants::syntax_hash:
//
// a comment; this actually becomes an empty non-capturing mark:
//
pb->index = markid = 0;
// skip the comment text, stopping at the closing mark (or end of input):
while((m_position != m_end)
&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
++m_position;
break;
case regex_constants::syntax_equal:
// index -1; a jump state follows the opening mark and is fixed up below:
pb->index = markid = -1;
++m_position;
jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
// alternatives inside the group are inserted after the jump state:
this->m_pdata->m_data.align();
m_alt_insert_point = this->m_pdata->m_data.size();
break;
case regex_constants::syntax_not:
// index -2; otherwise handled exactly like the syntax_equal case above:
pb->index = markid = -2;
++m_position;
jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
this->m_pdata->m_data.align();
m_alt_insert_point = this->m_pdata->m_data.size();
break;
default:
// any other character after the '?' is rejected:
fail(REG_BADRPT, m_position - m_base);
}
//
// now recursively add more states, this will terminate when we get to a
// matching ')' :
//
parse_all();
//
// we either have a ')' or we have run out of characters prematurely:
//
if(m_position == m_end)
this->fail(REG_EPAREN, std::distance(m_base, m_end));
BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
++m_position;
//
// set up the jump pointer if we have one:
//
if(jump_offset)
{
this->m_pdata->m_data.align();
// re-fetch the jump state from its offset, then store the distance from
// it to the current end of the data (where the endmark is appended next):
re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
}
//
// append closing parenthesis state:
//
pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
pb->index = markid;
// m_paren_start is set to the offset of this group's opening mark:
this->m_paren_start = last_paren_start;
//
// restore the alternate insertion point:
//
this->m_alt_insert_point = last_alt_point;
return true;
}
} // namespace re_detail
} // namespace boost

View File

@ -25,6 +25,9 @@
#ifdef BOOST_HAS_THREADS
#include <boost/regex/static_mutex.hpp>
#endif
#ifndef BOOST_REGEX_PRIMARY_TRANSFORM
#include <boost/regex/v4/primary_transform.hpp>
#endif
namespace boost{
@ -272,7 +275,7 @@ typename cpp_regex_traits_char_layer<charT>::string_type
// specialised version for narrow characters:
//
template <>
class cpp_regex_traits_char_layer<char> : public cpp_regex_traits_base<char>
class BOOST_REGEX_DECL cpp_regex_traits_char_layer<char> : public cpp_regex_traits_base<char>
{
typedef std::string string_type;
public:
@ -326,6 +329,7 @@ public:
typedef std::basic_string<charT> string_type;
typedef charT char_type;
//cpp_regex_traits_implementation();
cpp_regex_traits_implementation(const std::locale& l);
std::string error_string(regex_constants::error_type n) const
@ -348,16 +352,88 @@ public:
}
return result;
}
string_type lookup_collatename(const charT* p1, const charT* p2) const;
string_type transform_primary(const charT* p1, const charT* p2) const;
string_type transform(const charT* p1, const charT* p2) const
{
return this->m_pcollate->transform(p1, p2);
}
re_detail::parser_buf<charT> m_sbuf; // buffer for parsing numbers.
std::basic_istream<charT> m_is; // stream for parsing numbers.
private:
std::map<int, std::string> m_error_strings; // error messages indexed by numberic ID
std::map<string_type, char_class_type> m_custom_class_names; // character class names
std::map<string_type, string_type> m_custom_collate_names; // collating element names
unsigned m_collate_type; // the form of the collation string
charT m_collate_delim; // the collation group delimiter
//
// helpers:
//
char_class_type lookup_classname_imp(const charT* p1, const charT* p2) const;
};
//
// Returns a primary sort key for the character sequence [p1, p2).
// The key is derived from the locale's regular sort key
// (m_pcollate->transform); how it is derived depends on m_collate_type,
// which the constructor obtains from re_detail::find_sort_syntax.
//
template <class charT>
typename cpp_regex_traits_implementation<charT>::string_type
   cpp_regex_traits_implementation<charT>::transform_primary(const charT* p1, const charT* p2) const
{
   string_type result;
   //
   // What we do here depends upon the format of the sort key returned by
   // this->transform:
   //
   switch(m_collate_type)
   {
   case sort_C:
   case sort_unknown:
      // the best we can do is translate to lower case, then get a regular sort key:
      {
         result.assign(p1, p2);
         if(!result.empty())
         {
            // only form pointers into the buffer when there is one, and
            // never dereference end():
            charT* const first = &result[0];
            this->m_pctype->tolower(first, first + result.size());
         }
         result = this->m_pcollate->transform(result.data(), result.data() + result.size());
         break;
      }
   case sort_fixed:
      {
         // get a regular sort key for the input, and then truncate it to the
         // fixed width held in m_collate_delim:
         result.assign(this->m_pcollate->transform(p1, p2));
         const typename string_type::size_type key_width =
            static_cast<typename string_type::size_type>(this->m_collate_delim);
         // a key that is already shorter than the fixed width is kept as is
         // (erase() would throw std::out_of_range for a position past the end):
         if(key_width < result.size())
            result.erase(key_width);
         break;
      }
   case sort_delim:
      {
         // get a regular sort key for the input, and then drop the delimiter
         // and everything after it:
         result.assign(this->m_pcollate->transform(p1, p2));
         const typename string_type::size_type delim_pos = result.find(this->m_collate_delim);
         if(delim_pos != string_type::npos)
            result.erase(delim_pos);
         break;
      }
   }
   return result;
}
//
// Translates the collating element name [p1, p2) into the character
// sequence it denotes.  Lookup order: the custom names held in
// m_custom_collate_names, then lookup_default_collate_name(), and finally
// a single character name stands for itself.  An empty string is returned
// when nothing matches.
//
template <class charT>
typename cpp_regex_traits_implementation<charT>::string_type
   cpp_regex_traits_implementation<charT>::lookup_collatename(const charT* p1, const charT* p2) const
{
   typedef typename std::map<string_type, string_type>::const_iterator name_iterator;
   // custom names take precedence over the defaults:
   if(!m_custom_collate_names.empty())
   {
      const name_iterator hit = m_custom_collate_names.find(string_type(p1, p2));
      if(hit != m_custom_collate_names.end())
         return hit->second;
   }
   // fall back to the default table, which works on narrow strings:
   const std::string builtin = lookup_default_collate_name(std::string(p1, p2));
   if(!builtin.empty())
      return string_type(builtin.begin(), builtin.end());
   // a lone character names itself, anything else is unknown:
   return (p2 - p1 == 1) ? string_type(1, *p1) : string_type();
}
template <class charT>
cpp_regex_traits_implementation<charT>::cpp_regex_traits_implementation(const std::locale& l)
: cpp_regex_traits_char_layer<charT>(l), m_is(&m_sbuf)
@ -385,6 +461,9 @@ cpp_regex_traits_implementation<charT>::cpp_regex_traits_implementation(const st
//
if((int)cat >= 0)
{
//
// Error messages:
//
for(boost::regex_constants::error_type i = 0; i <= boost::regex_constants::error_unknown; ++i)
{
const char* p = get_default_error_string(i);
@ -402,7 +481,38 @@ cpp_regex_traits_implementation<charT>::cpp_regex_traits_implementation(const st
}
m_error_strings[i] = result;
}
//
// Custom class names:
//
static const char_class_type masks[] =
{
std::ctype<charT>::alnum,
std::ctype<charT>::alpha,
std::ctype<charT>::cntrl,
std::ctype<charT>::digit,
std::ctype<charT>::graph,
std::ctype<charT>::lower,
std::ctype<charT>::print,
std::ctype<charT>::punct,
std::ctype<charT>::space,
std::ctype<charT>::upper,
std::ctype<charT>::xdigit,
cpp_regex_traits_implementation<charT>::mask_blank,
cpp_regex_traits_implementation<charT>::mask_word,
cpp_regex_traits_implementation<charT>::mask_unicode,
};
static const string_type null_string;
for(unsigned int j = 0; j <= 13; ++j)
{
string_type s(this->m_pmessages->get(cat, 0, j+300, null_string));
if(s.size())
this->m_custom_class_names[s] = masks[j];
}
}
//
// get the collation format used by m_pcollate:
//
m_collate_type = re_detail::find_sort_syntax(this, &m_collate_delim);
}
template <class charT>
@ -432,6 +542,13 @@ typename cpp_regex_traits_implementation<charT>::char_class_type
std::ctype<char>::alnum | cpp_regex_traits_implementation<charT>::mask_word,
std::ctype<char>::xdigit,
};
if(m_custom_class_names.size())
{
typedef typename std::map<std::basic_string<charT>, char_class_type>::const_iterator map_iter;
map_iter pos = m_custom_class_names.find(string_type(p1, p2));
if(pos != m_custom_class_names.end())
return pos->second;
}
std::size_t id = 1 + re_detail::get_default_class_id(p1, p2);
assert(id < sizeof(masks) / sizeof(masks[0]));
return masks[id];
@ -491,9 +608,9 @@ public:
{
return m_pimpl->m_pcollate->transform(p1, p2);
}
string_type transform_primary(const charT* , const charT* ) const
string_type transform_primary(const charT* p1, const charT* p2) const
{
return string_type();
return m_pimpl->transform_primary(p1, p2);
}
char_class_type lookup_classname(const charT* p1, const charT* p2) const
{
@ -501,7 +618,7 @@ public:
}
string_type lookup_collatename(const charT* p1, const charT* p2) const
{
return string_type();
return m_pimpl->lookup_collatename(p1, p2);
}
bool is_class(charT c, char_class_type f) const
{

View File

@ -47,7 +47,11 @@ typedef size_t regsize_t;
typedef struct
{
unsigned int re_magic;
#ifdef __cplusplus
std::size_t re_nsub; /* number of parenthesized subexpressions */
#else
size_t re_nsub;
#endif
const char* re_endp; /* end pointer for REG_PEND */
void* guts; /* none of your business :-) */
match_flag_type eflags; /* none of your business :-) */
@ -57,7 +61,11 @@ typedef struct
typedef struct
{
unsigned int re_magic;
#ifdef __cplusplus
std::size_t re_nsub; /* number of parenthesized subexpressions */
#else
size_t re_nsub;
#endif
const wchar_t* re_endp; /* end pointer for REG_PEND */
void* guts; /* none of your business :-) */
match_flag_type eflags; /* none of your business :-) */

View File

@ -52,13 +52,13 @@ static const reg_error_t REG_ESPACE = 12; /* Ran out of memory. */
static const reg_error_t REG_BADRPT = 13; /* No preceding re for repetition op. */
static const reg_error_t REG_EEND = 14; /* unexpected end of expression */
static const reg_error_t REG_ESIZE = 15; /* expression too big */
static const reg_error_t REG_ERPAREN = REG_EPAREN; /* unmatched right parenthesis */
static const reg_error_t REG_ERPAREN = 8; /* = REG_EPAREN : unmatched right parenthesis */
static const reg_error_t REG_EMPTY = 17; /* empty expression */
static const reg_error_t REG_E_MEMORY = REG_ESIZE; /* out of memory */
static const reg_error_t REG_E_MEMORY = 15; /* = REG_ESIZE : out of memory */
static const reg_error_t REG_ECOMPLEXITY = 18; /* complexity too high */
static const reg_error_t REG_ESTACK = 19; /* out of stack space */
static const reg_error_t REG_E_UNKNOWN = 20; /* unknown error */
static const reg_error_t REG_ENOSYS = REG_E_UNKNOWN; /* Reserved. */
static const reg_error_t REG_ENOSYS = 20; /* = REG_E_UNKNOWN : Reserved. */
#ifdef __cplusplus
namespace regex_constants{

View File

@ -153,7 +153,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_all_states()
{
if(state_count > max_state_count)
raise_error(traits_inst, REG_ESPACE);
if((m_match_flags & match_partial) && (position == last))
if((m_match_flags & match_partial) && (position == last) && (position != search_base))
m_has_partial_match = true;
if(false == unwind(false))
return m_recursive_result;

View File

@ -86,7 +86,7 @@ bool perl_matcher<BidiIterator, Allocator, traits>::match_all_states()
++state_count;
if(!(this->*proc)())
{
if((m_match_flags & match_partial) && (position == last))
if((m_match_flags & match_partial) && (position == last) && (position != search_base))
m_has_partial_match = true;
return 0;
}

View File

@ -96,7 +96,7 @@ enum{
// this is used by basic_regex for expression storage
//
class raw_storage
class BOOST_REGEX_DECL raw_storage
{
public:
typedef std::size_t size_type;

View File

@ -21,11 +21,11 @@
namespace boost{ namespace re_detail{
const char* get_default_syntax(regex_constants::syntax_type n);
const char* get_default_error_string(regex_constants::error_type n);
BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants::syntax_type n);
BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_error_string(regex_constants::error_type n);
// is charT c a combining character?
bool is_combining_implementation(uint_least16_t s);
BOOST_REGEX_DECL bool BOOST_REGEX_CALL is_combining_implementation(uint_least16_t s);
template <class charT>
inline bool is_combining(charT c)
@ -75,9 +75,14 @@ inline bool is_combining<wchar_t>(wchar_t c)
template <class charT>
inline bool is_separator(charT c)
{
return BOOST_REGEX_MAKE_BOOL((c == '\n') || (c == '\r'));
return BOOST_REGEX_MAKE_BOOL((c == '\n') || (c == '\r') || (static_cast<int>(c) == 0x2028) || (static_cast<int>(c) == 0x2029));
}
//
// get a default collating element:
//
BOOST_REGEX_DECL std::string BOOST_REGEX_CALL lookup_default_collate_name(const std::string& name);
//
// get the id of a character classification, the individual
// traits classes then transform that id into a bitmask:

View File

@ -16,6 +16,7 @@
* DESCRIPTION: Implements cpp_regex_traits<char> (and associated helper classes).
*/
#define BOOST_REGEX_SOURCE
#include <boost/regex/regex_traits.hpp>
namespace boost{ namespace re_detail{

View File

@ -1,5 +1,6 @@
#define BOOST_REGEX_SOURCE
#include <boost/regex/v4/regex_raw_buffer.hpp>
namespace boost{ namespace re_detail{

View File

@ -16,11 +16,12 @@
* DESCRIPTION: Declares API's for access to regex_traits default properties.
*/
#define BOOST_REGEX_SOURCE
#include <boost/regex/regex_traits.hpp>
namespace boost{ namespace re_detail{
const char* get_default_syntax(regex_constants::syntax_type n)
BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants::syntax_type n)
{
// if the user hasn't supplied a message catalog, then this supplies
// default "messages" for us to load in the range 1-100.
@ -77,13 +78,13 @@ const char* get_default_syntax(regex_constants::syntax_type n)
"X",
"C",
"Z",
"G"
"G",
"!", };
return ((n >= (sizeof(messages) / sizeof(messages[1]))) ? "" : messages[n]);
}
const char* get_default_error_string(regex_constants::error_type n)
BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_error_string(regex_constants::error_type n)
{
static const char* const s_default_error_messages[] = {
"Success", /* REG_NOERROR */
@ -115,7 +116,7 @@ const char* get_default_error_string(regex_constants::error_type n)
return (n > REG_E_UNKNOWN) ? s_default_error_messages[REG_E_UNKNOWN] : s_default_error_messages[n];
}
bool is_combining_implementation(boost::uint_least16_t c)
BOOST_REGEX_DECL bool BOOST_REGEX_CALL is_combining_implementation(boost::uint_least16_t c)
{
const boost::uint_least16_t combining_ranges[] = { 0x0300, 0x0361,
0x0483, 0x0486,
@ -164,5 +165,80 @@ bool is_combining_implementation(boost::uint_least16_t c)
return false;
}
//
// These are the POSIX collating names for single characters.
// The table is indexed by character code: lookup_default_collate_name()
// returns a one-character string holding char(i) when entry i matches.
// The final empty string is a sentinel that terminates the lookup loop,
// so it must remain the last entry.
//
BOOST_REGEX_DECL const char* def_coll_names[] = {
"NUL", "SOH", "STX", "ETX", "EOT", "ENQ", "ACK", "alert", "backspace", "tab", "newline",
"vertical-tab", "form-feed", "carriage-return", "SO", "SI", "DLE", "DC1", "DC2", "DC3", "DC4", "NAK",
"SYN", "ETB", "CAN", "EM", "SUB", "ESC", "IS4", "IS3", "IS2", "IS1", "space", "exclamation-mark",
"quotation-mark", "number-sign", "dollar-sign", "percent-sign", "ampersand", "apostrophe",
"left-parenthesis", "right-parenthesis", "asterisk", "plus-sign", "comma", "hyphen",
"period", "slash", "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
"colon", "semicolon", "less-than-sign", "equals-sign", "greater-than-sign",
"question-mark", "commercial-at", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
"Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "left-square-bracket", "backslash",
"right-square-bracket", "circumflex", "underscore", "grave-accent", "a", "b", "c", "d", "e", "f",
"g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "left-curly-bracket",
"vertical-line", "right-curly-bracket", "tilde", "DEL", "",
};
// These multi-character collating elements should keep most
// Western-European locales happy - we should really localise
// these a little more - but this will have to do for now.
// lookup_default_collate_name() returns a matching entry verbatim.
// The final empty string is a sentinel that terminates the lookup loop,
// so it must remain the last entry.
BOOST_REGEX_DECL const char* def_multi_coll[] = {
"ae",
"Ae",
"AE",
"ch",
"Ch",
"CH",
"ll",
"Ll",
"LL",
"ss",
"Ss",
"SS",
"nj",
"Nj",
"NJ",
"dz",
"Dz",
"DZ",
"lj",
"Lj",
"LJ",
"",
};
//
// Translate a default collating-element name into the text it stands for.
//
// name: the collating element name to look up (compared case-sensitively).
//
// Returns:
//  - a one-character string whose character code is the index of the
//    matching entry in def_coll_names, when name is a single-character name;
//  - the matching entry of def_multi_coll itself, when name is one of the
//    multi-character collating elements;
//  - an empty string when name is found in neither table.
//
BOOST_REGEX_DECL std::string BOOST_REGEX_CALL lookup_default_collate_name(const std::string& name)
{
   // Both tables end with an empty string, which stops the scans below.
   for(unsigned int pos = 0; *def_coll_names[pos]; ++pos)
   {
      if(name == def_coll_names[pos])
         return std::string(1, char(pos));
   }
   for(unsigned int pos = 0; *def_multi_coll[pos]; ++pos)
   {
      if(name == def_multi_coll[pos])
         return std::string(def_multi_coll[pos]);
   }
   // no such name:
   return std::string();
}
} // re_detail
} // boost

View File

@ -16,6 +16,7 @@
* DESCRIPTION: Declares static_mutex lock type.
*/
#define BOOST_REGEX_SOURCE
#include <boost/config.hpp>
#ifdef BOOST_HAS_THREADS
@ -86,7 +87,7 @@ void scoped_static_mutex_lock::lock()
{
if(0 == m_have_lock)
{
#if defined(BOOST_MSVC) && (BOOST_MSVC <=1200)
#if !defined(InterlockedCompareExchangePointer)
while(0 != InterlockedCompareExchange(reinterpret_cast<void**>((boost::uint_least16_t*)&(m_mutex.m_mutex)), (void*)1, 0))
#else
while(0 != InterlockedCompareExchange(reinterpret_cast<volatile LONG*>(&(m_mutex.m_mutex)), 1, 0))
@ -102,7 +103,7 @@ void scoped_static_mutex_lock::unlock()
{
if(m_have_lock)
{
#if defined(BOOST_MSVC) && (BOOST_MSVC <=1200)
#if !defined(InterlockedCompareExchangePointer)
InterlockedExchange((LONG*)&(m_mutex.m_mutex), 0);
#else
InterlockedExchange(reinterpret_cast<volatile LONG*>(&(m_mutex.m_mutex)), 0);
@ -121,7 +122,7 @@ void scoped_static_mutex_lock::unlock()
boost::recursive_mutex* static_mutex::m_pmutex = 0;
boost::once_flag static_mutex::m_once = BOOST_ONCE_INIT;
extern "C" void free_static_mutex()
extern "C" BOOST_REGEX_DECL void free_static_mutex()
{
delete static_mutex::m_pmutex;
static_mutex::m_pmutex = 0;

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,14 @@ int cpp_main(int argc, char * argv[])
test_character_escapes();
test_assertion_escapes();
test_tricky_cases();
test_tricky_cases2();
test_grep();
test_replace();
test_non_greedy_repeats();
test_non_marking_paren();
test_partial_match();
test_forward_lookahead_asserts();
test_fast_repeats();
return error_count;
}

View File

@ -4,6 +4,7 @@
#define BOOST_REGEX_REGRESS_TEST_HPP
#include "test_not_regex.hpp"
#include "test_regex_search.hpp"
#include "test_regex_replace.hpp"
//
@ -81,6 +82,45 @@ const int* make_array(int first, ...);
TEST_REGEX_SEARCH_N(s, f, t, m, a);\
TEST_REGEX_SEARCH_W(BOOST_JOIN(L, s), f, BOOST_JOIN(L, t), m, a)
//
// define macros for testing regex replaces:
//
// TEST_REGEX_REPLACE_N: narrow-character (char) replace test.
// Each of s, t, fs and r is first stored in a char array and then copied
// into a std::string of length sizeof(array) - 1; for string literal
// arguments this drops the terminating NUL while keeping any embedded NULs.
// Those strings, together with f, m, a literal 0 and the invoking
// __FILE__/__LINE__, are recorded with test_info<char>::set_info, after
// which the test is dispatched via test(char(0), test_regex_replace_tag()).
// NOTE(review): s/f/t/m/fs/r look like expression, syntax flags, search
// text, match flags, format string and expected result - confirm against
// the declaration of test_info<char>::set_info.
// The do{...}while(0) wrapper makes an invocation behave as one statement.
#define TEST_REGEX_REPLACE_N(s, f, t, m, fs, r)\
do{\
const char e[] = { s };\
std::string se(e, sizeof(e) - 1);\
const char st[] = { t };\
std::string sst(st, sizeof(st) - 1);\
const char ft[] = { fs };\
std::string sft(ft, sizeof(ft) - 1);\
const char rt[] = { r };\
std::string srt(rt, sizeof(rt) - 1);\
test_info<char>::set_info(__FILE__, __LINE__, se, f, sst, m, 0, sft, srt);\
test(char(0), test_regex_replace_tag());\
}while(0)
// TEST_REGEX_REPLACE_W: wide-character (wchar_t) counterpart of
// TEST_REGEX_REPLACE_N. The string lengths are computed in characters,
// (sizeof(array) / sizeof(wchar_t)) - 1, the data is recorded with
// test_info<wchar_t>::set_info and the test is dispatched via
// test(wchar_t(0), test_regex_replace_tag()).
// When BOOST_NO_WREGEX is defined the macro expands to nothing, so the
// wide-character tests are silently skipped.
#ifndef BOOST_NO_WREGEX
#define TEST_REGEX_REPLACE_W(s, f, t, m, fs, r)\
do{\
const wchar_t e[] = { s };\
std::wstring se(e, (sizeof(e) / sizeof(wchar_t)) - 1);\
const wchar_t st[] = { t };\
std::wstring sst(st, (sizeof(st) / sizeof(wchar_t)) - 1);\
const wchar_t ft[] = { fs };\
std::wstring sft(ft, (sizeof(ft) / sizeof(wchar_t)) - 1);\
const wchar_t rt[] = { r };\
std::wstring srt(rt, (sizeof(rt) / sizeof(wchar_t)) - 1);\
test_info<wchar_t>::set_info(__FILE__, __LINE__, se, f, sst, m, 0, sft, srt);\
test(wchar_t(0), test_regex_replace_tag());\
}while(0)
#else
// no wide character support: the wide test is a no-op.
#define TEST_REGEX_REPLACE_W(s, f, t, m, fs, r)
#endif
// TEST_REGEX_REPLACE: runs the narrow test and then the wide test on the
// same data. The wide arguments are built with BOOST_JOIN(L, x), which is
// expected to paste an L prefix onto each string literal, so s, t, fs and r
// must be plain narrow string literals.
// NOTE: unlike the _N/_W macros this expands to two statements, so it must
// not be used as the unbraced body of an if/else/for/while.
#define TEST_REGEX_REPLACE(s, f, t, m, fs, r)\
TEST_REGEX_REPLACE_N(s, f, t, m, fs, r);\
TEST_REGEX_REPLACE_W(BOOST_JOIN(L, s), f, BOOST_JOIN(L, t), m, BOOST_JOIN(L, fs), BOOST_JOIN(L, r))
//
// define the test group procedures:
//
@ -92,6 +132,13 @@ void test_backrefs();
void test_character_escapes();
void test_assertion_escapes();
void test_tricky_cases();
void test_grep();
void test_replace();
void test_non_greedy_repeats();
void test_non_marking_paren();
void test_partial_match();
void test_forward_lookahead_asserts();
void test_fast_repeats();
void test_tricky_cases2();
#endif

View File

@ -16,7 +16,10 @@ void test_sub_match(const boost::sub_match<BidirectionalIterator>& sub, Bidirect
#pragma warning(disable:4244)
#endif
typedef typename boost::sub_match<BidirectionalIterator>::value_type charT;
if(sub.matched == 0)
if((sub.matched == 0)
&&
!((i == 0)
&& (test_info<charT>::match_options() & boost::match_partial)) )
{
if(answer_table[2*i] >= 0)
{
@ -80,6 +83,101 @@ void test_simple_search(boost::basic_regex<charT, traits>& r)
}
}
//
// Walk the current search text with a boost::regex_iterator and check
// every match it yields against the current answer table.
//
// r: the (already assigned) expression to search with.
//
// The answer table is consumed one record per match; each record ends
// with a -2 marker. If a non-negative entry is still pending once the
// iterator is exhausted, a match that was expected has been missed and
// an error is reported.
//
template<class charT, class traits>
void test_regex_iterator(boost::basic_regex<charT, traits>& r)
{
   typedef typename std::basic_string<charT>::const_iterator const_iterator;
   typedef boost::regex_iterator<const_iterator> test_iterator;

   const std::basic_string<charT>& text = test_info<charT>::search_text();
   boost::regex_constants::match_flag_type flags = test_info<charT>::match_options();
   const int* expected = test_info<charT>::answer_table();

   test_iterator pos(text.begin(), text.end(), r, flags);
   test_iterator last;
   for(; pos != last; ++pos)
   {
      test_result(*pos, text.begin(), expected);
      // step over the record just checked, including its -2 terminator:
      while(*expected != -2)
         ++expected;
      ++expected;
   }
   if(expected[0] >= 0)
   {
      // we should have had a match but didn't:
      BOOST_REGEX_TEST_ERROR("Expected match was not found.", charT);
   }
}
//
// Callback object handed to boost::regex_grep by test_regex_grep: each
// match reported is checked against the next record of the answer table.
//
template <class charT, class traits>
struct grep_test_predicate
{
   typedef typename std::basic_string<charT>::const_iterator test_iter;

   // b: iterator to the start of the text being searched.
   // a: pointer to the first record of the answer table.
   grep_test_predicate(test_iter b, const int* a)
      : m_origin(b), m_answers(a)
   {}

   // Verify one match, then advance to the next answer record.
   // Always returns true, which (per the regex_grep contract) asks the
   // search to carry on looking for further matches.
   bool operator()(const boost::match_results<test_iter>& what)
   {
      test_result(what, m_origin, m_answers);
      // step over the record just checked, including its -2 terminator:
      while(*m_answers != -2)
         ++m_answers;
      ++m_answers;
      return true;
   }
private:
   test_iter m_origin;     // start of the searched text
   const int* m_answers;   // next unchecked answer record
};
//
// Run boost::regex_grep over the current search text, checking every
// match reported against the current answer table by way of a
// grep_test_predicate.
//
// r: the (already assigned) expression to search with.
//
template<class charT, class traits>
void test_regex_grep(boost::basic_regex<charT, traits>& r)
{
   const std::basic_string<charT>& text = test_info<charT>::search_text();
   boost::regex_constants::match_flag_type flags = test_info<charT>::match_options();
   const int* expected = test_info<charT>::answer_table();

   grep_test_predicate<charT, traits> checker(text.begin(), expected);
   boost::regex_grep(checker, text, r, flags);
}
//
// Check boost::regex_match against the first record of the answer table.
//
// r: the (already assigned) expression to match with.
//
//  - If the first answer is non-zero (no match expected at all, or the
//    expected match does not start at the beginning of the text) then
//    regex_match must fail.
//  - If the expected match covers the whole text ([0, size)) then
//    regex_match must succeed and its sub-matches are verified.
//  - Otherwise (match starts at 0 but stops short of the end) nothing
//    is checked.
//
template<class charT, class traits>
void test_regex_match(boost::basic_regex<charT, traits>& r)
{
   typedef typename std::basic_string<charT>::const_iterator const_iterator;
   typedef typename std::basic_string<charT>::size_type size_type;

   const std::basic_string<charT>& text = test_info<charT>::search_text();
   boost::regex_constants::match_flag_type flags = test_info<charT>::match_options();
   const int* expected = test_info<charT>::answer_table();
   boost::match_results<const_iterator> what;

   if(expected[0] != 0)
   {
      // a whole-text match is impossible here:
      if(boost::regex_match(text, r, flags))
      {
         BOOST_REGEX_TEST_ERROR("boost::regex_match found a match when it should not have done so.", charT);
      }
      return;
   }

   if(static_cast<size_type>(expected[1]) != text.size())
   {
      // expected match is only a prefix of the text: nothing to verify.
      return;
   }

   if(boost::regex_match(text.begin(), text.end(), what, r, flags))
   {
      test_result(what, text.begin(), expected);
   }
   else
   {
      // we should have had a match but didn't:
      BOOST_REGEX_TEST_ERROR("Expected match was not found.", charT);
   }
}
template<class charT, class traits>
void test(boost::basic_regex<charT, traits>& r, const test_regex_search_tag&)
{
@ -88,6 +186,9 @@ void test(boost::basic_regex<charT, traits>& r, const test_regex_search_tag&)
try{
r.assign(expression, syntax_options);
test_simple_search(r);
test_regex_iterator(r);
test_regex_grep(r);
test_regex_match(r);
}
catch(const boost::bad_expression& e)
{