Added support for Perl style \N \P and \p.

Completed first draft of Unicode UCS-4 support.
Broken compiler compatibility fixes.
Added unicode_iterators.


[SVN r26185]
This commit is contained in:
John Maddock
2004-11-11 17:04:17 +00:00
parent f141de61ec
commit d35e5a088e
17 changed files with 1051 additions and 46 deletions

View File

@ -42,6 +42,7 @@ SOURCES =
cpp_regex_traits.cpp
cregex.cpp
fileiter.cpp
icu.cpp
instances.cpp
posix_api.cpp
regex.cpp

View File

@ -179,7 +179,7 @@
# define BOOST_REGEX_DECL
#endif
#if (defined(BOOST_MSVC) || defined(__BORLANDC__)) && !defined(BOOST_REGEX_NO_LIB) && !defined(BOOST_REGEX_SOURCE) && !defined(BOOST_ALL_NO_LIB) && defined(__cplusplus)
#if !defined(BOOST_REGEX_NO_LIB) && !defined(BOOST_REGEX_SOURCE) && !defined(BOOST_ALL_NO_LIB) && defined(__cplusplus)
# define BOOST_LIB_NAME boost_regex
# if defined(BOOST_REGEX_DYN_LINK) || defined(BOOST_ALL_DYN_LINK)
# define BOOST_DYN_LINK

View File

@ -177,6 +177,8 @@ private:
offset_xdigit = U_CHAR_CATEGORY_COUNT+2,
offset_underscore = U_CHAR_CATEGORY_COUNT+3,
offset_unicode = U_CHAR_CATEGORY_COUNT+4,
offset_any = U_CHAR_CATEGORY_COUNT+5,
offset_ascii = U_CHAR_CATEGORY_COUNT+6,
};
//
@ -187,6 +189,10 @@ private:
static const char_class_type mask_xdigit;
static const char_class_type mask_underscore;
static const char_class_type mask_unicode;
static const char_class_type mask_any;
static const char_class_type mask_ascii;
static char_class_type lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2);
boost::shared_ptr< ::boost::re_detail::icu_regex_traits_implementation> m_pimpl;
};

View File

@ -49,35 +49,63 @@ inline bool is_surrogate(T v)
return (v & 0xF800u) == 0xd800;
}
//
// Returns the total number of bytes (lead byte included) in the UTF-8
// sequence introduced by the lead byte c.
//
// The length is the number of consecutive 1 bits at the top of c:
//   0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4.
// A byte with a single leading 1 bit (10xxxxxx, i.e. a continuation byte)
// also reports 1.
//
// Bytes 0xF8-0xFF have five or more leading 1 bits; no valid UTF-8 sequence
// is longer than 4 bytes, so the result is capped at 4.  Without the cap a
// caller would step up to 8 positions forward, or use the trailing byte
// count (up to 7) as an index into a 4-entry table.
//
inline unsigned utf8_byte_count(boost::uint8_t c)
{
   // if the most significant bit with a zero in it is in position
   // 8-N then there are N bytes in this UTF-8 sequence:
   boost::uint8_t mask = 0x80u;
   unsigned result = 0;
   while(c & mask)
   {
      ++result;
      mask >>= 1;
   }
   // zero leading ones is a one byte (ASCII) sequence, and anything that
   // claims to be longer than 4 bytes is clamped to the UTF-8 maximum:
   return (result == 0) ? 1 : ((result > 4) ? 4 : result);
}
//
// Returns how many bytes follow the lead byte c in its UTF-8 sequence,
// i.e. the full sequence length reported by utf8_byte_count(c) less the
// lead byte itself.
//
inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
{
   const unsigned whole_sequence = utf8_byte_count(c);
   return whole_sequence - 1;
}
}
template <class BaseIterator, class U16Type = ::boost::uint16_t>
class u32_to_u16_iterator
: public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
{
typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
#endif
public:
typename base_type::reference
dereference()const
{
if(m_current == 2)
const_cast<u32_to_u16_iterator*>(this)->extract_current();
extract_current();
return m_values[m_current];
}
bool equal(const u32_to_u16_iterator& that)const
{
if(m_position == that.m_position)
{
// Both m_currents must be equal, or both even
// this is the same as saying their sum must be even:
return (m_current + that.m_current) & 1u ? false : true;
/*
if((m_current >= 2) && (that.m_current < 2))
const_cast<u32_to_u16_iterator*>(this)->extract_current();
else if((m_current < 2) && (that.m_current >= 2))
const_cast<u32_to_u16_iterator&>(that).extract_current();
if(m_current == that.m_current)
return true;
*/
}
return false;
}
@ -127,7 +155,7 @@ public:
m_values[2] = 0;
}
private:
void invalid_code_point(::boost::uint32_t val)
static void invalid_code_point(::boost::uint32_t val)
{
#ifndef BOOST_NO_STD_LOCALE
std::stringstream ss;
@ -139,15 +167,15 @@ private:
boost::throw_exception(e);
}
void extract_current()
void extract_current()const
{
// begin by checking for a code point out of range:
if(static_cast< ::boost::uint32_t>(*m_position) >= 0x10000u)
::boost::uint32_t v = *m_position;
if(v >= 0x10000u)
{
if(static_cast< ::boost::uint32_t>(*m_position) > 0x10FFFFu)
if(v > 0x10FFFFu)
invalid_code_point(*m_position);
// split into two surrogates:
base_value_type v = *m_position;
m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
m_current = 0;
@ -166,29 +194,31 @@ private:
}
}
BaseIterator m_position;
U16Type m_values[3];
unsigned m_current;
mutable U16Type m_values[3];
mutable unsigned m_current;
};
template <class BaseIterator, class U32Type = ::boost::uint32_t>
class u16_to_u32_iterator
: public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
{
typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
// special values for pending iterator reads:
BOOST_STATIC_CONSTANT(::boost::uint32_t, pending_read = 0xffffffffu);
#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
#endif
public:
typename base_type::reference
dereference()const
{
if(m_value == pending_read)
const_cast<u16_to_u32_iterator*>(this)->extract_current();
extract_current();
return m_value;
}
bool equal(const u16_to_u32_iterator& that)const
@ -223,7 +253,7 @@ public:
m_value = pending_read;
}
private:
void invalid_code_point(::boost::uint16_t val)
static void invalid_code_point(::boost::uint16_t val)
{
#ifndef BOOST_NO_STD_LOCALE
std::stringstream ss;
@ -234,28 +264,254 @@ private:
#endif
boost::throw_exception(e);
}
void extract_current()
void extract_current()const
{
m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
// if the last value is a high surrogate then adjust m_position and m_value as needed:
if(detail::is_high_surrogate(*m_position))
{
// precondition; next value must be a low-surrogate:
::boost::uint16_t t = *++m_position;
if((*m_position & 0xFC00u) != 0xDC00u)
BaseIterator next(m_position);
::boost::uint16_t t = *++next;
if((t & 0xFC00u) != 0xDC00u)
invalid_code_point(t);
m_value = (m_value - detail::high_surrogate_base) << 10;
m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
--m_position;
}
// postcondition; result must not be a surrogate:
if(detail::is_surrogate(m_value))
invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
}
BaseIterator m_position;
U32Type m_value;
mutable U32Type m_value;
};
}
//
// u32_to_u8_iterator:
// Bidirectional iterator adaptor that presents a sequence of 32-bit code
// points (read through BaseIterator) as the equivalent sequence of UTF-8
// bytes of type U8Type.
//
// State:
//   m_position  - the code point currently being encoded.
//   m_values    - the encoded bytes of *m_position; unused slots hold 0 and
//                 slot 4 is always 0 so it terminates a 4 byte sequence.
//   m_current   - index into m_values of the byte we refer to, or 4 when
//                 *m_position has not been encoded yet ("pending read").
//
template <class BaseIterator, class U8Type = ::boost::uint8_t>
class u32_to_u8_iterator
   : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
{
   typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;

#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
   typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;

   // the underlying sequence must be 32 bits wide, the output 8 bits wide:
   BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
   BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
#endif

public:
   // Returns the current UTF-8 byte, encoding *m_position first if that
   // has not happened yet.
   typename base_type::reference
      dereference()const
   {
      if(4 == m_current)
         extract_current();
      return m_values[m_current];
   }
   // Two iterators are equal when they refer to the same code point and
   // to the same byte within it.
   bool equal(const u32_to_u8_iterator& that)const
   {
      if(!(m_position == that.m_position))
         return false;
      if(m_current == that.m_current)
         return true;
      // an index of 0 and a pending read (4) both mean "first byte":
      // that is the case exactly when neither index has bit 0 or 1 set.
      return ((m_current | that.m_current) & 3) == 0;
   }
   // Moves on to the next UTF-8 byte.
   void increment()
   {
      // the code point has to be encoded before we can tell whether
      // another byte of it remains or whether we must move along:
      if(4 == m_current)
         extract_current();
      // step to the next byte of the encoded sequence; a zero there
      // means the sequence is used up, so advance to the following
      // code point and leave it pending:
      if(m_values[++m_current] == 0)
      {
         m_current = 4;
         ++m_position;
      }
   }
   // Moves back to the previous UTF-8 byte.
   void decrement()
   {
      if((m_current & 3) != 0)
      {
         // still inside the current sequence:
         --m_current;
         return;
      }
      // on the first byte (or pending): go back one code point, encode
      // it, and settle on the last byte that it actually uses:
      --m_position;
      extract_current();
      for(m_current = 3; m_current && (m_values[m_current] == 0); --m_current)
      {
      }
   }
   // Returns the underlying iterator, positioned on the current code point.
   BaseIterator base()const
   {
      return m_position;
   }
   // construct:
   u32_to_u8_iterator()
      : m_position(), m_current(0)
   {
      // only the terminating slot is set up here:
      m_values[4] = 0;
   }
   u32_to_u8_iterator(BaseIterator b)
      : m_position(b), m_current(4)
   {
      // start with a pending read, plus the terminating slot:
      m_values[4] = 0;
   }
private:
   // Reports (via boost::throw_exception) a std::out_of_range for a value
   // that cannot be encoded.
   static void invalid_code_point(::boost::uint32_t val)
   {
#ifndef BOOST_NO_STD_LOCALE
      std::stringstream message;
      message << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-8 sequence";
      std::out_of_range e(message.str());
#else
      std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-8 sequence");
#endif
      boost::throw_exception(e);
   }
   // Encodes *m_position into m_values[0..3] and resets m_current to 0.
   // Values above 0x10FFFF are rejected.
   void extract_current()const
   {
      const boost::uint32_t code_point = *m_position;
      if(code_point > 0x10FFFFu)
         invalid_code_point(code_point);
      if(code_point >= 0x10000u)
      {
         // four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         m_values[0] = static_cast<unsigned char>(0xF0u + (code_point >> 18));
         m_values[1] = static_cast<unsigned char>(0x80u + ((code_point >> 12) & 0x3Fu));
         m_values[2] = static_cast<unsigned char>(0x80u + ((code_point >> 6) & 0x3Fu));
         m_values[3] = static_cast<unsigned char>(0x80u + (code_point & 0x3Fu));
      }
      else if(code_point >= 0x800u)
      {
         // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
         m_values[0] = static_cast<unsigned char>(0xE0u + (code_point >> 12));
         m_values[1] = static_cast<unsigned char>(0x80u + ((code_point >> 6) & 0x3Fu));
         m_values[2] = static_cast<unsigned char>(0x80u + (code_point & 0x3Fu));
         m_values[3] = static_cast<unsigned char>(0u);
      }
      else if(code_point >= 0x80u)
      {
         // two bytes: 110xxxxx 10xxxxxx
         m_values[0] = static_cast<unsigned char>(0xC0u + (code_point >> 6));
         m_values[1] = static_cast<unsigned char>(0x80u + (code_point & 0x3Fu));
         m_values[2] = static_cast<unsigned char>(0u);
         m_values[3] = static_cast<unsigned char>(0u);
      }
      else
      {
         // one byte: 0xxxxxxx
         m_values[0] = static_cast<unsigned char>(code_point);
         m_values[1] = static_cast<unsigned char>(0u);
         m_values[2] = static_cast<unsigned char>(0u);
         m_values[3] = static_cast<unsigned char>(0u);
      }
      m_current = 0;
   }
   BaseIterator m_position;
   mutable U8Type m_values[5];
   mutable unsigned m_current;
};
//
// u8_to_u32_iterator:
// Bidirectional iterator adaptor that presents a sequence of UTF-8 bytes
// (read through BaseIterator) as a sequence of 32-bit code points of type
// U32Type.
//
// m_position always refers to the lead byte of the current sequence.
// m_value caches the decoded code point, or holds pending_read when the
// sequence at m_position has not been decoded yet.
//
// Malformed input is reported by throwing std::out_of_range through
// boost::throw_exception.
//
template <class BaseIterator, class U32Type = ::boost::uint32_t>
class u8_to_u32_iterator
   : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
{
   typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
   // special values for pending iterator reads:
   BOOST_STATIC_CONSTANT(::boost::uint32_t, pending_read = 0xffffffffu);

#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
   typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;

   BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
   BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
#endif

public:
   // Returns the current code point, decoding it on first use.
   typename base_type::reference
      dereference()const
   {
      if(m_value == pending_read)
         extract_current();
      return m_value;
   }
   // Iterators are equal when they refer to the same lead byte.
   bool equal(const u8_to_u32_iterator& that)const
   {
      return m_position == that.m_position;
   }
   // Moves on to the lead byte of the next sequence.
   void increment()
   {
      // skip over every byte of the current sequence; its length is
      // determined by the lead byte alone:
      unsigned c = detail::utf8_byte_count(*m_position);
      std::advance(m_position, c);
      m_value = pending_read;
   }
   // Moves back to the lead byte of the previous sequence.
   void decrement()
   {
      // Keep backtracking until we don't have a trailing character:
      unsigned count = 0;
      while((*--m_position & 0xC0u) == 0x80u) ++count;
      // now check that the sequence was valid: the lead byte must
      // announce exactly as many trailing bytes as we stepped over:
      if(count != detail::utf8_trailing_byte_count(*m_position))
         invalid_sequnce();
      m_value = pending_read;
   }
   // Returns the underlying iterator, positioned on the current lead byte.
   BaseIterator base()const
   {
      return m_position;
   }
   // construct:
   u8_to_u32_iterator() : m_position()
   {
      m_value = pending_read;
   }
   u8_to_u32_iterator(BaseIterator b) : m_position(b)
   {
      m_value = pending_read;
   }
private:
   // Reports malformed UTF-8 input.
   static void invalid_sequnce()
   {
      std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
      boost::throw_exception(e);
   }
   // Decodes the sequence that starts at m_position into m_value.
   // m_position itself is not moved.
   void extract_current()const
   {
      m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
      // we must not have a continuation character:
      if((m_value & 0xC0u) == 0x80u)
         invalid_sequnce();
      // see how many extra bytes we have:
      unsigned extra = detail::utf8_trailing_byte_count(*m_position);
      // a lead byte of 0xF8 or above announces more than three trailing
      // bytes: that is never valid UTF-8, and using it below would index
      // past the end of masks[]:
      if(extra > 3)
         invalid_sequnce();
      // extract the extra bits, 6 from each extra byte:
      BaseIterator next(m_position);
      for(unsigned c = 0; c < extra; ++c)
      {
         ++next;
         const boost::uint8_t trail = static_cast<boost::uint8_t>(*next);
         // every extra byte must be a continuation byte (10xxxxxx),
         // otherwise the sequence has been truncated or corrupted:
         if((trail & 0xC0u) != 0x80u)
            invalid_sequnce();
         m_value <<= 6;
         m_value += trail & 0x3Fu;
      }
      // we now need to remove a few of the leftmost bits, but how many depends
      // upon how many extra bytes we've extracted:
      static const boost::uint32_t masks[] =
      {
         0x7Fu,
         0x7FFu,
         0xFFFFu,
         0x1FFFFFu,
      };
      m_value &= masks[extra];
      // check the result:
      if(m_value > 0x10FFFFu)
         invalid_sequnce();
   }
   BaseIterator m_position;
   mutable U32Type m_value;
};
} // namespace boost
#endif // BOOST_REGEX_UNICODE_ITERATOR_HPP

View File

@ -611,6 +611,52 @@ bool basic_regex_parser<charT, traits>::parse_extended_escape()
++m_position;
this->append_state(syntax_element_restart_continue);
break;
case regex_constants::escape_type_not_property:
   // \P is \p with the resulting set negated:
   negate = true;
   // fall through:
case regex_constants::escape_type_property:
   {
      // Perl style property escape: either \pX (single character name)
      // or \p{name}.  The name is resolved with the traits class'
      // lookup_classname and the result is appended as a character set.
      ++m_position;
      char_class_type m;
      if(m_position == m_end)
      {
         // nothing follows the \p or \P:
         fail(regex_constants::error_escape, m_position - m_base);
         return false;
      }
      // maybe have \p{ddd}
      if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
      {
         const charT* base = m_position;
         // skip forward until we find enclosing brace:
         while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
            ++m_position;
         if(m_position == m_end)
         {
            // no closing brace:
            fail(regex_constants::error_escape, m_position - m_base);
            return false;
         }
         // the name is what lies between the braces; step past the
         // closing brace once it has been looked up:
         m = this->m_traits.lookup_classname(++base, m_position++);
      }
      else
      {
         // single character property name:
         m = this->m_traits.lookup_classname(m_position, m_position+1);
         ++m_position;
      }
      if(m != 0)
      {
         basic_char_set<charT, traits> char_set;
         if(negate)
            char_set.negate();
         char_set.add_class(m);
         if(0 == this->append_set(char_set))
         {
            fail(regex_constants::error_ctype, m_position - m_base);
            return false;
         }
         return true;
      }
      // unknown property name: report the error and stop here, we must
      // not drop into the default case and treat what follows as a literal:
      fail(regex_constants::error_ctype, m_position - m_base);
      return false;
   }
default:
this->append_literal(unescape_character());
break;
@ -948,6 +994,7 @@ bool basic_regex_parser<charT, traits>::parse_set()
if(m != 0)
{
char_set.add_class(m);
++m_position;
break;
}
}
@ -1373,6 +1420,41 @@ charT basic_regex_parser<charT, traits>::unescape_character()
}
return static_cast<charT>(val);
}
case regex_constants::escape_type_named_char:
{
++m_position;
if(m_position == m_end)
{
fail(regex_constants::error_escape, m_position - m_base);
return false;
}
// maybe have \N{name}
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
{
const charT* base = m_position;
// skip forward until we find enclosing brace:
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
++m_position;
if(m_position == m_end)
{
fail(regex_constants::error_escape, m_position - m_base);
return false;
}
string_type s = this->m_traits.lookup_collatename(++base, m_position++);
if(s.empty())
{
fail(regex_constants::error_collate, m_position - m_base);
return false;
}
if(s.size() == 1)
{
return s[0];
}
}
// fall through is a failure:
fail(regex_constants::error_escape, m_position - m_base);
return false;
}
default:
result = *m_position;
break;

View File

@ -75,7 +75,7 @@ class basic_regex_formatter
public:
typedef typename traits::char_type char_type;
basic_regex_formatter(OutputIterator o, const Results& r, const traits& t)
: m_traits(t), m_results(r), m_out(o), m_state(output_copy) {}
: m_traits(t), m_results(r), m_out(o), m_state(output_copy), m_have_conditional(false) {}
OutputIterator format(const char_type* p1, const char_type* p2, match_flag_type f);
OutputIterator format(const char_type* p1, match_flag_type f)
{
@ -108,6 +108,7 @@ private:
const char_type* m_end; // format string end
match_flag_type m_flags; // format flags to use
output_state m_state; // what to do with the next character
bool m_have_conditional; // we are parsing a conditional
private:
basic_regex_formatter(const basic_regex_formatter&);
basic_regex_formatter& operator=(const basic_regex_formatter&);
@ -147,7 +148,10 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_all()
if(m_flags & boost::regex_constants::format_all)
{
++m_position;
bool have_conditional = m_have_conditional;
m_have_conditional = false;
format_until_scope_end();
m_have_conditional = have_conditional;
if(m_position == m_end)
return;
BOOST_ASSERT(*m_position == static_cast<char_type>(')'));
@ -158,7 +162,6 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_all()
++m_position;
break;
case ')':
case ':':
if(m_flags & boost::regex_constants::format_all)
{
return;
@ -166,6 +169,14 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_all()
put(*m_position);
++m_position;
break;
case ':':
if((m_flags & boost::regex_constants::format_all) && m_have_conditional)
{
return;
}
put(*m_position);
++m_position;
break;
case '?':
if(m_flags & boost::regex_constants::format_all)
{
@ -405,7 +416,9 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_conditional(
// output varies depending upon whether sub-expression v matched or not:
if(m_results[v].matched)
{
m_have_conditional = true;
format_all();
m_have_conditional = false;
if((m_position != m_end) && (*m_position == static_cast<char_type>(':')))
{
// skip the ':':
@ -425,7 +438,9 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_conditional(
output_state saved_state = m_state;
m_state = output_none;
// format until ':' or ')':
m_have_conditional = true;
format_all();
m_have_conditional = false;
// restore state:
m_state = saved_state;
if((m_position != m_end) && (*m_position == static_cast<char_type>(':')))

View File

@ -89,7 +89,11 @@ static const escape_syntax_type escape_type_C = 50; /
static const escape_syntax_type escape_type_Z = 51; // for \Z
static const escape_syntax_type escape_type_G = 52; // for \G
static const escape_syntax_type syntax_max = 54;
static const escape_syntax_type escape_type_property = 54; // for \p
static const escape_syntax_type escape_type_not_property = 55; // for \P
static const escape_syntax_type escape_type_named_char = 56; // for \N
static const escape_syntax_type syntax_max = 57;
}
}

View File

@ -15,7 +15,10 @@
* VERSION see <boost/version.hpp>
* DESCRIPTION: Unicode regular expressions on top of the ICU Library.
*/
#define BOOST_REGEX_SOURCE
#include <boost/regex/config.hpp>
#ifdef BOOST_HAS_ICU
#include <boost/regex/icu.hpp>
namespace boost{
@ -64,6 +67,264 @@ const icu_regex_traits::char_class_type icu_regex_traits::mask_space = icu_regex
const icu_regex_traits::char_class_type icu_regex_traits::mask_xdigit = icu_regex_traits::char_class_type(1) << offset_xdigit;
const icu_regex_traits::char_class_type icu_regex_traits::mask_underscore = icu_regex_traits::char_class_type(1) << offset_underscore;
const icu_regex_traits::char_class_type icu_regex_traits::mask_unicode = icu_regex_traits::char_class_type(1) << offset_unicode;
const icu_regex_traits::char_class_type icu_regex_traits::mask_any = icu_regex_traits::char_class_type(1) << offset_any;
const icu_regex_traits::char_class_type icu_regex_traits::mask_ascii = icu_regex_traits::char_class_type(1) << offset_ascii;
//
// lookup_icu_mask:
// Translates the Unicode property name [p1, p2) into a character class mask.
//
// The name must match a table entry exactly: entries are stored in lower
// case with no spaces, dashes or underscores.  Both the long names
// ("uppercaseletter") and the short forms ("lu", "l*") are recognised.
//
// Returns the mask for the name, or 0 if the name is not in the table.
//
// The three tables below are parallel: range_data[i] delimits the i'th name
// inside prop_name_table, and icu_class_map[i] is the mask for that name.
// The search uses std::lower_bound, so range_data has to stay in the order
// that character_pointer_range's operator< defines.
//
icu_regex_traits::char_class_type icu_regex_traits::lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2)
{
   static const ::UChar32 prop_name_table[] = {
      /* any */ 'a', 'n', 'y',
      /* ascii */ 'a', 's', 'c', 'i', 'i',
      /* assigned */ 'a', 's', 's', 'i', 'g', 'n', 'e', 'd',
      /* c* */ 'c', '*',
      /* cc */ 'c', 'c',
      /* cf */ 'c', 'f',
      /* closepunctuation */ 'c', 'l', 'o', 's', 'e', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* cn */ 'c', 'n',
      /* co */ 'c', 'o',
      /* connectorpunctuation */ 'c', 'o', 'n', 'n', 'e', 'c', 't', 'o', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* control */ 'c', 'o', 'n', 't', 'r', 'o', 'l',
      /* cs */ 'c', 's',
      /* currencysymbol */ 'c', 'u', 'r', 'r', 'e', 'n', 'c', 'y', 's', 'y', 'm', 'b', 'o', 'l',
      /* dashpunctuation */ 'd', 'a', 's', 'h', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* decimaldigitnumber */ 'd', 'e', 'c', 'i', 'm', 'a', 'l', 'd', 'i', 'g', 'i', 't', 'n', 'u', 'm', 'b', 'e', 'r',
      /* enclosingmark */ 'e', 'n', 'c', 'l', 'o', 's', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
      /* finalpunctuation */ 'f', 'i', 'n', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* format */ 'f', 'o', 'r', 'm', 'a', 't',
      /* initialpunctuation */ 'i', 'n', 'i', 't', 'i', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* l* */ 'l', '*',
      /* letter */ 'l', 'e', 't', 't', 'e', 'r',
      /* letternumber */ 'l', 'e', 't', 't', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r',
      /* lineseparator */ 'l', 'i', 'n', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
      /* ll */ 'l', 'l',
      /* lm */ 'l', 'm',
      /* lo */ 'l', 'o',
      /* lowercaseletter */ 'l', 'o', 'w', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
      /* lt */ 'l', 't',
      /* lu */ 'l', 'u',
      /* m* */ 'm', '*',
      /* mark */ 'm', 'a', 'r', 'k',
      /* mathsymbol */ 'm', 'a', 't', 'h', 's', 'y', 'm', 'b', 'o', 'l',
      /* mc */ 'm', 'c',
      /* me */ 'm', 'e',
      /* mn */ 'm', 'n',
      /* modifierletter */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r',
      /* modifiersymbol */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l',
      /* n* */ 'n', '*',
      /* nd */ 'n', 'd',
      /* nl */ 'n', 'l',
      /* no */ 'n', 'o',
      /* nonspacingmark */ 'n', 'o', 'n', 's', 'p', 'a', 'c', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
      /* notassigned */ 'n', 'o', 't', 'a', 's', 's', 'i', 'g', 'n', 'e', 'd',
      /* number */ 'n', 'u', 'm', 'b', 'e', 'r',
      /* openpunctuation */ 'o', 'p', 'e', 'n', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* other */ 'o', 't', 'h', 'e', 'r',
      /* otherletter */ 'o', 't', 'h', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r',
      /* othernumber */ 'o', 't', 'h', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r',
      /* otherpunctuation */ 'o', 't', 'h', 'e', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* othersymbol */ 'o', 't', 'h', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l',
      /* p* */ 'p', '*',
      /* paragraphseparator */ 'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
      /* pc */ 'p', 'c',
      /* pd */ 'p', 'd',
      /* pe */ 'p', 'e',
      /* pf */ 'p', 'f',
      /* pi */ 'p', 'i',
      /* po */ 'p', 'o',
      /* privateuse */ 'p', 'r', 'i', 'v', 'a', 't', 'e', 'u', 's', 'e',
      /* ps */ 'p', 's',
      /* punctuation */ 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
      /* s* */ 's', '*',
      /* sc */ 's', 'c',
      /* separator */ 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
      /* sk */ 's', 'k',
      /* sm */ 's', 'm',
      /* so */ 's', 'o',
      /* spaceseparator */ 's', 'p', 'a', 'c', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
      /* spacingcombiningmark */ 's', 'p', 'a', 'c', 'i', 'n', 'g', 'c', 'o', 'm', 'b', 'i', 'n', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
      /* surrogate */ 's', 'u', 'r', 'r', 'o', 'g', 'a', 't', 'e',
      /* symbol */ 's', 'y', 'm', 'b', 'o', 'l',
      /* titlecase */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e',
      /* titlecaseletter */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
      /* uppercaseletter */ 'u', 'p', 'p', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
      /* z* */ 'z', '*',
      /* zl */ 'z', 'l',
      /* zp */ 'z', 'p',
      /* zs */ 'z', 's',
   };

   // note the space in "< ::UChar32>": without it "<:" is read as the
   // digraph for '[' by C++03 compilers.
   static const re_detail::character_pointer_range< ::UChar32> range_data[] = {
      { prop_name_table+0, prop_name_table+3, }, // any
      { prop_name_table+3, prop_name_table+8, }, // ascii
      { prop_name_table+8, prop_name_table+16, }, // assigned
      { prop_name_table+16, prop_name_table+18, }, // c*
      { prop_name_table+18, prop_name_table+20, }, // cc
      { prop_name_table+20, prop_name_table+22, }, // cf
      { prop_name_table+22, prop_name_table+38, }, // closepunctuation
      { prop_name_table+38, prop_name_table+40, }, // cn
      { prop_name_table+40, prop_name_table+42, }, // co
      { prop_name_table+42, prop_name_table+62, }, // connectorpunctuation
      { prop_name_table+62, prop_name_table+69, }, // control
      { prop_name_table+69, prop_name_table+71, }, // cs
      { prop_name_table+71, prop_name_table+85, }, // currencysymbol
      { prop_name_table+85, prop_name_table+100, }, // dashpunctuation
      { prop_name_table+100, prop_name_table+118, }, // decimaldigitnumber
      { prop_name_table+118, prop_name_table+131, }, // enclosingmark
      { prop_name_table+131, prop_name_table+147, }, // finalpunctuation
      { prop_name_table+147, prop_name_table+153, }, // format
      { prop_name_table+153, prop_name_table+171, }, // initialpunctuation
      { prop_name_table+171, prop_name_table+173, }, // l*
      { prop_name_table+173, prop_name_table+179, }, // letter
      { prop_name_table+179, prop_name_table+191, }, // letternumber
      { prop_name_table+191, prop_name_table+204, }, // lineseparator
      { prop_name_table+204, prop_name_table+206, }, // ll
      { prop_name_table+206, prop_name_table+208, }, // lm
      { prop_name_table+208, prop_name_table+210, }, // lo
      { prop_name_table+210, prop_name_table+225, }, // lowercaseletter
      { prop_name_table+225, prop_name_table+227, }, // lt
      { prop_name_table+227, prop_name_table+229, }, // lu
      { prop_name_table+229, prop_name_table+231, }, // m*
      { prop_name_table+231, prop_name_table+235, }, // mark
      { prop_name_table+235, prop_name_table+245, }, // mathsymbol
      { prop_name_table+245, prop_name_table+247, }, // mc
      { prop_name_table+247, prop_name_table+249, }, // me
      { prop_name_table+249, prop_name_table+251, }, // mn
      { prop_name_table+251, prop_name_table+265, }, // modifierletter
      { prop_name_table+265, prop_name_table+279, }, // modifiersymbol
      { prop_name_table+279, prop_name_table+281, }, // n*
      { prop_name_table+281, prop_name_table+283, }, // nd
      { prop_name_table+283, prop_name_table+285, }, // nl
      { prop_name_table+285, prop_name_table+287, }, // no
      { prop_name_table+287, prop_name_table+301, }, // nonspacingmark
      { prop_name_table+301, prop_name_table+312, }, // notassigned
      { prop_name_table+312, prop_name_table+318, }, // number
      { prop_name_table+318, prop_name_table+333, }, // openpunctuation
      { prop_name_table+333, prop_name_table+338, }, // other
      { prop_name_table+338, prop_name_table+349, }, // otherletter
      { prop_name_table+349, prop_name_table+360, }, // othernumber
      { prop_name_table+360, prop_name_table+376, }, // otherpunctuation
      { prop_name_table+376, prop_name_table+387, }, // othersymbol
      { prop_name_table+387, prop_name_table+389, }, // p*
      { prop_name_table+389, prop_name_table+407, }, // paragraphseparator
      { prop_name_table+407, prop_name_table+409, }, // pc
      { prop_name_table+409, prop_name_table+411, }, // pd
      { prop_name_table+411, prop_name_table+413, }, // pe
      { prop_name_table+413, prop_name_table+415, }, // pf
      { prop_name_table+415, prop_name_table+417, }, // pi
      { prop_name_table+417, prop_name_table+419, }, // po
      { prop_name_table+419, prop_name_table+429, }, // privateuse
      { prop_name_table+429, prop_name_table+431, }, // ps
      { prop_name_table+431, prop_name_table+442, }, // punctuation
      { prop_name_table+442, prop_name_table+444, }, // s*
      { prop_name_table+444, prop_name_table+446, }, // sc
      { prop_name_table+446, prop_name_table+455, }, // separator
      { prop_name_table+455, prop_name_table+457, }, // sk
      { prop_name_table+457, prop_name_table+459, }, // sm
      { prop_name_table+459, prop_name_table+461, }, // so
      { prop_name_table+461, prop_name_table+475, }, // spaceseparator
      { prop_name_table+475, prop_name_table+495, }, // spacingcombiningmark
      { prop_name_table+495, prop_name_table+504, }, // surrogate
      { prop_name_table+504, prop_name_table+510, }, // symbol
      { prop_name_table+510, prop_name_table+519, }, // titlecase
      { prop_name_table+519, prop_name_table+534, }, // titlecaseletter
      { prop_name_table+534, prop_name_table+549, }, // uppercaseletter
      { prop_name_table+549, prop_name_table+551, }, // z*
      { prop_name_table+551, prop_name_table+553, }, // zl
      { prop_name_table+553, prop_name_table+555, }, // zp
      { prop_name_table+555, prop_name_table+557, }, // zs
   };

   static const icu_regex_traits::char_class_type icu_class_map[] = {
      icu_regex_traits::mask_any, // any
      icu_regex_traits::mask_ascii, // ascii
      (0x3FFFFFFFu) & ~(U_GC_CN_MASK), // assigned
      U_GC_C_MASK, // c*
      U_GC_CC_MASK, // cc
      U_GC_CF_MASK, // cf
      U_GC_PE_MASK, // closepunctuation
      U_GC_CN_MASK, // cn
      U_GC_CO_MASK, // co
      U_GC_PC_MASK, // connectorpunctuation
      U_GC_CC_MASK, // control
      U_GC_CS_MASK, // cs
      U_GC_SC_MASK, // currencysymbol
      U_GC_PD_MASK, // dashpunctuation
      U_GC_ND_MASK, // decimaldigitnumber
      U_GC_ME_MASK, // enclosingmark
      U_GC_PF_MASK, // finalpunctuation
      U_GC_CF_MASK, // format
      U_GC_PI_MASK, // initialpunctuation
      U_GC_L_MASK, // l*
      U_GC_L_MASK, // letter
      U_GC_NL_MASK, // letternumber
      U_GC_ZL_MASK, // lineseparator
      U_GC_LL_MASK, // ll
      U_GC_LM_MASK, // lm
      U_GC_LO_MASK, // lo
      U_GC_LL_MASK, // lowercaseletter
      U_GC_LT_MASK, // lt
      U_GC_LU_MASK, // lu
      U_GC_M_MASK, // m*
      U_GC_M_MASK, // mark
      U_GC_SM_MASK, // mathsymbol
      U_GC_MC_MASK, // mc
      U_GC_ME_MASK, // me
      U_GC_MN_MASK, // mn
      U_GC_LM_MASK, // modifierletter
      U_GC_SK_MASK, // modifiersymbol
      U_GC_N_MASK, // n*
      U_GC_ND_MASK, // nd
      U_GC_NL_MASK, // nl
      U_GC_NO_MASK, // no
      U_GC_MN_MASK, // nonspacingmark
      U_GC_CN_MASK, // notassigned
      U_GC_N_MASK, // number
      U_GC_PS_MASK, // openpunctuation
      U_GC_C_MASK, // other
      U_GC_LO_MASK, // otherletter
      U_GC_NO_MASK, // othernumber
      U_GC_PO_MASK, // otherpunctuation
      U_GC_SO_MASK, // othersymbol
      U_GC_P_MASK, // p*
      U_GC_ZP_MASK, // paragraphseparator
      U_GC_PC_MASK, // pc
      U_GC_PD_MASK, // pd
      U_GC_PE_MASK, // pe
      U_GC_PF_MASK, // pf
      U_GC_PI_MASK, // pi
      U_GC_PO_MASK, // po
      U_GC_CO_MASK, // privateuse
      U_GC_PS_MASK, // ps
      U_GC_P_MASK, // punctuation
      U_GC_S_MASK, // s*
      U_GC_SC_MASK, // sc
      U_GC_Z_MASK, // separator
      U_GC_SK_MASK, // sk
      U_GC_SM_MASK, // sm
      U_GC_SO_MASK, // so
      U_GC_ZS_MASK, // spaceseparator
      U_GC_MC_MASK, // spacingcombiningmark
      U_GC_CS_MASK, // surrogate
      U_GC_S_MASK, // symbol
      U_GC_LT_MASK, // titlecase
      U_GC_LT_MASK, // titlecaseletter
      U_GC_LU_MASK, // uppercaseletter
      U_GC_Z_MASK, // z*
      U_GC_ZL_MASK, // zl
      U_GC_ZP_MASK, // zp
      U_GC_ZS_MASK, // zs
   };

   static const re_detail::character_pointer_range< ::UChar32>* ranges_begin = range_data;
   static const re_detail::character_pointer_range< ::UChar32>* ranges_end = range_data + (sizeof(range_data)/sizeof(range_data[0]));

   // binary search for the requested name; lower_bound only finds the
   // insertion point, so confirm that it really is an exact match:
   re_detail::character_pointer_range< ::UChar32> t = { p1, p2, };
   const re_detail::character_pointer_range< ::UChar32>* p = std::lower_bound(ranges_begin, ranges_end, t);
   if((p != ranges_end) && (t == *p))
      return icu_class_map[p - ranges_begin];
   // not a name we know about:
   return 0;
}
icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_type* p1, const char_type* p2) const
{
@ -92,12 +353,33 @@ icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_
};
int id = ::boost::re_detail::get_default_class_id(p1, p2);
if(id >= 0)
return masks[id+1];
char_class_type result = lookup_icu_mask(p1, p2);
if(result != 0)
return result;
if(id < 0)
{
string_type s(p1, p2);
for(string_type::size_type i = 0; i < s.size(); ++i)
string_type::size_type i = 0;
while(i < s.size())
{
s[i] = static_cast<char>((::u_tolower)(s[i]));
if(::u_isspace(s[i]) || (s[i] == '-') || (s[i] == '_'))
s.erase(s.begin()+i, s.begin()+i+1);
else
{
s[i] = static_cast<char>((::u_tolower)(s[i]));
++i;
}
}
id = ::boost::re_detail::get_default_class_id(&*s.begin(), &*s.begin() + s.size());
if(id >= 0)
return masks[id+1];
result = lookup_icu_mask(&*s.begin(), &*s.begin() + s.size());
if(result != 0)
return result;
}
BOOST_ASSERT(id+1 < sizeof(masks) / sizeof(masks[0]));
return masks[id+1];
@ -109,6 +391,23 @@ icu_regex_traits::string_type icu_regex_traits::lookup_collatename(const char_ty
if(std::find_if(p1, p2, std::bind2nd(std::greater< ::UChar32>(), 0x7f)) == p2)
{
std::string s(p1, p2);
// Try Unicode name:
UErrorCode err = U_ZERO_ERROR;
UChar32 c = ::u_charFromName(U_UNICODE_CHAR_NAME, s.c_str(), &err);
if(U_SUCCESS(err))
{
result.push_back(c);
return result;
}
// Try Unicode-extended name:
err = U_ZERO_ERROR;
c = ::u_charFromName(U_EXTENDED_CHAR_NAME, s.c_str(), &err);
if(U_SUCCESS(err))
{
result.push_back(c);
return result;
}
// try POSIX name:
s = ::boost::re_detail::lookup_default_collate_name(s);
result.assign(s.begin(), s.end());
}
@ -121,21 +420,26 @@ bool icu_regex_traits::isctype(char_type c, char_class_type f) const
{
// check for standard categories first:
char_class_type m = char_class_type(1u << u_charType(c));
if((m & f).any())
if((m & f) != 0)
return true;
// now check for special cases:
if((f & mask_blank).any() && u_isblank(c))
if(((f & mask_blank) != 0) && u_isblank(c))
return true;
if((f & mask_space).any() && u_isspace(c))
if(((f & mask_space) != 0) && u_isspace(c))
return true;
if((f & mask_xdigit).any() && (u_digit(c, 16) >= 0))
if(((f & mask_xdigit) != 0) && (u_digit(c, 16) >= 0))
return true;
if((f & mask_unicode).any() && (c >= 0x100))
if(((f & mask_unicode) != 0) && (c >= 0x100))
return true;
if((f & mask_underscore).any() && (c == '_'))
if(((f & mask_underscore) != 0) && (c == '_'))
return true;
if(((f & mask_any) != 0) && (c <= 0x10FFFF))
return true;
if(((f & mask_ascii) != 0) && (c <= 0x7F))
return true;
return false;
}
}
#endif // BOOST_HAS_ICU

View File

@ -96,7 +96,11 @@ BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants
"C",
"Z",
"G",
"!", };
"!",
"p",
"P",
"N",
};
return ((n >= (sizeof(messages) / sizeof(messages[1]))) ? "" : messages[n]);
}
@ -374,9 +378,9 @@ BOOST_REGEX_DECL regex_constants::escape_syntax_type BOOST_REGEX_CALL get_defaul
regex_constants::escape_type_not_class, /*K*/
regex_constants::escape_type_not_class, /*L*/
regex_constants::escape_type_not_class, /*M*/
regex_constants::escape_type_not_class, /*N*/
regex_constants::escape_type_named_char, /*N*/
regex_constants::escape_type_not_class, /*O*/
regex_constants::escape_type_not_class, /*P*/
regex_constants::escape_type_not_property, /*P*/
regex_constants::escape_type_Q, /*Q*/
regex_constants::escape_type_not_class, /*R*/
regex_constants::escape_type_not_class, /*S*/
@ -408,7 +412,7 @@ BOOST_REGEX_DECL regex_constants::escape_syntax_type BOOST_REGEX_CALL get_defaul
regex_constants::escape_type_class, /*m*/
regex_constants::escape_type_control_n, /*n*/
regex_constants::escape_type_class, /*o*/
regex_constants::escape_type_class, /*p*/
regex_constants::escape_type_property, /*p*/
regex_constants::escape_type_class, /*q*/
regex_constants::escape_type_control_r, /*r*/
regex_constants::escape_type_class, /*s*/

View File

@ -26,6 +26,7 @@ test_sets.cpp
test_simple_repeats.cpp
test_tricky_cases.cpp
test_icu.cpp
test_unicode.cpp
test_overloads.cpp
test_operators.cpp
;
@ -120,6 +121,17 @@ test-suite regex
<lib>../../test/build/boost_test_exec_monitor
]
[ run unicode/unicode_iterator_test.cpp ]
[ regex-test static_mutex_test
: <template>test # sources
static_mutex/static_mutex_test.cpp
<dll>../../thread/build/boost_thread
]
[ regex-test object_cache_test
: <template>test # sources
object_cache/object_cache_test.cpp
]
[ run config_info/regex_config_info.cpp <template>test
: : : <test-info>always_show_run_output ]

View File

@ -53,6 +53,7 @@ int cpp_main(int /*argc*/, char * /*argv*/[])
test_emacs();
test_operators();
test_overloads();
test_unicode();
return error_count;
}

View File

@ -215,6 +215,7 @@ void test_en_locale();
void test_emacs();
void test_operators();
void test_overloads();
void test_unicode();
//
// template instances:

View File

@ -19,7 +19,8 @@
//
// We can only build this if we have ICU support:
//
#ifdef TEST_ICU
#include <boost/regex/config.hpp>
#ifdef BOOST_HAS_ICU
#include <boost/regex/icu.hpp>
#include "test.hpp"
@ -27,7 +28,6 @@
void test_icu(const wchar_t&, const test_regex_search_tag& )
{
typedef boost::u16_to_u32_iterator<std::wstring::const_iterator, ::UChar32> conv_iterator;
boost::u32regex r;
if(*test_locale::c_str())
{
@ -37,26 +37,28 @@ void test_icu(const wchar_t&, const test_regex_search_tag& )
r.imbue(l);
}
const std::wstring& expression = test_info<wchar_t>::expression();
std::vector< ::UChar32> expression;
expression.assign(test_info<wchar_t>::expression().begin(), test_info<wchar_t>::expression().end());
boost::regex_constants::syntax_option_type syntax_options = test_info<UChar32>::syntax_options();
try{
r.assign(conv_iterator(expression.begin()), conv_iterator(expression.end()), syntax_options);
r.assign(expression.begin(), expression.end(), syntax_options);
if(r.status())
{
BOOST_REGEX_TEST_ERROR("Expression did not compile when it should have done, error code = " << r.status(), UChar32);
}
const std::wstring& search_text = test_info<wchar_t>::search_text();
std::vector< ::UChar32> search_text;
search_text.assign(test_info<wchar_t>::search_text().begin(), test_info<wchar_t>::search_text().end());
boost::regex_constants::match_flag_type opts = test_info<wchar_t>::match_options();
const int* answer_table = test_info<wchar_t>::answer_table();
boost::match_results<conv_iterator> what;
boost::match_results<std::vector< ::UChar32>::const_iterator> what;
if(boost::regex_search(
conv_iterator(search_text.begin()),
conv_iterator(search_text.end()),
const_cast<std::vector< ::UChar32>const&>(search_text).begin(),
const_cast<std::vector< ::UChar32>const&>(search_text).end(),
what,
r,
opts))
{
test_result(what, conv_iterator(search_text.begin()), answer_table);
test_result(what, const_cast<std::vector< ::UChar32>const&>(search_text).begin(), answer_table);
}
else if(answer_table[0] >= 0)
{
@ -85,7 +87,8 @@ void test_icu(const wchar_t&, const test_regex_search_tag& )
void test_icu(const wchar_t&, const test_invalid_regex_tag&)
{
typedef boost::u16_to_u32_iterator<std::wstring::const_iterator, ::UChar32> conv_iterator;
const std::wstring& expression = test_info<wchar_t>::expression();
std::vector< ::UChar32> expression;
expression.assign(test_info<wchar_t>::expression().begin(), test_info<wchar_t>::expression().end());
boost::regex_constants::syntax_option_type syntax_options = test_info<wchar_t>::syntax_options();
boost::u32regex r;
if(*test_locale::c_str())
@ -100,7 +103,7 @@ void test_icu(const wchar_t&, const test_invalid_regex_tag&)
//
try
{
if(0 == r.assign(conv_iterator(expression.begin()), conv_iterator(expression.end()), syntax_options | boost::regex_constants::no_except).status())
if(0 == r.assign(expression.begin(), expression.end(), syntax_options | boost::regex_constants::no_except).status())
{
BOOST_REGEX_TEST_ERROR("Expression compiled when it should not have done so.", wchar_t);
}
@ -114,7 +117,7 @@ void test_icu(const wchar_t&, const test_invalid_regex_tag&)
//
bool have_catch = false;
try{
r.assign(conv_iterator(expression.begin()), conv_iterator(expression.end()), syntax_options);
r.assign(expression.begin(), expression.end(), syntax_options);
#ifdef BOOST_NO_EXCEPTIONS
if(r.status())
have_catch = true;
@ -148,7 +151,8 @@ void test_icu(const wchar_t&, const test_invalid_regex_tag&)
void test_icu(const wchar_t&, const test_regex_replace_tag&)
{
const std::wstring& expression = test_info<wchar_t>::expression();
std::vector< ::UChar32> expression;
expression.assign(test_info<wchar_t>::expression().begin(), test_info<wchar_t>::expression().end());
boost::regex_constants::syntax_option_type syntax_options = test_info<UChar32>::syntax_options();
boost::u32regex r;
try{

View File

@ -75,6 +75,8 @@ void test_replace()
TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "\\0", "\0");
TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "()?:", "()?:");
TEST_REGEX_REPLACE("a+", perl, "...aaa,,", match_default|format_perl|format_no_copy, "\\0101", "A");
TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "\\1", "aa");
TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "\\2", "bb");
// move to copying unmatched data:
TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_all, "bbb", "...bbb,,,");
@ -101,5 +103,10 @@ void test_replace()
TEST_REGEX_REPLACE("a+(b+)", perl, "...aaabb,,,", match_default|format_perl|format_no_copy, "(?1abc:def)", "(?1abc:def)");
TEST_REGEX_REPLACE("a+(b+)", perl, "...", match_default|format_perl, "(?1abc:def)", "...");
TEST_REGEX_REPLACE("a+(b+)", perl, "...", match_default|format_perl|format_no_copy, "(?1abc:def)", "");
// probe bug reports and other special cases:
TEST_REGEX_REPLACE("([^\\d]+).*", normal|icase, "tesd 999 test", match_default|format_all, "($1)replace", "tesd replace");
TEST_REGEX_REPLACE("(a)(b)", perl, "ab", match_default|format_all, "$1:$2", "a:b");
TEST_REGEX_REPLACE("(a(c)?)|(b)", perl, "acab", match_default|format_all, "(?1(?2(C:):A):B:)", "C:AB:");
}

View File

@ -88,6 +88,7 @@ void test_sets()
TEST_REGEX_SEARCH("[[:space:]]+", extended, "a \n\t\rb", match_default, make_array(1, 5, -2, -2));
TEST_REGEX_SEARCH("[[:upper:]]+", extended, "aBCd", match_default, make_array(1, 3, -2, -2));
TEST_REGEX_SEARCH("[[:xdigit:]]+", extended, "p0f3Cx", match_default, make_array(1, 5, -2, -2));
TEST_REGEX_SEARCH("[\\d]+", perl, "a019b", match_default, make_array(1, 4, -2, -2));
//
// escapes are supported in character classes if we have either
@ -243,5 +244,66 @@ void test_sets2()
TEST_REGEX_SEARCH("[\\s]+", perl, "AB AB", match_default, make_array(2, 5, -2, -2));
TEST_INVALID_REGEX("[\\S]", perl);
TEST_REGEX_SEARCH("\\S+", perl, " abc ", match_default, make_array(2, 5, -2, -2));
// and some Perl style properties:
TEST_REGEX_SEARCH("\\pl+", perl, "ABabcAB", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\Pl+", perl, "abABCab", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\pu+", perl, "abABCab", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\Pu+", perl, "ABabcAB", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\pd+", perl, "AB012AB", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\PD+", perl, "01abc01", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\ps+", perl, "AB AB", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\PS+", perl, " abc ", match_default, make_array(2, 5, -2, -2));
TEST_REGEX_SEARCH("\\p{alnum}+", perl, "-%@a0X_-", match_default, make_array(3, 6, -2, -2));
TEST_REGEX_SEARCH("\\p{alpha}+", perl, " -%@aX_0-", match_default, make_array(4, 6, -2, -2));
TEST_REGEX_SEARCH("\\p{blank}+", perl, "a \tb", match_default, make_array(1, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{cntrl}+", perl, " a\n\tb", match_default, make_array(2, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{digit}+", perl, "a019b", match_default, make_array(1, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{graph}+", perl, " a%b ", match_default, make_array(1, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{lower}+", perl, "AabC", match_default, make_array(1, 3, -2, -2));
TEST_REGEX_SEARCH("\\p{print}+", perl, "AabC", match_default, make_array(0, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{punct}+", perl, " %-&\t", match_default, make_array(1, 4, -2, -2));
TEST_REGEX_SEARCH("\\p{space}+", perl, "a \n\t\rb", match_default, make_array(1, 5, -2, -2));
TEST_REGEX_SEARCH("\\p{upper}+", perl, "aBCd", match_default, make_array(1, 3, -2, -2));
TEST_REGEX_SEARCH("\\p{xdigit}+", perl, "p0f3Cx", match_default, make_array(1, 5, -2, -2));
TEST_REGEX_SEARCH("\\P{alnum}+", perl, "-%@a", match_default, make_array(0, 3, -2, -2));
TEST_REGEX_SEARCH("\\P{alpha}+", perl, " -%@a", match_default, make_array(0, 4, -2, -2));
TEST_REGEX_SEARCH("\\P{blank}+", perl, "a ", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{cntrl}+", perl, " a\n", match_default, make_array(0, 2, -2, -2));
TEST_REGEX_SEARCH("\\P{digit}+", perl, "a0", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{graph}+", perl, " a", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{lower}+", perl, "Aa", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{print}+", perl, "Absc", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH("\\P{punct}+", perl, " %", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{space}+", perl, "a ", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{upper}+", perl, "aB", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\P{xdigit}+", perl, "pf", match_default, make_array(0, 1, -2, -2));
TEST_INVALID_REGEX("\\p{invalid class}", perl);
TEST_INVALID_REGEX("\\p{upper", perl);
TEST_INVALID_REGEX("\\p{", perl);
TEST_INVALID_REGEX("\\p", perl);
TEST_INVALID_REGEX("\\P{invalid class}", perl);
TEST_INVALID_REGEX("\\P{upper", perl);
TEST_INVALID_REGEX("\\P{", perl);
TEST_INVALID_REGEX("\\P", perl);
// try named characters:
TEST_REGEX_SEARCH("\\N{zero}", perl, "0", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{one}", perl, "1", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{two}", perl, "2", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{three}", perl, "3", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{a}", perl, "bac", match_default, make_array(1, 2, -2, -2));
TEST_REGEX_SEARCH("\\N{\xf0}", perl, "b\xf0x", match_default, make_array(1, 2, -2, -2));
TEST_REGEX_SEARCH("\\N{right-curly-bracket}", perl, "}", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("\\N{NUL}", perl, "\0", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH("[\\N{zero}-\\N{nine}]+", perl, " 0123456789 ", match_default, make_array(1, 11, -2, -2));
TEST_INVALID_REGEX("\\N", perl);
TEST_INVALID_REGEX("\\N{", perl);
TEST_INVALID_REGEX("\\N{}", perl);
TEST_INVALID_REGEX("\\N{invalid-name}", perl);
TEST_INVALID_REGEX("\\N{zero", perl);
}

View File

@ -0,0 +1,147 @@
/*
*
* Copyright (c) 2004
* Dr John Maddock
*
* Use, modification and distribution are subject to the
* Boost Software License, Version 1.0. (See accompanying file
* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
*
*/
/*
* LOCATION: see http://www.boost.org for most recent version.
* FILE test_unicode.cpp
* VERSION see <boost/version.hpp>
* DESCRIPTION: Unicode specific tests (requires ICU).
*/
#include <boost/regex/config.hpp>
#ifdef BOOST_HAS_ICU
#include "test.hpp"
#ifdef BOOST_MSVC
#pragma warning(disable:4127)
#endif
//
// TEST_REGEX_SEARCH_U(s, f, t, m, a):
// Records one wide-character search test and runs it through test_icu().
//   s - wide string literal holding the regular expression
//   f - syntax flags forwarded to test_info<wchar_t>::set_info (e.g. perl)
//   t - wide string literal holding the text to be searched
//   m - match flags (e.g. match_default)
//   a - expected answer table (e.g. built with make_array)
// Each literal is first copied into a wchar_t array so that sizeof gives its
// full length; the std::wstring is then constructed with an explicit length
// (array size minus the terminating null) rather than from a null-terminated
// pointer, so a literal containing embedded null characters is not truncated.
// __FILE__ and __LINE__ are captured at the point of use and passed to
// set_info along with the test data.
// The do{...}while(0) wrapper makes the expansion behave as a single
// statement that still requires a trailing semicolon.
//
#define TEST_REGEX_SEARCH_U(s, f, t, m, a)\
do{\
const wchar_t e[] = { s };\
std::wstring se(e, (sizeof(e) / sizeof(wchar_t)) - 1);\
const wchar_t st[] = { t };\
std::wstring sst(st, (sizeof(st) / sizeof(wchar_t)) - 1);\
test_info<wchar_t>::set_info(__FILE__, __LINE__, se, f, sst, m, a);\
test_icu(wchar_t(0), test_regex_search_tag());\
}while(0)
//
// TEST_REGEX_CLASS_U(classname, character):
// Checks that the character class [[:classname:]] matches one given character.
//   classname - the class name written as bare tokens (not a string literal)
//   character - the hex digits of the character code, with no prefix
// The expression L"[[:classname:]]" and the one-character search text
// L"\x<character>" are both assembled from the arguments with BOOST_JOIN and
// BOOST_STRINGIZE, then passed to TEST_REGEX_SEARCH_U with perl syntax,
// match_default, and the expected answer make_array(0, 1, -2, -2)
// (presumably: a single match spanning positions 0 to 1).
// NOTE(review): class names containing spaces or '-' rely on how the
// preprocessor pastes and stringizes those tokens - confirm on each compiler.
//
#define TEST_REGEX_CLASS_U(classname, character)\
TEST_REGEX_SEARCH_U(\
BOOST_JOIN(L, \
BOOST_STRINGIZE(\
BOOST_JOIN([[:, BOOST_JOIN(classname, :]])))), \
perl, \
BOOST_JOIN(L, \
BOOST_STRINGIZE(\
BOOST_JOIN(\x, character))), \
match_default, \
make_array(0, 1, -2, -2))
//
// test_unicode:
// Unicode-specific regression tests, compiled only when BOOST_HAS_ICU is
// defined. Each class is exercised twice: once by its short name (e.g. Lu,
// or L* for a whole group) and once by its long name (e.g. Uppercase Letter),
// using one sample character that is expected to belong to the class.
// The function then checks the Any / Assigned / ASCII classes and \N{...}
// named-character escapes.
//
void test_unicode()
{
using namespace boost::regex_constants;
// Letters:
TEST_REGEX_CLASS_U(L*, 3108);
TEST_REGEX_CLASS_U(Letter, 3108);
TEST_REGEX_CLASS_U(Lu, 2145);
TEST_REGEX_CLASS_U(Uppercase Letter, 2145);
TEST_REGEX_CLASS_U(Ll, 2146);
TEST_REGEX_CLASS_U(Lowercase Letter, 2146);
TEST_REGEX_CLASS_U(Lt, 1FFC);
TEST_REGEX_CLASS_U(Titlecase Letter, 1FFC);
TEST_REGEX_CLASS_U(Lm, 1D61);
TEST_REGEX_CLASS_U(Modifier Letter, 1D61);
TEST_REGEX_CLASS_U(Lo, 1974);
TEST_REGEX_CLASS_U(Other Letter, 1974);
// Marks:
TEST_REGEX_CLASS_U(M*, 20EA);
TEST_REGEX_CLASS_U(Mark, 20EA);
TEST_REGEX_CLASS_U(Mn, 20EA);
TEST_REGEX_CLASS_U(Non-Spacing Mark, 20EA);
TEST_REGEX_CLASS_U(Mc, 1938);
TEST_REGEX_CLASS_U(Spacing Combining Mark, 1938);
TEST_REGEX_CLASS_U(Me, 06DE);
TEST_REGEX_CLASS_U(Enclosing Mark, 06DE);
// Numbers:
TEST_REGEX_CLASS_U(N*, 0669);
TEST_REGEX_CLASS_U(Number, 0669);
TEST_REGEX_CLASS_U(Nd, 0669);
TEST_REGEX_CLASS_U(Decimal Digit Number, 0669);
TEST_REGEX_CLASS_U(Nl, 303A);
TEST_REGEX_CLASS_U(Letter Number, 303A);
TEST_REGEX_CLASS_U(No, 2793);
TEST_REGEX_CLASS_U(Other Number, 2793);
// Symbols:
TEST_REGEX_CLASS_U(S*, 2144);
TEST_REGEX_CLASS_U(Symbol, 2144);
TEST_REGEX_CLASS_U(Sm, 2144);
TEST_REGEX_CLASS_U(Math Symbol, 2144);
TEST_REGEX_CLASS_U(Sc, 20B1);
TEST_REGEX_CLASS_U(Currency Symbol, 20B1);
TEST_REGEX_CLASS_U(Sk, 1FFE);
TEST_REGEX_CLASS_U(Modifier Symbol, 1FFE);
TEST_REGEX_CLASS_U(So, 19FF);
TEST_REGEX_CLASS_U(Other Symbol, 19FF);
// Punctuation:
TEST_REGEX_CLASS_U(P*, 005F);
TEST_REGEX_CLASS_U(Punctuation, 005F);
TEST_REGEX_CLASS_U(Pc, 005F);
TEST_REGEX_CLASS_U(Connector Punctuation, 005F);
TEST_REGEX_CLASS_U(Pd, 002D);
TEST_REGEX_CLASS_U(Dash Punctuation, 002D);
TEST_REGEX_CLASS_U(Ps, 0028);
TEST_REGEX_CLASS_U(Open Punctuation, 0028);
TEST_REGEX_CLASS_U(Pe, FF63);
TEST_REGEX_CLASS_U(Close Punctuation, FF63);
TEST_REGEX_CLASS_U(Pi, 2039);
TEST_REGEX_CLASS_U(Initial Punctuation, 2039);
TEST_REGEX_CLASS_U(Pf, 203A);
TEST_REGEX_CLASS_U(Final Punctuation, 203A);
TEST_REGEX_CLASS_U(Po, 2038);
TEST_REGEX_CLASS_U(Other Punctuation, 2038);
// Separators:
TEST_REGEX_CLASS_U(Z*, 202F);
TEST_REGEX_CLASS_U(Separator, 202F);
TEST_REGEX_CLASS_U(Zs, 202F);
TEST_REGEX_CLASS_U(Space Separator, 202F);
TEST_REGEX_CLASS_U(Zl, 2028);
TEST_REGEX_CLASS_U(Line Separator, 2028);
TEST_REGEX_CLASS_U(Zp, 2029);
TEST_REGEX_CLASS_U(Paragraph Separator, 2029);
// Other (control, format, surrogate, private use, unassigned):
TEST_REGEX_CLASS_U(C*, 009F);
TEST_REGEX_CLASS_U(Other, 009F);
TEST_REGEX_CLASS_U(Cc, 009F);
TEST_REGEX_CLASS_U(Control, 009F);
TEST_REGEX_CLASS_U(Cf, FFFB);
TEST_REGEX_CLASS_U(Format, FFFB);
TEST_REGEX_CLASS_U(Cs, DC00);
TEST_REGEX_CLASS_U(Surrogate, DC00);
TEST_REGEX_CLASS_U(Co, F8FF);
TEST_REGEX_CLASS_U(Private Use, F8FF);
TEST_REGEX_CLASS_U(Cn, FFFF);
TEST_REGEX_CLASS_U(Not Assigned, FFFF);
// Classes that are not general categories (Any, Assigned, ASCII):
TEST_REGEX_CLASS_U(Any, 2038);
TEST_REGEX_CLASS_U(Assigned, 2038);
TEST_REGEX_CLASS_U(ASCII, 7f);
// Negative cases: an answer table of just (-2, -2) is presumably "no match".
// 0xFFFF is the sample used above for "Not Assigned", and 0x80 is the first
// value above the ASCII sample 0x7f.
TEST_REGEX_SEARCH_U(L"[[:Assigned:]]", perl, L"\xffff", match_default, make_array(-2, -2));
TEST_REGEX_SEARCH_U(L"[[:ASCII:]]", perl, L"\x80", match_default, make_array(-2, -2));
// Named characters via \N{name}, first as a bare escape ...
TEST_REGEX_SEARCH_U(L"\\N{KHMER DIGIT SIX}", perl, L"\x17E6", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH_U(L"\\N{MODIFIER LETTER LOW ACUTE ACCENT}", perl, L"\x02CF", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH_U(L"\\N{SUPERSCRIPT ONE}", perl, L"\x00B9", match_default, make_array(0, 1, -2, -2));
// ... then the same names used inside a character set:
TEST_REGEX_SEARCH_U(L"[\\N{KHMER DIGIT SIX}]", perl, L"\x17E6", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH_U(L"[\\N{MODIFIER LETTER LOW ACUTE ACCENT}]", perl, L"\x02CF", match_default, make_array(0, 1, -2, -2));
TEST_REGEX_SEARCH_U(L"[\\N{SUPERSCRIPT ONE}]", perl, L"\x00B9", match_default, make_array(0, 1, -2, -2));
// A name that embeds the code point itself:
TEST_REGEX_SEARCH_U(L"\\N{CJK UNIFIED IDEOGRAPH-7FED}", perl, L"\x7FED", match_default, make_array(0, 1, -2, -2));
}
#else
// No ICU support (BOOST_HAS_ICU not defined): provide an empty test_unicode()
// so that code calling it still compiles and links; no Unicode tests are run.
void test_unicode(){}
#endif

View File

@ -36,25 +36,70 @@ void spot_checks()
BOOST_CHECK_EQUAL(*--it, 0xDF02u);
BOOST_CHECK_EQUAL(*--it, 0xD800u);
::boost::uint32_t spot8[] = { 0x004Du, 0x0430u, 0x4E8Cu, 0x10302u, };
typedef boost::u32_to_u8_iterator<const ::boost::uint32_t*> u32to8type;
u32to8type it8(spot8);
BOOST_CHECK_EQUAL(*it8++, 0x4Du);
BOOST_CHECK_EQUAL(*it8++, 0xD0u);
BOOST_CHECK_EQUAL(*it8++, 0xB0u);
BOOST_CHECK_EQUAL(*it8++, 0xE4u);
BOOST_CHECK_EQUAL(*it8++, 0xBAu);
BOOST_CHECK_EQUAL(*it8++, 0x8Cu);
BOOST_CHECK_EQUAL(*it8++, 0xF0u);
BOOST_CHECK_EQUAL(*it8++, 0x90u);
BOOST_CHECK_EQUAL(*it8++, 0x8Cu);
BOOST_CHECK_EQUAL(*it8++, 0x82u);
BOOST_CHECK_EQUAL(*--it8, 0x82u);
BOOST_CHECK_EQUAL(*--it8, 0x8Cu);
BOOST_CHECK_EQUAL(*--it8, 0x90u);
BOOST_CHECK_EQUAL(*--it8, 0xF0u);
BOOST_CHECK_EQUAL(*--it8, 0x8Cu);
BOOST_CHECK_EQUAL(*--it8, 0xBAu);
BOOST_CHECK_EQUAL(*--it8, 0xE4u);
BOOST_CHECK_EQUAL(*--it8, 0xB0u);
BOOST_CHECK_EQUAL(*--it8, 0xD0u);
BOOST_CHECK_EQUAL(*--it8, 0x4Du);
}
void test(const std::vector< ::boost::uint32_t>& v)
{
typedef std::vector< ::boost::uint32_t> vector32_type;
typedef std::vector< ::boost::uint16_t> vector16_type;
typedef std::vector< ::boost::uint8_t> vector8_type;
typedef boost::u32_to_u16_iterator<vector32_type::const_iterator, ::boost::uint16_t> u32to16type;
typedef boost::u16_to_u32_iterator<vector16_type::const_iterator, ::boost::uint32_t> u16to32type;
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
typedef std::reverse_iterator<u32to16type> ru32to16type;
typedef std::reverse_iterator<u16to32type> ru16to32type;
#endif
typedef boost::u32_to_u8_iterator<vector32_type::const_iterator, ::boost::uint8_t> u32to8type;
typedef boost::u8_to_u32_iterator<vector8_type::const_iterator, ::boost::uint32_t> u8to32type;
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
typedef std::reverse_iterator<u32to8type> ru32to8type;
typedef std::reverse_iterator<u8to32type> ru8to32type;
#endif
vector8_type v8;
vector16_type v16;
vector32_type v32;
vector32_type::const_iterator i, j, k;
//
// begin by testing forward iteration, of 32-16 bit interconversions:
//
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
v16.assign(u32to16type(v.begin()), u32to16type(v.end()));
#else
v16.clear();
std::copy(u32to16type(v.begin()), u32to16type(v.end()), std::back_inserter(v16));
#endif
BOOST_CHECK_EQUAL(std::distance(u32to16type(v.begin()), u32to16type(v.end())), v16.size());
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
v32.assign(u16to32type(v16.begin()), u16to32type(v16.end()));
#else
v32.clear();
std::copy(u16to32type(v16.begin()), u16to32type(v16.end()), std::back_inserter(v32));
#endif
BOOST_CHECK_EQUAL(std::distance(u16to32type(v16.begin()), u16to32type(v16.end())), v32.size());
BOOST_CHECK_EQUAL(v.size(), v32.size());
i = v.begin();
@ -68,6 +113,7 @@ void test(const std::vector< ::boost::uint32_t>& v)
//
// test backward iteration, of 32-16 bit interconversions:
//
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
v16.assign(ru32to16type(u32to16type(v.end())), ru32to16type(u32to16type(v.begin())));
BOOST_CHECK_EQUAL(std::distance(ru32to16type(u32to16type(v.end())), ru32to16type(u32to16type(v.begin()))), v16.size());
std::reverse(v16.begin(), v16.end());
@ -83,6 +129,53 @@ void test(const std::vector< ::boost::uint32_t>& v)
i,
j,
k);
#endif
//
// Test forward iteration, of 32-8 bit interconversions:
//
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
v8.assign(u32to8type(v.begin()), u32to8type(v.end()));
#else
v8.clear();
std::copy(u32to8type(v.begin()), u32to8type(v.end()), std::back_inserter(v8));
#endif
BOOST_CHECK_EQUAL(std::distance(u32to8type(v.begin()), u32to8type(v.end())), v8.size());
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
v32.assign(u8to32type(v8.begin()), u8to32type(v8.end()));
#else
v32.clear();
std::copy(u8to32type(v8.begin()), u8to32type(v8.end()), std::back_inserter(v32));
#endif
BOOST_CHECK_EQUAL(std::distance(u8to32type(v8.begin()), u8to32type(v8.end())), v32.size());
BOOST_CHECK_EQUAL(v.size(), v32.size());
i = v.begin();
j = i;
std::advance(j, (std::min)(v.size(), v32.size()));
k = v32.begin();
BOOST_CHECK_EQUAL_COLLECTIONS(
i,
j,
k);
//
// test backward iteration, of 32-8 bit interconversions:
//
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
v8.assign(ru32to8type(u32to8type(v.end())), ru32to8type(u32to8type(v.begin())));
BOOST_CHECK_EQUAL(std::distance(ru32to8type(u32to8type(v.end())), ru32to8type(u32to8type(v.begin()))), v8.size());
std::reverse(v8.begin(), v8.end());
v32.assign(ru8to32type(u8to32type(v8.end())), ru8to32type(u8to32type(v8.begin())));
BOOST_CHECK_EQUAL(std::distance(ru8to32type(u8to32type(v8.end())), ru8to32type(u8to32type(v8.begin()))), v32.size());
BOOST_CHECK_EQUAL(v.size(), v32.size());
std::reverse(v32.begin(), v32.end());
i = v.begin();
j = i;
std::advance(j, (std::min)(v.size(), v32.size()));
k = v32.begin();
BOOST_CHECK_EQUAL_COLLECTIONS(
i,
j,
k);
#endif
}
int test_main( int, char* [] )
@ -98,6 +191,12 @@ int test_main( int, char* [] )
v.push_back(0xFFFF);
v.push_back(0x10000);
v.push_back(0x10FFFF);
v.push_back(0x80u);
v.push_back(0x80u - 1);
v.push_back(0x800u);
v.push_back(0x800u - 1);
v.push_back(0x10000u);
v.push_back(0x10000u - 1);
test(v);
return 0;
}