forked from boostorg/regex
Added support for Perl style \N \P and \p.
Completed first draft of Unicode UCS-4 support. Broken compiler compatibility fixes. Added unicode_iterators. [SVN r26185]
This commit is contained in:
@ -42,6 +42,7 @@ SOURCES =
|
||||
cpp_regex_traits.cpp
|
||||
cregex.cpp
|
||||
fileiter.cpp
|
||||
icu.cpp
|
||||
instances.cpp
|
||||
posix_api.cpp
|
||||
regex.cpp
|
||||
|
@ -179,7 +179,7 @@
|
||||
# define BOOST_REGEX_DECL
|
||||
#endif
|
||||
|
||||
#if (defined(BOOST_MSVC) || defined(__BORLANDC__)) && !defined(BOOST_REGEX_NO_LIB) && !defined(BOOST_REGEX_SOURCE) && !defined(BOOST_ALL_NO_LIB) && defined(__cplusplus)
|
||||
#if !defined(BOOST_REGEX_NO_LIB) && !defined(BOOST_REGEX_SOURCE) && !defined(BOOST_ALL_NO_LIB) && defined(__cplusplus)
|
||||
# define BOOST_LIB_NAME boost_regex
|
||||
# if defined(BOOST_REGEX_DYN_LINK) || defined(BOOST_ALL_DYN_LINK)
|
||||
# define BOOST_DYN_LINK
|
||||
|
@ -177,6 +177,8 @@ private:
|
||||
offset_xdigit = U_CHAR_CATEGORY_COUNT+2,
|
||||
offset_underscore = U_CHAR_CATEGORY_COUNT+3,
|
||||
offset_unicode = U_CHAR_CATEGORY_COUNT+4,
|
||||
offset_any = U_CHAR_CATEGORY_COUNT+5,
|
||||
offset_ascii = U_CHAR_CATEGORY_COUNT+6,
|
||||
};
|
||||
|
||||
//
|
||||
@ -187,6 +189,10 @@ private:
|
||||
static const char_class_type mask_xdigit;
|
||||
static const char_class_type mask_underscore;
|
||||
static const char_class_type mask_unicode;
|
||||
static const char_class_type mask_any;
|
||||
static const char_class_type mask_ascii;
|
||||
|
||||
static char_class_type lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2);
|
||||
|
||||
boost::shared_ptr< ::boost::re_detail::icu_regex_traits_implementation> m_pimpl;
|
||||
};
|
||||
|
@ -49,35 +49,63 @@ inline bool is_surrogate(T v)
|
||||
return (v & 0xF800u) == 0xd800;
|
||||
}
|
||||
|
||||
//
// Returns the length, in bytes, of the UTF-8 sequence introduced by the
// lead byte c.
//
// The length is the number of consecutive set bits counted down from the
// most significant bit: if the most significant zero bit is in position
// 8-N then the sequence is N bytes long.  A byte whose top bit is clear is
// a complete one-byte sequence, so a count of zero is reported as 1.
//
// UTF-8 never uses more than four bytes per code point, so the malformed
// lead bytes 0xF8..0xFF (which would otherwise report 5..8) are clamped to
// 4.  This keeps the value safe to use as a bound or table index.
//
// Note that a continuation byte (10xxxxxx) yields 1; callers that need to
// reject continuation bytes must test for them separately.
//
inline unsigned utf8_byte_count(boost::uint8_t c)
{
   boost::uint8_t mask = 0x80u;
   unsigned result = 0;
   // mask eventually shifts down to zero, so this terminates even for 0xFF:
   while(c & mask)
   {
      ++result;
      mask >>= 1;
   }
   if(result == 0)
      return 1;
   return (result > 4) ? 4 : result;
}
|
||||
|
||||
//
// Number of bytes expected to follow the lead byte c: the whole sequence
// length as reported by utf8_byte_count(), less the lead byte itself.
//
inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
{
   const unsigned sequence_length = utf8_byte_count(c);
   return sequence_length - 1;
}
|
||||
|
||||
}
|
||||
|
||||
template <class BaseIterator, class U16Type = ::boost::uint16_t>
|
||||
class u32_to_u16_iterator
|
||||
: public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
|
||||
{
|
||||
typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
|
||||
typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
|
||||
|
||||
#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
||||
|
||||
BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
|
||||
BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
|
||||
#endif
|
||||
|
||||
public:
|
||||
typename base_type::reference
|
||||
dereference()const
|
||||
{
|
||||
if(m_current == 2)
|
||||
const_cast<u32_to_u16_iterator*>(this)->extract_current();
|
||||
extract_current();
|
||||
return m_values[m_current];
|
||||
}
|
||||
bool equal(const u32_to_u16_iterator& that)const
|
||||
{
|
||||
if(m_position == that.m_position)
|
||||
{
|
||||
// Both m_currents must be equal, or both even
|
||||
// this is the same as saying their sum must be even:
|
||||
return (m_current + that.m_current) & 1u ? false : true;
|
||||
/*
|
||||
if((m_current >= 2) && (that.m_current < 2))
|
||||
const_cast<u32_to_u16_iterator*>(this)->extract_current();
|
||||
else if((m_current < 2) && (that.m_current >= 2))
|
||||
const_cast<u32_to_u16_iterator&>(that).extract_current();
|
||||
if(m_current == that.m_current)
|
||||
return true;
|
||||
*/
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -127,7 +155,7 @@ public:
|
||||
m_values[2] = 0;
|
||||
}
|
||||
private:
|
||||
void invalid_code_point(::boost::uint32_t val)
|
||||
static void invalid_code_point(::boost::uint32_t val)
|
||||
{
|
||||
#ifndef BOOST_NO_STD_LOCALE
|
||||
std::stringstream ss;
|
||||
@ -139,15 +167,15 @@ private:
|
||||
boost::throw_exception(e);
|
||||
}
|
||||
|
||||
void extract_current()
|
||||
void extract_current()const
|
||||
{
|
||||
// begin by checking for a code point out of range:
|
||||
if(static_cast< ::boost::uint32_t>(*m_position) >= 0x10000u)
|
||||
::boost::uint32_t v = *m_position;
|
||||
if(v >= 0x10000u)
|
||||
{
|
||||
if(static_cast< ::boost::uint32_t>(*m_position) > 0x10FFFFu)
|
||||
if(v > 0x10FFFFu)
|
||||
invalid_code_point(*m_position);
|
||||
// split into two surrogates:
|
||||
base_value_type v = *m_position;
|
||||
m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
|
||||
m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
|
||||
m_current = 0;
|
||||
@ -166,29 +194,31 @@ private:
|
||||
}
|
||||
}
|
||||
BaseIterator m_position;
|
||||
U16Type m_values[3];
|
||||
unsigned m_current;
|
||||
mutable U16Type m_values[3];
|
||||
mutable unsigned m_current;
|
||||
};
|
||||
|
||||
template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
||||
class u16_to_u32_iterator
|
||||
: public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
|
||||
{
|
||||
typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
|
||||
typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
|
||||
// special values for pending iterator reads:
|
||||
BOOST_STATIC_CONSTANT(::boost::uint32_t, pending_read = 0xffffffffu);
|
||||
|
||||
#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
||||
|
||||
BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
|
||||
BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
|
||||
#endif
|
||||
|
||||
public:
|
||||
typename base_type::reference
|
||||
dereference()const
|
||||
{
|
||||
if(m_value == pending_read)
|
||||
const_cast<u16_to_u32_iterator*>(this)->extract_current();
|
||||
extract_current();
|
||||
return m_value;
|
||||
}
|
||||
bool equal(const u16_to_u32_iterator& that)const
|
||||
@ -223,7 +253,7 @@ public:
|
||||
m_value = pending_read;
|
||||
}
|
||||
private:
|
||||
void invalid_code_point(::boost::uint16_t val)
|
||||
static void invalid_code_point(::boost::uint16_t val)
|
||||
{
|
||||
#ifndef BOOST_NO_STD_LOCALE
|
||||
std::stringstream ss;
|
||||
@ -234,28 +264,254 @@ private:
|
||||
#endif
|
||||
boost::throw_exception(e);
|
||||
}
|
||||
void extract_current()
|
||||
void extract_current()const
|
||||
{
|
||||
m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
|
||||
// if the last value is a high surrogate then adjust m_position and m_value as needed:
|
||||
if(detail::is_high_surrogate(*m_position))
|
||||
{
|
||||
// precondition; next value must be a low-surrogate:
|
||||
::boost::uint16_t t = *++m_position;
|
||||
if((*m_position & 0xFC00u) != 0xDC00u)
|
||||
BaseIterator next(m_position);
|
||||
::boost::uint16_t t = *++next;
|
||||
if((t & 0xFC00u) != 0xDC00u)
|
||||
invalid_code_point(t);
|
||||
m_value = (m_value - detail::high_surrogate_base) << 10;
|
||||
m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
|
||||
--m_position;
|
||||
}
|
||||
// postcondition; result must not be a surrogate:
|
||||
if(detail::is_surrogate(m_value))
|
||||
invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
|
||||
}
|
||||
BaseIterator m_position;
|
||||
U32Type m_value;
|
||||
mutable U32Type m_value;
|
||||
};
|
||||
|
||||
}
|
||||
template <class BaseIterator, class U8Type = ::boost::uint8_t>
|
||||
class u32_to_u8_iterator
|
||||
: public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
|
||||
{
|
||||
typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
|
||||
|
||||
#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
||||
|
||||
BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
|
||||
BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
|
||||
#endif
|
||||
|
||||
public:
|
||||
typename base_type::reference
|
||||
dereference()const
|
||||
{
|
||||
if(m_current == 4)
|
||||
extract_current();
|
||||
return m_values[m_current];
|
||||
}
|
||||
bool equal(const u32_to_u8_iterator& that)const
|
||||
{
|
||||
if(m_position == that.m_position)
|
||||
{
|
||||
// either the m_current's must be equal, or one must be 0 and
|
||||
// the other 4: which means neither must have bits 1 or 2 set:
|
||||
return (m_current == that.m_current)
|
||||
|| (((m_current | that.m_current) & 3) == 0);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
void increment()
|
||||
{
|
||||
// if we have a pending read then read now, so that we know whether
|
||||
// to skip a position, or move to a low-surrogate:
|
||||
if(m_current == 4)
|
||||
{
|
||||
// pending read:
|
||||
extract_current();
|
||||
}
|
||||
// move to the next surrogate position:
|
||||
++m_current;
|
||||
// if we've reached the end skip a position:
|
||||
if(m_values[m_current] == 0)
|
||||
{
|
||||
m_current = 4;
|
||||
++m_position;
|
||||
}
|
||||
}
|
||||
void decrement()
|
||||
{
|
||||
if((m_current & 3) == 0)
|
||||
{
|
||||
--m_position;
|
||||
extract_current();
|
||||
m_current = 3;
|
||||
while(m_current && (m_values[m_current] == 0))
|
||||
--m_current;
|
||||
}
|
||||
else
|
||||
--m_current;
|
||||
}
|
||||
BaseIterator base()const
|
||||
{
|
||||
return m_position;
|
||||
}
|
||||
// construct:
|
||||
u32_to_u8_iterator() : m_position(), m_current(0)
|
||||
{
|
||||
m_values[4] = 0;
|
||||
}
|
||||
u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
|
||||
{
|
||||
m_values[4] = 0;
|
||||
}
|
||||
private:
|
||||
static void invalid_code_point(::boost::uint32_t val)
|
||||
{
|
||||
#ifndef BOOST_NO_STD_LOCALE
|
||||
std::stringstream ss;
|
||||
ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-8 sequence";
|
||||
std::out_of_range e(ss.str());
|
||||
#else
|
||||
std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-8 sequence");
|
||||
#endif
|
||||
boost::throw_exception(e);
|
||||
}
|
||||
|
||||
void extract_current()const
|
||||
{
|
||||
boost::uint32_t c = *m_position;
|
||||
if(c > 0x10FFFFu)
|
||||
invalid_code_point(c);
|
||||
if(c < 0x80u)
|
||||
{
|
||||
m_values[0] = static_cast<unsigned char>(c);
|
||||
m_values[1] = static_cast<unsigned char>(0u);
|
||||
m_values[2] = static_cast<unsigned char>(0u);
|
||||
m_values[3] = static_cast<unsigned char>(0u);
|
||||
}
|
||||
else if(c < 0x800u)
|
||||
{
|
||||
m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
|
||||
m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
||||
m_values[2] = static_cast<unsigned char>(0u);
|
||||
m_values[3] = static_cast<unsigned char>(0u);
|
||||
}
|
||||
else if(c < 0x10000u)
|
||||
{
|
||||
m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
|
||||
m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
||||
m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
||||
m_values[3] = static_cast<unsigned char>(0u);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
|
||||
m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
|
||||
m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
||||
m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
||||
}
|
||||
m_current= 0;
|
||||
}
|
||||
BaseIterator m_position;
|
||||
mutable U8Type m_values[5];
|
||||
mutable unsigned m_current;
|
||||
};
|
||||
|
||||
template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
||||
class u8_to_u32_iterator
|
||||
: public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
|
||||
{
|
||||
typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
|
||||
// special values for pending iterator reads:
|
||||
BOOST_STATIC_CONSTANT(::boost::uint32_t, pending_read = 0xffffffffu);
|
||||
|
||||
#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
||||
|
||||
BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
|
||||
BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
|
||||
#endif
|
||||
|
||||
public:
|
||||
typename base_type::reference
|
||||
dereference()const
|
||||
{
|
||||
if(m_value == pending_read)
|
||||
extract_current();
|
||||
return m_value;
|
||||
}
|
||||
bool equal(const u8_to_u32_iterator& that)const
|
||||
{
|
||||
return m_position == that.m_position;
|
||||
}
|
||||
void increment()
|
||||
{
|
||||
// skip high surrogate first if there is one:
|
||||
unsigned c = detail::utf8_byte_count(*m_position);
|
||||
std::advance(m_position, c);
|
||||
m_value = pending_read;
|
||||
}
|
||||
void decrement()
|
||||
{
|
||||
// Keep backtracking until we don't have a trailing character:
|
||||
unsigned count = 0;
|
||||
while((*--m_position & 0xC0u) == 0x80u) ++count;
|
||||
// now check that the sequence was valid:
|
||||
if(count != detail::utf8_trailing_byte_count(*m_position))
|
||||
invalid_sequnce();
|
||||
m_value = pending_read;
|
||||
}
|
||||
BaseIterator base()const
|
||||
{
|
||||
return m_position;
|
||||
}
|
||||
// construct:
|
||||
u8_to_u32_iterator() : m_position()
|
||||
{
|
||||
m_value = pending_read;
|
||||
}
|
||||
u8_to_u32_iterator(BaseIterator b) : m_position(b)
|
||||
{
|
||||
m_value = pending_read;
|
||||
}
|
||||
private:
|
||||
static void invalid_sequnce()
|
||||
{
|
||||
std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
|
||||
boost::throw_exception(e);
|
||||
}
|
||||
void extract_current()const
|
||||
{
|
||||
m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
|
||||
// we must not have a continuation character:
|
||||
if((m_value & 0xC0u) == 0x80u)
|
||||
invalid_sequnce();
|
||||
// see how many extra byts we have:
|
||||
unsigned extra = detail::utf8_trailing_byte_count(*m_position);
|
||||
// extract the extra bits, 6 from each extra byte:
|
||||
BaseIterator next(m_position);
|
||||
for(unsigned c = 0; c < extra; ++c)
|
||||
{
|
||||
++next;
|
||||
m_value <<= 6;
|
||||
m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
|
||||
}
|
||||
// we now need to remove a few of the leftmost bits, but how many depends
|
||||
// upon how many extra bytes we've extracted:
|
||||
static const boost::uint32_t masks[] =
|
||||
{
|
||||
0x7Fu,
|
||||
0x7FFu,
|
||||
0xFFFFu,
|
||||
0x1FFFFFu,
|
||||
};
|
||||
m_value &= masks[extra];
|
||||
// check the result:
|
||||
if(m_value > 0x10FFFFu)
|
||||
invalid_sequnce();
|
||||
}
|
||||
BaseIterator m_position;
|
||||
mutable U32Type m_value;
|
||||
};
|
||||
|
||||
} // namespace boost
|
||||
|
||||
#endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
|
||||
|
@ -611,6 +611,52 @@ bool basic_regex_parser<charT, traits>::parse_extended_escape()
|
||||
++m_position;
|
||||
this->append_state(syntax_element_restart_continue);
|
||||
break;
|
||||
case regex_constants::escape_type_not_property:
|
||||
negate = true;
|
||||
// fall through:
|
||||
case regex_constants::escape_type_property:
|
||||
{
|
||||
++m_position;
|
||||
char_class_type m;
|
||||
if(m_position == m_end)
|
||||
{
|
||||
fail(regex_constants::error_escape, m_position - m_base);
|
||||
return false;
|
||||
}
|
||||
// maybe have \p{ddd}
|
||||
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
|
||||
{
|
||||
const charT* base = m_position;
|
||||
// skip forward until we find enclosing brace:
|
||||
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
|
||||
++m_position;
|
||||
if(m_position == m_end)
|
||||
{
|
||||
fail(regex_constants::error_escape, m_position - m_base);
|
||||
return false;
|
||||
}
|
||||
m = this->m_traits.lookup_classname(++base, m_position++);
|
||||
}
|
||||
else
|
||||
{
|
||||
m = this->m_traits.lookup_classname(m_position, m_position+1);
|
||||
++m_position;
|
||||
}
|
||||
if(m != 0)
|
||||
{
|
||||
basic_char_set<charT, traits> char_set;
|
||||
if(negate)
|
||||
char_set.negate();
|
||||
char_set.add_class(m);
|
||||
if(0 == this->append_set(char_set))
|
||||
{
|
||||
fail(regex_constants::error_ctype, m_position - m_base);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
fail(regex_constants::error_ctype, m_position - m_base);
|
||||
}
|
||||
default:
|
||||
this->append_literal(unescape_character());
|
||||
break;
|
||||
@ -948,6 +994,7 @@ bool basic_regex_parser<charT, traits>::parse_set()
|
||||
if(m != 0)
|
||||
{
|
||||
char_set.add_class(m);
|
||||
++m_position;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1373,6 +1420,41 @@ charT basic_regex_parser<charT, traits>::unescape_character()
|
||||
}
|
||||
return static_cast<charT>(val);
|
||||
}
|
||||
case regex_constants::escape_type_named_char:
|
||||
{
|
||||
++m_position;
|
||||
if(m_position == m_end)
|
||||
{
|
||||
fail(regex_constants::error_escape, m_position - m_base);
|
||||
return false;
|
||||
}
|
||||
// maybe have \N{name}
|
||||
if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
|
||||
{
|
||||
const charT* base = m_position;
|
||||
// skip forward until we find enclosing brace:
|
||||
while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
|
||||
++m_position;
|
||||
if(m_position == m_end)
|
||||
{
|
||||
fail(regex_constants::error_escape, m_position - m_base);
|
||||
return false;
|
||||
}
|
||||
string_type s = this->m_traits.lookup_collatename(++base, m_position++);
|
||||
if(s.empty())
|
||||
{
|
||||
fail(regex_constants::error_collate, m_position - m_base);
|
||||
return false;
|
||||
}
|
||||
if(s.size() == 1)
|
||||
{
|
||||
return s[0];
|
||||
}
|
||||
}
|
||||
// fall through is a failure:
|
||||
fail(regex_constants::error_escape, m_position - m_base);
|
||||
return false;
|
||||
}
|
||||
default:
|
||||
result = *m_position;
|
||||
break;
|
||||
|
@ -75,7 +75,7 @@ class basic_regex_formatter
|
||||
public:
|
||||
typedef typename traits::char_type char_type;
|
||||
basic_regex_formatter(OutputIterator o, const Results& r, const traits& t)
|
||||
: m_traits(t), m_results(r), m_out(o), m_state(output_copy) {}
|
||||
: m_traits(t), m_results(r), m_out(o), m_state(output_copy), m_have_conditional(false) {}
|
||||
OutputIterator format(const char_type* p1, const char_type* p2, match_flag_type f);
|
||||
OutputIterator format(const char_type* p1, match_flag_type f)
|
||||
{
|
||||
@ -108,6 +108,7 @@ private:
|
||||
const char_type* m_end; // format string end
|
||||
match_flag_type m_flags; // format flags to use
|
||||
output_state m_state; // what to do with the next character
|
||||
bool m_have_conditional; // we are parsing a conditional
|
||||
private:
|
||||
basic_regex_formatter(const basic_regex_formatter&);
|
||||
basic_regex_formatter& operator=(const basic_regex_formatter&);
|
||||
@ -147,7 +148,10 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_all()
|
||||
if(m_flags & boost::regex_constants::format_all)
|
||||
{
|
||||
++m_position;
|
||||
bool have_conditional = m_have_conditional;
|
||||
m_have_conditional = false;
|
||||
format_until_scope_end();
|
||||
m_have_conditional = have_conditional;
|
||||
if(m_position == m_end)
|
||||
return;
|
||||
BOOST_ASSERT(*m_position == static_cast<char_type>(')'));
|
||||
@ -158,7 +162,6 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_all()
|
||||
++m_position;
|
||||
break;
|
||||
case ')':
|
||||
case ':':
|
||||
if(m_flags & boost::regex_constants::format_all)
|
||||
{
|
||||
return;
|
||||
@ -166,6 +169,14 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_all()
|
||||
put(*m_position);
|
||||
++m_position;
|
||||
break;
|
||||
case ':':
|
||||
if((m_flags & boost::regex_constants::format_all) && m_have_conditional)
|
||||
{
|
||||
return;
|
||||
}
|
||||
put(*m_position);
|
||||
++m_position;
|
||||
break;
|
||||
case '?':
|
||||
if(m_flags & boost::regex_constants::format_all)
|
||||
{
|
||||
@ -405,7 +416,9 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_conditional(
|
||||
// output varies depending upon whether sub-expression v matched or not:
|
||||
if(m_results[v].matched)
|
||||
{
|
||||
m_have_conditional = true;
|
||||
format_all();
|
||||
m_have_conditional = false;
|
||||
if((m_position != m_end) && (*m_position == static_cast<char_type>(':')))
|
||||
{
|
||||
// skip the ':':
|
||||
@ -425,7 +438,9 @@ void basic_regex_formatter<OutputIterator, Results, traits>::format_conditional(
|
||||
output_state saved_state = m_state;
|
||||
m_state = output_none;
|
||||
// format until ':' or ')':
|
||||
m_have_conditional = true;
|
||||
format_all();
|
||||
m_have_conditional = false;
|
||||
// restore state:
|
||||
m_state = saved_state;
|
||||
if((m_position != m_end) && (*m_position == static_cast<char_type>(':')))
|
||||
|
@ -89,7 +89,11 @@ static const escape_syntax_type escape_type_C = 50; /
|
||||
static const escape_syntax_type escape_type_Z = 51; // for \Z
|
||||
static const escape_syntax_type escape_type_G = 52; // for \G
|
||||
|
||||
static const escape_syntax_type syntax_max = 54;
|
||||
static const escape_syntax_type escape_type_property = 54; // for \p
|
||||
static const escape_syntax_type escape_type_not_property = 55; // for \P
|
||||
static const escape_syntax_type escape_type_named_char = 56; // for \N
|
||||
|
||||
static const escape_syntax_type syntax_max = 57;
|
||||
|
||||
}
|
||||
}
|
||||
|
318
src/icu.cpp
318
src/icu.cpp
@ -15,7 +15,10 @@
|
||||
* VERSION see <boost/version.hpp>
|
||||
* DESCRIPTION: Unicode regular expressions on top of the ICU Library.
|
||||
*/
|
||||
#define BOOST_REGEX_SOURCE
|
||||
|
||||
#include <boost/regex/config.hpp>
|
||||
#ifdef BOOST_HAS_ICU
|
||||
#include <boost/regex/icu.hpp>
|
||||
|
||||
namespace boost{
|
||||
@ -64,6 +67,264 @@ const icu_regex_traits::char_class_type icu_regex_traits::mask_space = icu_regex
|
||||
const icu_regex_traits::char_class_type icu_regex_traits::mask_xdigit = icu_regex_traits::char_class_type(1) << offset_xdigit;
|
||||
const icu_regex_traits::char_class_type icu_regex_traits::mask_underscore = icu_regex_traits::char_class_type(1) << offset_underscore;
|
||||
const icu_regex_traits::char_class_type icu_regex_traits::mask_unicode = icu_regex_traits::char_class_type(1) << offset_unicode;
|
||||
const icu_regex_traits::char_class_type icu_regex_traits::mask_any = icu_regex_traits::char_class_type(1) << offset_any;
|
||||
const icu_regex_traits::char_class_type icu_regex_traits::mask_ascii = icu_regex_traits::char_class_type(1) << offset_ascii;
|
||||
|
||||
icu_regex_traits::char_class_type icu_regex_traits::lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2)
|
||||
{
|
||||
static const ::UChar32 prop_name_table[] = {
|
||||
/* any */ 'a', 'n', 'y',
|
||||
/* ascii */ 'a', 's', 'c', 'i', 'i',
|
||||
/* assigned */ 'a', 's', 's', 'i', 'g', 'n', 'e', 'd',
|
||||
/* c* */ 'c', '*',
|
||||
/* cc */ 'c', 'c',
|
||||
/* cf */ 'c', 'f',
|
||||
/* closepunctuation */ 'c', 'l', 'o', 's', 'e', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
|
||||
/* cn */ 'c', 'n',
|
||||
/* co */ 'c', 'o',
|
||||
/* connectorpunctuation */ 'c', 'o', 'n', 'n', 'e', 'c', 't', 'o', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
|
||||
/* control */ 'c', 'o', 'n', 't', 'r', 'o', 'l',
|
||||
/* cs */ 'c', 's',
|
||||
/* currencysymbol */ 'c', 'u', 'r', 'r', 'e', 'n', 'c', 'y', 's', 'y', 'm', 'b', 'o', 'l',
|
||||
/* dashpunctuation */ 'd', 'a', 's', 'h', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
|
||||
/* decimaldigitnumber */ 'd', 'e', 'c', 'i', 'm', 'a', 'l', 'd', 'i', 'g', 'i', 't', 'n', 'u', 'm', 'b', 'e', 'r',
|
||||
/* enclosingmark */ 'e', 'n', 'c', 'l', 'o', 's', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
|
||||
/* finalpunctuation */ 'f', 'i', 'n', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
|
||||
/* format */ 'f', 'o', 'r', 'm', 'a', 't',
|
||||
/* initialpunctuation */ 'i', 'n', 'i', 't', 'i', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
|
||||
/* l* */ 'l', '*',
|
||||
/* letter */ 'l', 'e', 't', 't', 'e', 'r',
|
||||
/* letternumber */ 'l', 'e', 't', 't', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r',
|
||||
/* lineseparator */ 'l', 'i', 'n', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
|
||||
/* ll */ 'l', 'l',
|
||||
/* lm */ 'l', 'm',
|
||||
/* lo */ 'l', 'o',
|
||||
/* lowercaseletter */ 'l', 'o', 'w', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
|
||||
/* lt */ 'l', 't',
|
||||
/* lu */ 'l', 'u',
|
||||
/* m* */ 'm', '*',
|
||||
/* mark */ 'm', 'a', 'r', 'k',
|
||||
/* mathsymbol */ 'm', 'a', 't', 'h', 's', 'y', 'm', 'b', 'o', 'l',
|
||||
/* mc */ 'm', 'c',
|
||||
/* me */ 'm', 'e',
|
||||
/* mn */ 'm', 'n',
|
||||
/* modifierletter */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r',
|
||||
/* modifiersymbol */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l',
|
||||
/* n* */ 'n', '*',
|
||||
/* nd */ 'n', 'd',
|
||||
/* nl */ 'n', 'l',
|
||||
/* no */ 'n', 'o',
|
||||
/* nonspacingmark */ 'n', 'o', 'n', 's', 'p', 'a', 'c', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
|
||||
/* notassigned */ 'n', 'o', 't', 'a', 's', 's', 'i', 'g', 'n', 'e', 'd',
|
||||
/* number */ 'n', 'u', 'm', 'b', 'e', 'r',
|
||||
/* openpunctuation */ 'o', 'p', 'e', 'n', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
|
||||
/* other */ 'o', 't', 'h', 'e', 'r',
|
||||
/* otherletter */ 'o', 't', 'h', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r',
|
||||
/* othernumber */ 'o', 't', 'h', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r',
|
||||
/* otherpunctuation */ 'o', 't', 'h', 'e', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
|
||||
/* othersymbol */ 'o', 't', 'h', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l',
|
||||
/* p* */ 'p', '*',
|
||||
/* paragraphseparator */ 'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
|
||||
/* pc */ 'p', 'c',
|
||||
/* pd */ 'p', 'd',
|
||||
/* pe */ 'p', 'e',
|
||||
/* pf */ 'p', 'f',
|
||||
/* pi */ 'p', 'i',
|
||||
/* po */ 'p', 'o',
|
||||
/* privateuse */ 'p', 'r', 'i', 'v', 'a', 't', 'e', 'u', 's', 'e',
|
||||
/* ps */ 'p', 's',
|
||||
/* punctuation */ 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n',
|
||||
/* s* */ 's', '*',
|
||||
/* sc */ 's', 'c',
|
||||
/* separator */ 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
|
||||
/* sk */ 's', 'k',
|
||||
/* sm */ 's', 'm',
|
||||
/* so */ 's', 'o',
|
||||
/* spaceseparator */ 's', 'p', 'a', 'c', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r',
|
||||
/* spacingcombiningmark */ 's', 'p', 'a', 'c', 'i', 'n', 'g', 'c', 'o', 'm', 'b', 'i', 'n', 'i', 'n', 'g', 'm', 'a', 'r', 'k',
|
||||
/* surrogate */ 's', 'u', 'r', 'r', 'o', 'g', 'a', 't', 'e',
|
||||
/* symbol */ 's', 'y', 'm', 'b', 'o', 'l',
|
||||
/* titlecase */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e',
|
||||
/* titlecaseletter */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
|
||||
/* uppercaseletter */ 'u', 'p', 'p', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r',
|
||||
/* z* */ 'z', '*',
|
||||
/* zl */ 'z', 'l',
|
||||
/* zp */ 'z', 'p',
|
||||
/* zs */ 'z', 's',
|
||||
};
|
||||
|
||||
static const re_detail::character_pointer_range<::UChar32> range_data[] = {
|
||||
{ prop_name_table+0, prop_name_table+3, }, // any
|
||||
{ prop_name_table+3, prop_name_table+8, }, // ascii
|
||||
{ prop_name_table+8, prop_name_table+16, }, // assigned
|
||||
{ prop_name_table+16, prop_name_table+18, }, // c*
|
||||
{ prop_name_table+18, prop_name_table+20, }, // cc
|
||||
{ prop_name_table+20, prop_name_table+22, }, // cf
|
||||
{ prop_name_table+22, prop_name_table+38, }, // closepunctuation
|
||||
{ prop_name_table+38, prop_name_table+40, }, // cn
|
||||
{ prop_name_table+40, prop_name_table+42, }, // co
|
||||
{ prop_name_table+42, prop_name_table+62, }, // connectorpunctuation
|
||||
{ prop_name_table+62, prop_name_table+69, }, // control
|
||||
{ prop_name_table+69, prop_name_table+71, }, // cs
|
||||
{ prop_name_table+71, prop_name_table+85, }, // currencysymbol
|
||||
{ prop_name_table+85, prop_name_table+100, }, // dashpunctuation
|
||||
{ prop_name_table+100, prop_name_table+118, }, // decimaldigitnumber
|
||||
{ prop_name_table+118, prop_name_table+131, }, // enclosingmark
|
||||
{ prop_name_table+131, prop_name_table+147, }, // finalpunctuation
|
||||
{ prop_name_table+147, prop_name_table+153, }, // format
|
||||
{ prop_name_table+153, prop_name_table+171, }, // initialpunctuation
|
||||
{ prop_name_table+171, prop_name_table+173, }, // l*
|
||||
{ prop_name_table+173, prop_name_table+179, }, // letter
|
||||
{ prop_name_table+179, prop_name_table+191, }, // letternumber
|
||||
{ prop_name_table+191, prop_name_table+204, }, // lineseparator
|
||||
{ prop_name_table+204, prop_name_table+206, }, // ll
|
||||
{ prop_name_table+206, prop_name_table+208, }, // lm
|
||||
{ prop_name_table+208, prop_name_table+210, }, // lo
|
||||
{ prop_name_table+210, prop_name_table+225, }, // lowercaseletter
|
||||
{ prop_name_table+225, prop_name_table+227, }, // lt
|
||||
{ prop_name_table+227, prop_name_table+229, }, // lu
|
||||
{ prop_name_table+229, prop_name_table+231, }, // m*
|
||||
{ prop_name_table+231, prop_name_table+235, }, // mark
|
||||
{ prop_name_table+235, prop_name_table+245, }, // mathsymbol
|
||||
{ prop_name_table+245, prop_name_table+247, }, // mc
|
||||
{ prop_name_table+247, prop_name_table+249, }, // me
|
||||
{ prop_name_table+249, prop_name_table+251, }, // mn
|
||||
{ prop_name_table+251, prop_name_table+265, }, // modifierletter
|
||||
{ prop_name_table+265, prop_name_table+279, }, // modifiersymbol
|
||||
{ prop_name_table+279, prop_name_table+281, }, // n*
|
||||
{ prop_name_table+281, prop_name_table+283, }, // nd
|
||||
{ prop_name_table+283, prop_name_table+285, }, // nl
|
||||
{ prop_name_table+285, prop_name_table+287, }, // no
|
||||
{ prop_name_table+287, prop_name_table+301, }, // nonspacingmark
|
||||
{ prop_name_table+301, prop_name_table+312, }, // notassigned
|
||||
{ prop_name_table+312, prop_name_table+318, }, // number
|
||||
{ prop_name_table+318, prop_name_table+333, }, // openpunctuation
|
||||
{ prop_name_table+333, prop_name_table+338, }, // other
|
||||
{ prop_name_table+338, prop_name_table+349, }, // otherletter
|
||||
{ prop_name_table+349, prop_name_table+360, }, // othernumber
|
||||
{ prop_name_table+360, prop_name_table+376, }, // otherpunctuation
|
||||
{ prop_name_table+376, prop_name_table+387, }, // othersymbol
|
||||
{ prop_name_table+387, prop_name_table+389, }, // p*
|
||||
{ prop_name_table+389, prop_name_table+407, }, // paragraphseparator
|
||||
{ prop_name_table+407, prop_name_table+409, }, // pc
|
||||
{ prop_name_table+409, prop_name_table+411, }, // pd
|
||||
{ prop_name_table+411, prop_name_table+413, }, // pe
|
||||
{ prop_name_table+413, prop_name_table+415, }, // pf
|
||||
{ prop_name_table+415, prop_name_table+417, }, // pi
|
||||
{ prop_name_table+417, prop_name_table+419, }, // po
|
||||
{ prop_name_table+419, prop_name_table+429, }, // privateuse
|
||||
{ prop_name_table+429, prop_name_table+431, }, // ps
|
||||
{ prop_name_table+431, prop_name_table+442, }, // punctuation
|
||||
{ prop_name_table+442, prop_name_table+444, }, // s*
|
||||
{ prop_name_table+444, prop_name_table+446, }, // sc
|
||||
{ prop_name_table+446, prop_name_table+455, }, // separator
|
||||
{ prop_name_table+455, prop_name_table+457, }, // sk
|
||||
{ prop_name_table+457, prop_name_table+459, }, // sm
|
||||
{ prop_name_table+459, prop_name_table+461, }, // so
|
||||
{ prop_name_table+461, prop_name_table+475, }, // spaceseparator
|
||||
{ prop_name_table+475, prop_name_table+495, }, // spacingcombiningmark
|
||||
{ prop_name_table+495, prop_name_table+504, }, // surrogate
|
||||
{ prop_name_table+504, prop_name_table+510, }, // symbol
|
||||
{ prop_name_table+510, prop_name_table+519, }, // titlecase
|
||||
{ prop_name_table+519, prop_name_table+534, }, // titlecaseletter
|
||||
{ prop_name_table+534, prop_name_table+549, }, // uppercaseletter
|
||||
{ prop_name_table+549, prop_name_table+551, }, // z*
|
||||
{ prop_name_table+551, prop_name_table+553, }, // zl
|
||||
{ prop_name_table+553, prop_name_table+555, }, // zp
|
||||
{ prop_name_table+555, prop_name_table+557, }, // zs
|
||||
};
|
||||
|
||||
static const icu_regex_traits::char_class_type icu_class_map[] = {
|
||||
icu_regex_traits::mask_any, // any
|
||||
icu_regex_traits::mask_ascii, // ascii
|
||||
(0x3FFFFFFFu) & ~(U_GC_CN_MASK), // assigned
|
||||
U_GC_C_MASK, // c*
|
||||
U_GC_CC_MASK, // cc
|
||||
U_GC_CF_MASK, // cf
|
||||
U_GC_PE_MASK, // closepunctuation
|
||||
U_GC_CN_MASK, // cn
|
||||
U_GC_CO_MASK, // co
|
||||
U_GC_PC_MASK, // connectorpunctuation
|
||||
U_GC_CC_MASK, // control
|
||||
U_GC_CS_MASK, // cs
|
||||
U_GC_SC_MASK, // currencysymbol
|
||||
U_GC_PD_MASK, // dashpunctuation
|
||||
U_GC_ND_MASK, // decimaldigitnumber
|
||||
U_GC_ME_MASK, // enclosingmark
|
||||
U_GC_PF_MASK, // finalpunctuation
|
||||
U_GC_CF_MASK, // format
|
||||
U_GC_PI_MASK, // initialpunctuation
|
||||
U_GC_L_MASK, // l*
|
||||
U_GC_L_MASK, // letter
|
||||
U_GC_NL_MASK, // letternumber
|
||||
U_GC_ZL_MASK, // lineseparator
|
||||
U_GC_LL_MASK, // ll
|
||||
U_GC_LM_MASK, // lm
|
||||
U_GC_LO_MASK, // lo
|
||||
U_GC_LL_MASK, // lowercaseletter
|
||||
U_GC_LT_MASK, // lt
|
||||
U_GC_LU_MASK, // lu
|
||||
U_GC_M_MASK, // m*
|
||||
U_GC_M_MASK, // mark
|
||||
U_GC_SM_MASK, // mathsymbol
|
||||
U_GC_MC_MASK, // mc
|
||||
U_GC_ME_MASK, // me
|
||||
U_GC_MN_MASK, // mn
|
||||
U_GC_LM_MASK, // modifierletter
|
||||
U_GC_SK_MASK, // modifiersymbol
|
||||
U_GC_N_MASK, // n*
|
||||
U_GC_ND_MASK, // nd
|
||||
U_GC_NL_MASK, // nl
|
||||
U_GC_NO_MASK, // no
|
||||
U_GC_MN_MASK, // nonspacingmark
|
||||
U_GC_CN_MASK, // notassigned
|
||||
U_GC_N_MASK, // number
|
||||
U_GC_PS_MASK, // openpunctuation
|
||||
U_GC_C_MASK, // other
|
||||
U_GC_LO_MASK, // otherletter
|
||||
U_GC_NO_MASK, // othernumber
|
||||
U_GC_PO_MASK, // otherpunctuation
|
||||
U_GC_SO_MASK, // othersymbol
|
||||
U_GC_P_MASK, // p*
|
||||
U_GC_ZP_MASK, // paragraphseparator
|
||||
U_GC_PC_MASK, // pc
|
||||
U_GC_PD_MASK, // pd
|
||||
U_GC_PE_MASK, // pe
|
||||
U_GC_PF_MASK, // pf
|
||||
U_GC_PI_MASK, // pi
|
||||
U_GC_PO_MASK, // po
|
||||
U_GC_CO_MASK, // privateuse
|
||||
U_GC_PS_MASK, // ps
|
||||
U_GC_P_MASK, // punctuation
|
||||
U_GC_S_MASK, // s*
|
||||
U_GC_SC_MASK, // sc
|
||||
U_GC_Z_MASK, // separator
|
||||
U_GC_SK_MASK, // sk
|
||||
U_GC_SM_MASK, // sm
|
||||
U_GC_SO_MASK, // so
|
||||
U_GC_ZS_MASK, // spaceseparator
|
||||
U_GC_MC_MASK, // spacingcombiningmark
|
||||
U_GC_CS_MASK, // surrogate
|
||||
U_GC_S_MASK, // symbol
|
||||
U_GC_LT_MASK, // titlecase
|
||||
U_GC_LT_MASK, // titlecaseletter
|
||||
U_GC_LU_MASK, // uppercaseletter
|
||||
U_GC_Z_MASK, // z*
|
||||
U_GC_ZL_MASK, // zl
|
||||
U_GC_ZP_MASK, // zp
|
||||
U_GC_ZS_MASK, // zs
|
||||
};
|
||||
|
||||
|
||||
static const re_detail::character_pointer_range< ::UChar32>* ranges_begin = range_data;
|
||||
static const re_detail::character_pointer_range< ::UChar32>* ranges_end = range_data + (sizeof(range_data)/sizeof(range_data[0]));
|
||||
|
||||
re_detail::character_pointer_range< ::UChar32> t = { p1, p2, };
|
||||
const re_detail::character_pointer_range< ::UChar32>* p = std::lower_bound(ranges_begin, ranges_end, t);
|
||||
if((p != ranges_end) && (t == *p))
|
||||
return icu_class_map[p - ranges_begin];
|
||||
return 0;
|
||||
}
|
||||
|
||||
icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_type* p1, const char_type* p2) const
|
||||
{
|
||||
@ -92,12 +353,33 @@ icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_
|
||||
};
|
||||
|
||||
int id = ::boost::re_detail::get_default_class_id(p1, p2);
|
||||
if(id >= 0)
|
||||
return masks[id+1];
|
||||
char_class_type result = lookup_icu_mask(p1, p2);
|
||||
if(result != 0)
|
||||
return result;
|
||||
|
||||
if(id < 0)
|
||||
{
|
||||
string_type s(p1, p2);
|
||||
for(string_type::size_type i = 0; i < s.size(); ++i)
|
||||
string_type::size_type i = 0;
|
||||
while(i < s.size())
|
||||
{
|
||||
s[i] = static_cast<char>((::u_tolower)(s[i]));
|
||||
if(::u_isspace(s[i]) || (s[i] == '-') || (s[i] == '_'))
|
||||
s.erase(s.begin()+i, s.begin()+i+1);
|
||||
else
|
||||
{
|
||||
s[i] = static_cast<char>((::u_tolower)(s[i]));
|
||||
++i;
|
||||
}
|
||||
}
|
||||
id = ::boost::re_detail::get_default_class_id(&*s.begin(), &*s.begin() + s.size());
|
||||
if(id >= 0)
|
||||
return masks[id+1];
|
||||
result = lookup_icu_mask(&*s.begin(), &*s.begin() + s.size());
|
||||
if(result != 0)
|
||||
return result;
|
||||
}
|
||||
BOOST_ASSERT(id+1 < sizeof(masks) / sizeof(masks[0]));
|
||||
return masks[id+1];
|
||||
@ -109,6 +391,23 @@ icu_regex_traits::string_type icu_regex_traits::lookup_collatename(const char_ty
|
||||
if(std::find_if(p1, p2, std::bind2nd(std::greater< ::UChar32>(), 0x7f)) == p2)
|
||||
{
|
||||
std::string s(p1, p2);
|
||||
// Try Unicode name:
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
UChar32 c = ::u_charFromName(U_UNICODE_CHAR_NAME, s.c_str(), &err);
|
||||
if(U_SUCCESS(err))
|
||||
{
|
||||
result.push_back(c);
|
||||
return result;
|
||||
}
|
||||
// Try Unicode-extended name:
|
||||
err = U_ZERO_ERROR;
|
||||
c = ::u_charFromName(U_EXTENDED_CHAR_NAME, s.c_str(), &err);
|
||||
if(U_SUCCESS(err))
|
||||
{
|
||||
result.push_back(c);
|
||||
return result;
|
||||
}
|
||||
// try POSIX name:
|
||||
s = ::boost::re_detail::lookup_default_collate_name(s);
|
||||
result.assign(s.begin(), s.end());
|
||||
}
|
||||
@ -121,21 +420,26 @@ bool icu_regex_traits::isctype(char_type c, char_class_type f) const
|
||||
{
|
||||
// check for standard categories first:
|
||||
char_class_type m = char_class_type(1u << u_charType(c));
|
||||
if((m & f).any())
|
||||
if((m & f) != 0)
|
||||
return true;
|
||||
// now check for special cases:
|
||||
if((f & mask_blank).any() && u_isblank(c))
|
||||
if(((f & mask_blank) != 0) && u_isblank(c))
|
||||
return true;
|
||||
if((f & mask_space).any() && u_isspace(c))
|
||||
if(((f & mask_space) != 0) && u_isspace(c))
|
||||
return true;
|
||||
if((f & mask_xdigit).any() && (u_digit(c, 16) >= 0))
|
||||
if(((f & mask_xdigit) != 0) && (u_digit(c, 16) >= 0))
|
||||
return true;
|
||||
if((f & mask_unicode).any() && (c >= 0x100))
|
||||
if(((f & mask_unicode) != 0) && (c >= 0x100))
|
||||
return true;
|
||||
if((f & mask_underscore).any() && (c == '_'))
|
||||
if(((f & mask_underscore) != 0) && (c == '_'))
|
||||
return true;
|
||||
if(((f & mask_any) != 0) && (c <= 0x10FFFF))
|
||||
return true;
|
||||
if(((f & mask_ascii) != 0) && (c <= 0x7F))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif // BOOST_HAS_ICU
|
||||
|
@ -96,7 +96,11 @@ BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants
|
||||
"C",
|
||||
"Z",
|
||||
"G",
|
||||
"!", };
|
||||
"!",
|
||||
"p",
|
||||
"P",
|
||||
"N",
|
||||
};
|
||||
|
||||
return ((n >= (sizeof(messages) / sizeof(messages[1]))) ? "" : messages[n]);
|
||||
}
|
||||
@ -374,9 +378,9 @@ BOOST_REGEX_DECL regex_constants::escape_syntax_type BOOST_REGEX_CALL get_defaul
|
||||
regex_constants::escape_type_not_class, /*K*/
|
||||
regex_constants::escape_type_not_class, /*L*/
|
||||
regex_constants::escape_type_not_class, /*M*/
|
||||
regex_constants::escape_type_not_class, /*N*/
|
||||
regex_constants::escape_type_named_char, /*N*/
|
||||
regex_constants::escape_type_not_class, /*O*/
|
||||
regex_constants::escape_type_not_class, /*P*/
|
||||
regex_constants::escape_type_not_property, /*P*/
|
||||
regex_constants::escape_type_Q, /*Q*/
|
||||
regex_constants::escape_type_not_class, /*R*/
|
||||
regex_constants::escape_type_not_class, /*S*/
|
||||
@ -408,7 +412,7 @@ BOOST_REGEX_DECL regex_constants::escape_syntax_type BOOST_REGEX_CALL get_defaul
|
||||
regex_constants::escape_type_class, /*m*/
|
||||
regex_constants::escape_type_control_n, /*n*/
|
||||
regex_constants::escape_type_class, /*o*/
|
||||
regex_constants::escape_type_class, /*p*/
|
||||
regex_constants::escape_type_property, /*p*/
|
||||
regex_constants::escape_type_class, /*q*/
|
||||
regex_constants::escape_type_control_r, /*r*/
|
||||
regex_constants::escape_type_class, /*s*/
|
||||
|
12
test/Jamfile
12
test/Jamfile
@ -26,6 +26,7 @@ test_sets.cpp
|
||||
test_simple_repeats.cpp
|
||||
test_tricky_cases.cpp
|
||||
test_icu.cpp
|
||||
test_unicode.cpp
|
||||
test_overloads.cpp
|
||||
test_operators.cpp
|
||||
;
|
||||
@ -120,6 +121,17 @@ test-suite regex
|
||||
<lib>../../test/build/boost_test_exec_monitor
|
||||
]
|
||||
|
||||
[ run unicode/unicode_iterator_test.cpp ]
|
||||
[ regex-test static_mutex_test
|
||||
: <template>test # sources
|
||||
static_mutex/static_mutex_test.cpp
|
||||
<dll>../../thread/build/boost_thread
|
||||
]
|
||||
[ regex-test object_cache_test
|
||||
: <template>test # sources
|
||||
object_cache/object_cache_test.cpp
|
||||
]
|
||||
|
||||
[ run config_info/regex_config_info.cpp <template>test
|
||||
: : : <test-info>always_show_run_output ]
|
||||
|
||||
|
@ -53,6 +53,7 @@ int cpp_main(int /*argc*/, char * /*argv*/[])
|
||||
test_emacs();
|
||||
test_operators();
|
||||
test_overloads();
|
||||
test_unicode();
|
||||
return error_count;
|
||||
}
|
||||
|
||||
|
@ -215,6 +215,7 @@ void test_en_locale();
|
||||
void test_emacs();
|
||||
void test_operators();
|
||||
void test_overloads();
|
||||
void test_unicode();
|
||||
|
||||
//
|
||||
// template instances:
|
||||
|
@ -19,7 +19,8 @@
|
||||
//
|
||||
// We can only build this if we have ICU support:
|
||||
//
|
||||
#ifdef TEST_ICU
|
||||
#include <boost/regex/config.hpp>
|
||||
#ifdef BOOST_HAS_ICU
|
||||
|
||||
#include <boost/regex/icu.hpp>
|
||||
#include "test.hpp"
|
||||
@ -27,7 +28,6 @@
|
||||
|
||||
void test_icu(const wchar_t&, const test_regex_search_tag& )
|
||||
{
|
||||
typedef boost::u16_to_u32_iterator<std::wstring::const_iterator, ::UChar32> conv_iterator;
|
||||
boost::u32regex r;
|
||||
if(*test_locale::c_str())
|
||||
{
|
||||
@ -37,26 +37,28 @@ void test_icu(const wchar_t&, const test_regex_search_tag& )
|
||||
r.imbue(l);
|
||||
}
|
||||
|
||||
const std::wstring& expression = test_info<wchar_t>::expression();
|
||||
std::vector< ::UChar32> expression;
|
||||
expression.assign(test_info<wchar_t>::expression().begin(), test_info<wchar_t>::expression().end());
|
||||
boost::regex_constants::syntax_option_type syntax_options = test_info<UChar32>::syntax_options();
|
||||
try{
|
||||
r.assign(conv_iterator(expression.begin()), conv_iterator(expression.end()), syntax_options);
|
||||
r.assign(expression.begin(), expression.end(), syntax_options);
|
||||
if(r.status())
|
||||
{
|
||||
BOOST_REGEX_TEST_ERROR("Expression did not compile when it should have done, error code = " << r.status(), UChar32);
|
||||
}
|
||||
const std::wstring& search_text = test_info<wchar_t>::search_text();
|
||||
std::vector< ::UChar32> search_text;
|
||||
search_text.assign(test_info<wchar_t>::search_text().begin(), test_info<wchar_t>::search_text().end());
|
||||
boost::regex_constants::match_flag_type opts = test_info<wchar_t>::match_options();
|
||||
const int* answer_table = test_info<wchar_t>::answer_table();
|
||||
boost::match_results<conv_iterator> what;
|
||||
boost::match_results<std::vector< ::UChar32>::const_iterator> what;
|
||||
if(boost::regex_search(
|
||||
conv_iterator(search_text.begin()),
|
||||
conv_iterator(search_text.end()),
|
||||
const_cast<std::vector< ::UChar32>const&>(search_text).begin(),
|
||||
const_cast<std::vector< ::UChar32>const&>(search_text).end(),
|
||||
what,
|
||||
r,
|
||||
opts))
|
||||
{
|
||||
test_result(what, conv_iterator(search_text.begin()), answer_table);
|
||||
test_result(what, const_cast<std::vector< ::UChar32>const&>(search_text).begin(), answer_table);
|
||||
}
|
||||
else if(answer_table[0] >= 0)
|
||||
{
|
||||
@ -85,7 +87,8 @@ void test_icu(const wchar_t&, const test_regex_search_tag& )
|
||||
void test_icu(const wchar_t&, const test_invalid_regex_tag&)
|
||||
{
|
||||
typedef boost::u16_to_u32_iterator<std::wstring::const_iterator, ::UChar32> conv_iterator;
|
||||
const std::wstring& expression = test_info<wchar_t>::expression();
|
||||
std::vector< ::UChar32> expression;
|
||||
expression.assign(test_info<wchar_t>::expression().begin(), test_info<wchar_t>::expression().end());
|
||||
boost::regex_constants::syntax_option_type syntax_options = test_info<wchar_t>::syntax_options();
|
||||
boost::u32regex r;
|
||||
if(*test_locale::c_str())
|
||||
@ -100,7 +103,7 @@ void test_icu(const wchar_t&, const test_invalid_regex_tag&)
|
||||
//
|
||||
try
|
||||
{
|
||||
if(0 == r.assign(conv_iterator(expression.begin()), conv_iterator(expression.end()), syntax_options | boost::regex_constants::no_except).status())
|
||||
if(0 == r.assign(expression.begin(), expression.end(), syntax_options | boost::regex_constants::no_except).status())
|
||||
{
|
||||
BOOST_REGEX_TEST_ERROR("Expression compiled when it should not have done so.", wchar_t);
|
||||
}
|
||||
@ -114,7 +117,7 @@ void test_icu(const wchar_t&, const test_invalid_regex_tag&)
|
||||
//
|
||||
bool have_catch = false;
|
||||
try{
|
||||
r.assign(conv_iterator(expression.begin()), conv_iterator(expression.end()), syntax_options);
|
||||
r.assign(expression.begin(), expression.end(), syntax_options);
|
||||
#ifdef BOOST_NO_EXCEPTIONS
|
||||
if(r.status())
|
||||
have_catch = true;
|
||||
@ -148,7 +151,8 @@ void test_icu(const wchar_t&, const test_invalid_regex_tag&)
|
||||
|
||||
void test_icu(const wchar_t&, const test_regex_replace_tag&)
|
||||
{
|
||||
const std::wstring& expression = test_info<wchar_t>::expression();
|
||||
std::vector< ::UChar32> expression;
|
||||
expression.assign(test_info<wchar_t>::expression().begin(), test_info<wchar_t>::expression().end());
|
||||
boost::regex_constants::syntax_option_type syntax_options = test_info<UChar32>::syntax_options();
|
||||
boost::u32regex r;
|
||||
try{
|
||||
|
@ -75,6 +75,8 @@ void test_replace()
|
||||
TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "\\0", "\0");
|
||||
TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "()?:", "()?:");
|
||||
TEST_REGEX_REPLACE("a+", perl, "...aaa,,", match_default|format_perl|format_no_copy, "\\0101", "A");
|
||||
TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "\\1", "aa");
|
||||
TEST_REGEX_REPLACE("(a+)(b+)", perl, " ...aabb,,", match_default|format_perl|format_no_copy, "\\2", "bb");
|
||||
|
||||
// move to copying unmatched data:
|
||||
TEST_REGEX_REPLACE("a+", perl, "...aaa,,,", match_default|format_all, "bbb", "...bbb,,,");
|
||||
@ -101,5 +103,10 @@ void test_replace()
|
||||
TEST_REGEX_REPLACE("a+(b+)", perl, "...aaabb,,,", match_default|format_perl|format_no_copy, "(?1abc:def)", "(?1abc:def)");
|
||||
TEST_REGEX_REPLACE("a+(b+)", perl, "...", match_default|format_perl, "(?1abc:def)", "...");
|
||||
TEST_REGEX_REPLACE("a+(b+)", perl, "...", match_default|format_perl|format_no_copy, "(?1abc:def)", "");
|
||||
// probe bug reports and other special cases:
|
||||
TEST_REGEX_REPLACE("([^\\d]+).*", normal|icase, "tesd 999 test", match_default|format_all, "($1)replace", "tesd replace");
|
||||
TEST_REGEX_REPLACE("(a)(b)", perl, "ab", match_default|format_all, "$1:$2", "a:b");
|
||||
TEST_REGEX_REPLACE("(a(c)?)|(b)", perl, "acab", match_default|format_all, "(?1(?2(C:):A):B:)", "C:AB:");
|
||||
|
||||
}
|
||||
|
||||
|
@ -88,6 +88,7 @@ void test_sets()
|
||||
TEST_REGEX_SEARCH("[[:space:]]+", extended, "a \n\t\rb", match_default, make_array(1, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("[[:upper:]]+", extended, "aBCd", match_default, make_array(1, 3, -2, -2));
|
||||
TEST_REGEX_SEARCH("[[:xdigit:]]+", extended, "p0f3Cx", match_default, make_array(1, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("[\\d]+", perl, "a019b", match_default, make_array(1, 4, -2, -2));
|
||||
|
||||
//
|
||||
// escapes are supported in character classes if we have either
|
||||
@ -243,5 +244,66 @@ void test_sets2()
|
||||
TEST_REGEX_SEARCH("[\\s]+", perl, "AB AB", match_default, make_array(2, 5, -2, -2));
|
||||
TEST_INVALID_REGEX("[\\S]", perl);
|
||||
TEST_REGEX_SEARCH("\\S+", perl, " abc ", match_default, make_array(2, 5, -2, -2));
|
||||
|
||||
// and some Perl style properties:
|
||||
TEST_REGEX_SEARCH("\\pl+", perl, "ABabcAB", match_default, make_array(2, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\Pl+", perl, "abABCab", match_default, make_array(2, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\pu+", perl, "abABCab", match_default, make_array(2, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\Pu+", perl, "ABabcAB", match_default, make_array(2, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\pd+", perl, "AB012AB", match_default, make_array(2, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\PD+", perl, "01abc01", match_default, make_array(2, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\ps+", perl, "AB AB", match_default, make_array(2, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\PS+", perl, " abc ", match_default, make_array(2, 5, -2, -2));
|
||||
|
||||
TEST_REGEX_SEARCH("\\p{alnum}+", perl, "-%@a0X_-", match_default, make_array(3, 6, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{alpha}+", perl, " -%@aX_0-", match_default, make_array(4, 6, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{blank}+", perl, "a \tb", match_default, make_array(1, 4, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{cntrl}+", perl, " a\n\tb", match_default, make_array(2, 4, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{digit}+", perl, "a019b", match_default, make_array(1, 4, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{graph}+", perl, " a%b ", match_default, make_array(1, 4, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{lower}+", perl, "AabC", match_default, make_array(1, 3, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{print}+", perl, "AabC", match_default, make_array(0, 4, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{punct}+", perl, " %-&\t", match_default, make_array(1, 4, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{space}+", perl, "a \n\t\rb", match_default, make_array(1, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{upper}+", perl, "aBCd", match_default, make_array(1, 3, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\p{xdigit}+", perl, "p0f3Cx", match_default, make_array(1, 5, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{alnum}+", perl, "-%@a", match_default, make_array(0, 3, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{alpha}+", perl, " -%@a", match_default, make_array(0, 4, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{blank}+", perl, "a ", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{cntrl}+", perl, " a\n", match_default, make_array(0, 2, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{digit}+", perl, "a0", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{graph}+", perl, " a", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{lower}+", perl, "Aa", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{print}+", perl, "Absc", match_default, make_array(-2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{punct}+", perl, " %", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{space}+", perl, "a ", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{upper}+", perl, "aB", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\P{xdigit}+", perl, "pf", match_default, make_array(0, 1, -2, -2));
|
||||
|
||||
TEST_INVALID_REGEX("\\p{invalid class}", perl);
|
||||
TEST_INVALID_REGEX("\\p{upper", perl);
|
||||
TEST_INVALID_REGEX("\\p{", perl);
|
||||
TEST_INVALID_REGEX("\\p", perl);
|
||||
TEST_INVALID_REGEX("\\P{invalid class}", perl);
|
||||
TEST_INVALID_REGEX("\\P{upper", perl);
|
||||
TEST_INVALID_REGEX("\\P{", perl);
|
||||
TEST_INVALID_REGEX("\\P", perl);
|
||||
|
||||
// try named characters:
|
||||
TEST_REGEX_SEARCH("\\N{zero}", perl, "0", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\N{one}", perl, "1", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\N{two}", perl, "2", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\N{three}", perl, "3", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\N{a}", perl, "bac", match_default, make_array(1, 2, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\N{\xf0}", perl, "b\xf0x", match_default, make_array(1, 2, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\N{right-curly-bracket}", perl, "}", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("\\N{NUL}", perl, "\0", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH("[\\N{zero}-\\N{nine}]+", perl, " 0123456789 ", match_default, make_array(1, 11, -2, -2));
|
||||
|
||||
TEST_INVALID_REGEX("\\N", perl);
|
||||
TEST_INVALID_REGEX("\\N{", perl);
|
||||
TEST_INVALID_REGEX("\\N{}", perl);
|
||||
TEST_INVALID_REGEX("\\N{invalid-name}", perl);
|
||||
TEST_INVALID_REGEX("\\N{zero", perl);
|
||||
}
|
||||
|
||||
|
147
test/regress/test_unicode.cpp
Normal file
147
test/regress/test_unicode.cpp
Normal file
@ -0,0 +1,147 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (c) 2004
|
||||
* Dr John Maddock
|
||||
*
|
||||
* Use, modification and distribution are subject to the
|
||||
* Boost Software License, Version 1.0. (See accompanying file
|
||||
* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* LOCATION: see http://www.boost.org for most recent version.
|
||||
* FILE test_unicode.cpp
|
||||
* VERSION see <boost/version.hpp>
|
||||
* DESCRIPTION: Unicode specific tests (requires ICU).
|
||||
*/
|
||||
|
||||
#include <boost/regex/config.hpp>
|
||||
#ifdef BOOST_HAS_ICU
|
||||
#include "test.hpp"
|
||||
|
||||
#ifdef BOOST_MSVC
|
||||
#pragma warning(disable:4127)
|
||||
#endif
|
||||
|
||||
// TEST_REGEX_SEARCH_U(s, f, t, m, a)
// Runs one wide-character search test through the ICU test driver:
//  - s and t are wide string literals (the expression and the text to search);
//    each is copied into a std::wstring whose length is the array size minus
//    one, i.e. without the terminating NUL, so embedded characters are kept.
//  - f, m and a are forwarded unchanged to test_info<wchar_t>::set_info
//    (presumably syntax flags, match flags and the expected-answer array --
//    confirm against test_info).
//  - the test itself is executed by test_icu(wchar_t(0), test_regex_search_tag()).
// The do{...}while(0) wrapper makes the expansion behave as a single statement.
#define TEST_REGEX_SEARCH_U(s, f, t, m, a)\
|
||||
do{\
|
||||
const wchar_t e[] = { s };\
|
||||
std::wstring se(e, (sizeof(e) / sizeof(wchar_t)) - 1);\
|
||||
const wchar_t st[] = { t };\
|
||||
std::wstring sst(st, (sizeof(st) / sizeof(wchar_t)) - 1);\
|
||||
test_info<wchar_t>::set_info(__FILE__, __LINE__, se, f, sst, m, a);\
|
||||
test_icu(wchar_t(0), test_regex_search_tag());\
|
||||
}while(0)
|
||||
|
||||
// TEST_REGEX_CLASS_U(classname, character)
// Expands to a TEST_REGEX_SEARCH_U call whose expression is the wide literal
// L"[[:classname:]]" and whose search text is the wide literal L"\xcharacter"
// (character is a run of hex digits). Both literals are assembled with
// BOOST_JOIN / BOOST_STRINGIZE, the syntax is perl, the match flags are
// match_default, and the answer array passed on is make_array(0, 1, -2, -2).
// NOTE(review): this relies on pasting/stringizing tokens such as "[[:" and
// "\x"; whether every preprocessor accepts that is not visible here -- confirm.
#define TEST_REGEX_CLASS_U(classname, character)\
|
||||
TEST_REGEX_SEARCH_U(\
|
||||
BOOST_JOIN(L, \
|
||||
BOOST_STRINGIZE(\
|
||||
BOOST_JOIN([[:, BOOST_JOIN(classname, :]])))), \
|
||||
perl, \
|
||||
BOOST_JOIN(L, \
|
||||
BOOST_STRINGIZE(\
|
||||
BOOST_JOIN(\x, character))), \
|
||||
match_default, \
|
||||
make_array(0, 1, -2, -2))
|
||||
|
||||
// Unicode-specific regression tests (this definition is compiled only when
// BOOST_HAS_ICU is defined). Each TEST_REGEX_CLASS_U line pairs a character
// class name -- a short general-category code or its long spelling -- with the
// hex code point of one character that the class is expected to match.
// The trailing TEST_REGEX_SEARCH_U lines cover two cases whose answer array is
// only (-2, -2) (presumably "no match expected" -- confirm against test_icu)
// and a set of \N{...} named-character lookups.
void test_unicode()
|
||||
{
|
||||
using namespace boost::regex_constants;
|
||||
|
||||
// Letter classes: L* and its subcategories, short and long names.
TEST_REGEX_CLASS_U(L*, 3108);
|
||||
TEST_REGEX_CLASS_U(Letter, 3108);
|
||||
TEST_REGEX_CLASS_U(Lu, 2145);
|
||||
TEST_REGEX_CLASS_U(Uppercase Letter, 2145);
|
||||
TEST_REGEX_CLASS_U(Ll, 2146);
|
||||
TEST_REGEX_CLASS_U(Lowercase Letter, 2146);
|
||||
TEST_REGEX_CLASS_U(Lt, 1FFC);
|
||||
TEST_REGEX_CLASS_U(Titlecase Letter, 1FFC);
|
||||
TEST_REGEX_CLASS_U(Lm, 1D61);
|
||||
TEST_REGEX_CLASS_U(Modifier Letter, 1D61);
|
||||
TEST_REGEX_CLASS_U(Lo, 1974);
|
||||
TEST_REGEX_CLASS_U(Other Letter, 1974);
|
||||
// Mark classes: M* and its subcategories.
TEST_REGEX_CLASS_U(M*, 20EA);
|
||||
TEST_REGEX_CLASS_U(Mark, 20EA);
|
||||
TEST_REGEX_CLASS_U(Mn, 20EA);
|
||||
TEST_REGEX_CLASS_U(Non-Spacing Mark, 20EA);
|
||||
TEST_REGEX_CLASS_U(Mc, 1938);
|
||||
TEST_REGEX_CLASS_U(Spacing Combining Mark, 1938);
|
||||
TEST_REGEX_CLASS_U(Me, 06DE);
|
||||
TEST_REGEX_CLASS_U(Enclosing Mark, 06DE);
|
||||
// Number classes: N* and its subcategories.
TEST_REGEX_CLASS_U(N*, 0669);
|
||||
TEST_REGEX_CLASS_U(Number, 0669);
|
||||
TEST_REGEX_CLASS_U(Nd, 0669);
|
||||
TEST_REGEX_CLASS_U(Decimal Digit Number, 0669);
|
||||
TEST_REGEX_CLASS_U(Nl, 303A);
|
||||
TEST_REGEX_CLASS_U(Letter Number, 303A);
|
||||
TEST_REGEX_CLASS_U(No, 2793);
|
||||
TEST_REGEX_CLASS_U(Other Number, 2793);
|
||||
|
||||
// Symbol classes: S* and its subcategories.
TEST_REGEX_CLASS_U(S*, 2144);
|
||||
TEST_REGEX_CLASS_U(Symbol, 2144);
|
||||
TEST_REGEX_CLASS_U(Sm, 2144);
|
||||
TEST_REGEX_CLASS_U(Math Symbol, 2144);
|
||||
TEST_REGEX_CLASS_U(Sc, 20B1);
|
||||
TEST_REGEX_CLASS_U(Currency Symbol, 20B1);
|
||||
TEST_REGEX_CLASS_U(Sk, 1FFE);
|
||||
TEST_REGEX_CLASS_U(Modifier Symbol, 1FFE);
|
||||
TEST_REGEX_CLASS_U(So, 19FF);
|
||||
TEST_REGEX_CLASS_U(Other Symbol, 19FF);
|
||||
|
||||
// Punctuation classes: P* and its subcategories.
TEST_REGEX_CLASS_U(P*, 005F);
|
||||
TEST_REGEX_CLASS_U(Punctuation, 005F);
|
||||
TEST_REGEX_CLASS_U(Pc, 005F);
|
||||
TEST_REGEX_CLASS_U(Connector Punctuation, 005F);
|
||||
TEST_REGEX_CLASS_U(Pd, 002D);
|
||||
TEST_REGEX_CLASS_U(Dash Punctuation, 002D);
|
||||
TEST_REGEX_CLASS_U(Ps, 0028);
|
||||
TEST_REGEX_CLASS_U(Open Punctuation, 0028);
|
||||
TEST_REGEX_CLASS_U(Pe, FF63);
|
||||
TEST_REGEX_CLASS_U(Close Punctuation, FF63);
|
||||
TEST_REGEX_CLASS_U(Pi, 2039);
|
||||
TEST_REGEX_CLASS_U(Initial Punctuation, 2039);
|
||||
TEST_REGEX_CLASS_U(Pf, 203A);
|
||||
TEST_REGEX_CLASS_U(Final Punctuation, 203A);
|
||||
TEST_REGEX_CLASS_U(Po, 2038);
|
||||
TEST_REGEX_CLASS_U(Other Punctuation, 2038);
|
||||
|
||||
// Separator classes: Z* and its subcategories.
TEST_REGEX_CLASS_U(Z*, 202F);
|
||||
TEST_REGEX_CLASS_U(Separator, 202F);
|
||||
TEST_REGEX_CLASS_U(Zs, 202F);
|
||||
TEST_REGEX_CLASS_U(Space Separator, 202F);
|
||||
TEST_REGEX_CLASS_U(Zl, 2028);
|
||||
TEST_REGEX_CLASS_U(Line Separator, 2028);
|
||||
TEST_REGEX_CLASS_U(Zp, 2029);
|
||||
TEST_REGEX_CLASS_U(Paragraph Separator, 2029);
|
||||
// "Other" classes: C* and its subcategories (control, format, surrogate,
// private use, not assigned).
TEST_REGEX_CLASS_U(C*, 009F);
|
||||
TEST_REGEX_CLASS_U(Other, 009F);
|
||||
TEST_REGEX_CLASS_U(Cc, 009F);
|
||||
TEST_REGEX_CLASS_U(Control, 009F);
|
||||
TEST_REGEX_CLASS_U(Cf, FFFB);
|
||||
TEST_REGEX_CLASS_U(Format, FFFB);
|
||||
TEST_REGEX_CLASS_U(Cs, DC00);
|
||||
TEST_REGEX_CLASS_U(Surrogate, DC00);
|
||||
TEST_REGEX_CLASS_U(Co, F8FF);
|
||||
TEST_REGEX_CLASS_U(Private Use, F8FF);
|
||||
TEST_REGEX_CLASS_U(Cn, FFFF);
|
||||
TEST_REGEX_CLASS_U(Not Assigned, FFFF);
|
||||
// The extra class names Any, Assigned and ASCII.
TEST_REGEX_CLASS_U(Any, 2038);
|
||||
TEST_REGEX_CLASS_U(Assigned, 2038);
|
||||
TEST_REGEX_CLASS_U(ASCII, 7f);
|
||||
// Same classes against U+FFFF and U+0080, with an answer array of (-2, -2).
TEST_REGEX_SEARCH_U(L"[[:Assigned:]]", perl, L"\xffff", match_default, make_array(-2, -2));
|
||||
TEST_REGEX_SEARCH_U(L"[[:ASCII:]]", perl, L"\x80", match_default, make_array(-2, -2));
|
||||
|
||||
// Named characters via \N{...}, both bare and inside a bracketed set.
TEST_REGEX_SEARCH_U(L"\\N{KHMER DIGIT SIX}", perl, L"\x17E6", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH_U(L"\\N{MODIFIER LETTER LOW ACUTE ACCENT}", perl, L"\x02CF", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH_U(L"\\N{SUPERSCRIPT ONE}", perl, L"\x00B9", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH_U(L"[\\N{KHMER DIGIT SIX}]", perl, L"\x17E6", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH_U(L"[\\N{MODIFIER LETTER LOW ACUTE ACCENT}]", perl, L"\x02CF", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH_U(L"[\\N{SUPERSCRIPT ONE}]", perl, L"\x00B9", match_default, make_array(0, 1, -2, -2));
|
||||
TEST_REGEX_SEARCH_U(L"\\N{CJK UNIFIED IDEOGRAPH-7FED}", perl, L"\x7FED", match_default, make_array(0, 1, -2, -2));
|
||||
}
|
||||
|
||||
#else
|
||||
// Fallback for builds where BOOST_HAS_ICU is not defined: the entry point
// still exists so callers link, but it runs no tests.
void test_unicode(){}
|
||||
#endif
|
@ -36,25 +36,70 @@ void spot_checks()
|
||||
BOOST_CHECK_EQUAL(*--it, 0xDF02u);
|
||||
BOOST_CHECK_EQUAL(*--it, 0xD800u);
|
||||
|
||||
::boost::uint32_t spot8[] = { 0x004Du, 0x0430u, 0x4E8Cu, 0x10302u, };
|
||||
typedef boost::u32_to_u8_iterator<const ::boost::uint32_t*> u32to8type;
|
||||
|
||||
u32to8type it8(spot8);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0x4Du);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0xD0u);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0xB0u);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0xE4u);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0xBAu);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0x8Cu);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0xF0u);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0x90u);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0x8Cu);
|
||||
BOOST_CHECK_EQUAL(*it8++, 0x82u);
|
||||
|
||||
BOOST_CHECK_EQUAL(*--it8, 0x82u);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0x8Cu);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0x90u);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0xF0u);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0x8Cu);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0xBAu);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0xE4u);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0xB0u);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0xD0u);
|
||||
BOOST_CHECK_EQUAL(*--it8, 0x4Du);
|
||||
}
|
||||
|
||||
void test(const std::vector< ::boost::uint32_t>& v)
|
||||
{
|
||||
typedef std::vector< ::boost::uint32_t> vector32_type;
|
||||
typedef std::vector< ::boost::uint16_t> vector16_type;
|
||||
typedef std::vector< ::boost::uint8_t> vector8_type;
|
||||
typedef boost::u32_to_u16_iterator<vector32_type::const_iterator, ::boost::uint16_t> u32to16type;
|
||||
typedef boost::u16_to_u32_iterator<vector16_type::const_iterator, ::boost::uint32_t> u16to32type;
|
||||
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
typedef std::reverse_iterator<u32to16type> ru32to16type;
|
||||
typedef std::reverse_iterator<u16to32type> ru16to32type;
|
||||
#endif
|
||||
typedef boost::u32_to_u8_iterator<vector32_type::const_iterator, ::boost::uint8_t> u32to8type;
|
||||
typedef boost::u8_to_u32_iterator<vector8_type::const_iterator, ::boost::uint32_t> u8to32type;
|
||||
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
typedef std::reverse_iterator<u32to8type> ru32to8type;
|
||||
typedef std::reverse_iterator<u8to32type> ru8to32type;
|
||||
#endif
|
||||
vector8_type v8;
|
||||
vector16_type v16;
|
||||
vector32_type v32;
|
||||
vector32_type::const_iterator i, j, k;
|
||||
//
|
||||
// begin by testing forward iteration, of 32-16 bit interconversions:
|
||||
//
|
||||
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
v16.assign(u32to16type(v.begin()), u32to16type(v.end()));
|
||||
#else
|
||||
v16.clear();
|
||||
std::copy(u32to16type(v.begin()), u32to16type(v.end()), std::back_inserter(v16));
|
||||
#endif
|
||||
BOOST_CHECK_EQUAL(std::distance(u32to16type(v.begin()), u32to16type(v.end())), v16.size());
|
||||
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
v32.assign(u16to32type(v16.begin()), u16to32type(v16.end()));
|
||||
#else
|
||||
v32.clear();
|
||||
std::copy(u16to32type(v16.begin()), u16to32type(v16.end()), std::back_inserter(v32));
|
||||
#endif
|
||||
BOOST_CHECK_EQUAL(std::distance(u16to32type(v16.begin()), u16to32type(v16.end())), v32.size());
|
||||
BOOST_CHECK_EQUAL(v.size(), v32.size());
|
||||
i = v.begin();
|
||||
@ -68,6 +113,7 @@ void test(const std::vector< ::boost::uint32_t>& v)
|
||||
//
|
||||
// test backward iteration, of 32-16 bit interconversions:
|
||||
//
|
||||
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
v16.assign(ru32to16type(u32to16type(v.end())), ru32to16type(u32to16type(v.begin())));
|
||||
BOOST_CHECK_EQUAL(std::distance(ru32to16type(u32to16type(v.end())), ru32to16type(u32to16type(v.begin()))), v16.size());
|
||||
std::reverse(v16.begin(), v16.end());
|
||||
@ -83,6 +129,53 @@ void test(const std::vector< ::boost::uint32_t>& v)
|
||||
i,
|
||||
j,
|
||||
k);
|
||||
#endif
|
||||
//
|
||||
// Test forward iteration, of 32-8 bit interconversions:
|
||||
//
|
||||
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
v8.assign(u32to8type(v.begin()), u32to8type(v.end()));
|
||||
#else
|
||||
v8.clear();
|
||||
std::copy(u32to8type(v.begin()), u32to8type(v.end()), std::back_inserter(v8));
|
||||
#endif
|
||||
BOOST_CHECK_EQUAL(std::distance(u32to8type(v.begin()), u32to8type(v.end())), v8.size());
|
||||
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
v32.assign(u8to32type(v8.begin()), u8to32type(v8.end()));
|
||||
#else
|
||||
v32.clear();
|
||||
std::copy(u8to32type(v8.begin()), u8to32type(v8.end()), std::back_inserter(v32));
|
||||
#endif
|
||||
BOOST_CHECK_EQUAL(std::distance(u8to32type(v8.begin()), u8to32type(v8.end())), v32.size());
|
||||
BOOST_CHECK_EQUAL(v.size(), v32.size());
|
||||
i = v.begin();
|
||||
j = i;
|
||||
std::advance(j, (std::min)(v.size(), v32.size()));
|
||||
k = v32.begin();
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(
|
||||
i,
|
||||
j,
|
||||
k);
|
||||
//
|
||||
// test backward iteration, of 32-8 bit interconversions:
|
||||
//
|
||||
#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
||||
v8.assign(ru32to8type(u32to8type(v.end())), ru32to8type(u32to8type(v.begin())));
|
||||
BOOST_CHECK_EQUAL(std::distance(ru32to8type(u32to8type(v.end())), ru32to8type(u32to8type(v.begin()))), v8.size());
|
||||
std::reverse(v8.begin(), v8.end());
|
||||
v32.assign(ru8to32type(u8to32type(v8.end())), ru8to32type(u8to32type(v8.begin())));
|
||||
BOOST_CHECK_EQUAL(std::distance(ru8to32type(u8to32type(v8.end())), ru8to32type(u8to32type(v8.begin()))), v32.size());
|
||||
BOOST_CHECK_EQUAL(v.size(), v32.size());
|
||||
std::reverse(v32.begin(), v32.end());
|
||||
i = v.begin();
|
||||
j = i;
|
||||
std::advance(j, (std::min)(v.size(), v32.size()));
|
||||
k = v32.begin();
|
||||
BOOST_CHECK_EQUAL_COLLECTIONS(
|
||||
i,
|
||||
j,
|
||||
k);
|
||||
#endif
|
||||
}
|
||||
|
||||
int test_main( int, char* [] )
|
||||
@ -98,6 +191,12 @@ int test_main( int, char* [] )
|
||||
v.push_back(0xFFFF);
|
||||
v.push_back(0x10000);
|
||||
v.push_back(0x10FFFF);
|
||||
v.push_back(0x80u);
|
||||
v.push_back(0x80u - 1);
|
||||
v.push_back(0x800u);
|
||||
v.push_back(0x800u - 1);
|
||||
v.push_back(0x10000u);
|
||||
v.push_back(0x10000u - 1);
|
||||
test(v);
|
||||
return 0;
|
||||
}
|
||||
|
Reference in New Issue
Block a user