From d35e5a088ea8bc427fd4cbaa447b4a256ebc5add Mon Sep 17 00:00:00 2001 From: John Maddock Date: Thu, 11 Nov 2004 17:04:17 +0000 Subject: [PATCH] Added support for Perl style \N \P and \p. Completed first draft of Unicode UCS-4 support. Broken compiler compatibility fixes. Added unicode_iterators. [SVN r26185] --- build/Jamfile | 1 + include/boost/regex/config.hpp | 2 +- include/boost/regex/icu.hpp | 6 + .../boost/regex/pending/unicode_iterator.hpp | 292 +++++++++++++++- include/boost/regex/v4/basic_regex_parser.hpp | 82 +++++ include/boost/regex/v4/regex_format.hpp | 19 +- include/boost/regex/v4/syntax_type.hpp | 6 +- src/icu.cpp | 318 +++++++++++++++++- src/regex_traits_defaults.cpp | 12 +- test/Jamfile | 12 + test/regress/main.cpp | 1 + test/regress/test.hpp | 1 + test/regress/test_icu.cpp | 30 +- test/regress/test_replace.cpp | 7 + test/regress/test_sets.cpp | 62 ++++ test/regress/test_unicode.cpp | 147 ++++++++ test/unicode/unicode_iterator_test.cpp | 99 ++++++ 17 files changed, 1051 insertions(+), 46 deletions(-) create mode 100644 test/regress/test_unicode.cpp diff --git a/build/Jamfile b/build/Jamfile index 185ad381..bfce7ae7 100644 --- a/build/Jamfile +++ b/build/Jamfile @@ -42,6 +42,7 @@ SOURCES = cpp_regex_traits.cpp cregex.cpp fileiter.cpp + icu.cpp instances.cpp posix_api.cpp regex.cpp diff --git a/include/boost/regex/config.hpp b/include/boost/regex/config.hpp index 05ef21e6..631c4e9d 100644 --- a/include/boost/regex/config.hpp +++ b/include/boost/regex/config.hpp @@ -179,7 +179,7 @@ # define BOOST_REGEX_DECL #endif -#if (defined(BOOST_MSVC) || defined(__BORLANDC__)) && !defined(BOOST_REGEX_NO_LIB) && !defined(BOOST_REGEX_SOURCE) && !defined(BOOST_ALL_NO_LIB) && defined(__cplusplus) +#if !defined(BOOST_REGEX_NO_LIB) && !defined(BOOST_REGEX_SOURCE) && !defined(BOOST_ALL_NO_LIB) && defined(__cplusplus) # define BOOST_LIB_NAME boost_regex # if defined(BOOST_REGEX_DYN_LINK) || defined(BOOST_ALL_DYN_LINK) # define BOOST_DYN_LINK diff --git a/include/boost/regex/icu.hpp b/include/boost/regex/icu.hpp index f674f9e9..15477fd5 100644 --- a/include/boost/regex/icu.hpp +++ b/include/boost/regex/icu.hpp @@ -177,6 +177,8 @@ private: offset_xdigit = U_CHAR_CATEGORY_COUNT+2, offset_underscore = U_CHAR_CATEGORY_COUNT+3, offset_unicode = U_CHAR_CATEGORY_COUNT+4, + offset_any = U_CHAR_CATEGORY_COUNT+5, + offset_ascii = U_CHAR_CATEGORY_COUNT+6, }; // @@ -187,6 +189,10 @@ private: static const char_class_type mask_xdigit; static const char_class_type mask_underscore; static const char_class_type mask_unicode; + static const char_class_type mask_any; + static const char_class_type mask_ascii; + + static char_class_type lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2); boost::shared_ptr< ::boost::re_detail::icu_regex_traits_implementation> m_pimpl; }; diff --git a/include/boost/regex/pending/unicode_iterator.hpp b/include/boost/regex/pending/unicode_iterator.hpp index a0167c8b..986860b6 100644 --- a/include/boost/regex/pending/unicode_iterator.hpp +++ b/include/boost/regex/pending/unicode_iterator.hpp @@ -49,35 +49,63 @@ inline bool is_surrogate(T v) return (v & 0xF800u) == 0xd800; } +inline unsigned utf8_byte_count(boost::uint8_t c) +{ + // if the most significant bit with a zero in it is in position + // 8-N then there are N bytes in this UTF-8 sequence: + boost::uint8_t mask = 0x80u; + unsigned result = 0; + while(c & mask) + { + ++result; + mask >>= 1; + } + return (result == 0) ? 1 : result; +} + +inline unsigned utf8_trailing_byte_count(boost::uint8_t c) +{ + return utf8_byte_count(c) - 1; +} + } template class u32_to_u16_iterator : public boost::iterator_facade, U16Type, std::bidirectional_iterator_tag, const U16Type> { - typedef boost::iterator_facade, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type; + typedef boost::iterator_facade, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type; + +#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) typedef typename std::iterator_traits::value_type base_value_type; BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32); BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16); +#endif + public: typename base_type::reference dereference()const { if(m_current == 2) - const_cast(this)->extract_current(); + extract_current(); return m_values[m_current]; } bool equal(const u32_to_u16_iterator& that)const { if(m_position == that.m_position) { + // Both m_currents must be equal, or both even + // this is the same as saying their sum must be even: + return (m_current + that.m_current) & 1u ? false : true; + /* if((m_current >= 2) && (that.m_current < 2)) const_cast(this)->extract_current(); else if((m_current < 2) && (that.m_current >= 2)) const_cast(that).extract_current(); if(m_current == that.m_current) return true; + */ } return false; } @@ -127,7 +155,7 @@ public: m_values[2] = 0; } private: - void invalid_code_point(::boost::uint32_t val) + static void invalid_code_point(::boost::uint32_t val) { #ifndef BOOST_NO_STD_LOCALE std::stringstream ss; @@ -139,15 +167,15 @@ private: boost::throw_exception(e); } - void extract_current() + void extract_current()const { // begin by checking for a code point out of range: - if(static_cast< ::boost::uint32_t>(*m_position) >= 0x10000u) + ::boost::uint32_t v = *m_position; + if(v >= 0x10000u) { - if(static_cast< ::boost::uint32_t>(*m_position) > 0x10FFFFu) + if(v > 0x10FFFFu) invalid_code_point(*m_position); // split into two surrogates: - base_value_type v = *m_position; m_values[0] = static_cast(v >> 10) + detail::high_surrogate_base; m_values[1] = static_cast(v & detail::ten_bit_mask) + detail::low_surrogate_base; m_current = 0; @@ -166,29 +194,31 @@ private: } } BaseIterator m_position; - U16Type m_values[3]; - unsigned m_current; + mutable U16Type m_values[3]; + mutable unsigned m_current; }; template class u16_to_u32_iterator : public boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> { - typedef boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type; + typedef boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type; // special values for pending iterator reads: BOOST_STATIC_CONSTANT(::boost::uint32_t, pending_read = 0xffffffffu); +#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) typedef typename std::iterator_traits::value_type base_value_type; BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16); BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32); +#endif public: typename base_type::reference dereference()const { if(m_value == pending_read) - const_cast(this)->extract_current(); + extract_current(); return m_value; } bool equal(const u16_to_u32_iterator& that)const @@ -223,7 +253,7 @@ public: m_value = pending_read; } private: - void invalid_code_point(::boost::uint16_t val) + static void invalid_code_point(::boost::uint16_t val) { #ifndef BOOST_NO_STD_LOCALE std::stringstream ss; @@ -234,28 +264,254 @@ private: #endif boost::throw_exception(e); } - void extract_current() + void extract_current()const { m_value = static_cast(static_cast< ::boost::uint16_t>(*m_position)); // if the last value is a high surrogate then adjust m_position and m_value as needed: if(detail::is_high_surrogate(*m_position)) { // precondition; next value must have be a low-surrogate: - ::boost::uint16_t t = *++m_position; - if((*m_position & 0xFC00u) != 0xDC00u) + BaseIterator next(m_position); + ::boost::uint16_t t = *++next; + if((t & 0xFC00u) != 0xDC00u) invalid_code_point(t); m_value = (m_value - detail::high_surrogate_base) << 10; m_value |= (static_cast(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask); - --m_position; } // postcondition; result must not be a surrogate: if(detail::is_surrogate(m_value)) invalid_code_point(static_cast< ::boost::uint16_t>(m_value)); } BaseIterator m_position; - U32Type m_value; + mutable U32Type m_value; }; -} +template +class u32_to_u8_iterator + : public boost::iterator_facade, U8Type, std::bidirectional_iterator_tag, const U8Type> +{ + typedef boost::iterator_facade, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type; + +#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) + typedef typename std::iterator_traits::value_type base_value_type; + + BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32); + BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8); +#endif + +public: + typename base_type::reference + dereference()const + { + if(m_current == 4) + extract_current(); + return m_values[m_current]; + } + bool equal(const u32_to_u8_iterator& that)const + { + if(m_position == that.m_position) + { + // either the m_current's must be equal, or one must be 0 and + // the other 4: which means neither must have bits 1 or 2 set: + return (m_current == that.m_current) + || (((m_current | that.m_current) & 3) == 0); + } + return false; + } + void increment() + { + // if we have a pending read then read now, so that we know whether + // to skip a position, or move to a low-surrogate: + if(m_current == 4) + { + // pending read: + extract_current(); + } + // move to the next surrogate position: + ++m_current; + // if we've reached the end skip a position: + if(m_values[m_current] == 0) + { + m_current = 4; + ++m_position; + } + } + void decrement() + { + if((m_current & 3) == 0) + { + --m_position; + extract_current(); + m_current = 3; + while(m_current && (m_values[m_current] == 0)) + --m_current; + } + else + --m_current; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u32_to_u8_iterator() : m_position(), m_current(0) + { + m_values[4] = 0; + } + u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4) + { + m_values[4] = 0; + } +private: + static void invalid_code_point(::boost::uint32_t val) + { +#ifndef BOOST_NO_STD_LOCALE + std::stringstream ss; + ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-8 sequence"; + std::out_of_range e(ss.str()); +#else + std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-8 sequence"); +#endif + boost::throw_exception(e); + } + + void extract_current()const + { + boost::uint32_t c = *m_position; + if(c > 0x10FFFFu) + invalid_code_point(c); + if(c < 0x80u) + { + m_values[0] = static_cast(c); + m_values[1] = static_cast(0u); + m_values[2] = static_cast(0u); + m_values[3] = static_cast(0u); + } + else if(c < 0x800u) + { + m_values[0] = static_cast(0xC0u + (c >> 6)); + m_values[1] = static_cast(0x80u + (c & 0x3Fu)); + m_values[2] = static_cast(0u); + m_values[3] = static_cast(0u); + } + else if(c < 0x10000u) + { + m_values[0] = static_cast(0xE0u + (c >> 12)); + m_values[1] = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + m_values[2] = static_cast(0x80u + (c & 0x3Fu)); + m_values[3] = static_cast(0u); + } + else + { + m_values[0] = static_cast(0xF0u + (c >> 18)); + m_values[1] = static_cast(0x80u + ((c >> 12) & 0x3Fu)); + m_values[2] = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + m_values[3] = static_cast(0x80u + (c & 0x3Fu)); + } + m_current= 0; + } + BaseIterator m_position; + mutable U8Type m_values[5]; + mutable unsigned m_current; +}; + +template +class u8_to_u32_iterator + : public boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> +{ + typedef boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type; + // special values for pending iterator reads: + BOOST_STATIC_CONSTANT(::boost::uint32_t, pending_read = 0xffffffffu); + +#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) + typedef typename std::iterator_traits::value_type base_value_type; + + BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8); + BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32); +#endif + +public: + typename base_type::reference + dereference()const + { + if(m_value == pending_read) + extract_current(); + return m_value; + } + bool equal(const u8_to_u32_iterator& that)const + { + return m_position == that.m_position; + } + void increment() + { + // skip high surrogate first if there is one: + unsigned c = detail::utf8_byte_count(*m_position); + std::advance(m_position, c); + m_value = pending_read; + } + void decrement() + { + // Keep backtracking until we don't have a trailing character: + unsigned count = 0; + while((*--m_position & 0xC0u) == 0x80u) ++count; + // now check that the sequence was valid: + if(count != detail::utf8_trailing_byte_count(*m_position)) + invalid_sequnce(); + m_value = pending_read; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u8_to_u32_iterator() : m_position() + { + m_value = pending_read; + } + u8_to_u32_iterator(BaseIterator b) : m_position(b) + { + m_value = pending_read; + } +private: + static void invalid_sequnce() + { + std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character"); + boost::throw_exception(e); + } + void extract_current()const + { + m_value = static_cast(static_cast< ::boost::uint8_t>(*m_position)); + // we must not have a continuation character: + if((m_value & 0xC0u) == 0x80u) + invalid_sequnce(); + // see how many extra byts we have: + unsigned extra = detail::utf8_trailing_byte_count(*m_position); + // extract the extra bits, 6 from each extra byte: + BaseIterator next(m_position); + for(unsigned c = 0; c < extra; ++c) + { + ++next; + m_value <<= 6; + m_value += static_cast(*next) & 0x3Fu; + } + // we now need to remove a few of the leftmost bits, but how many depends + // upon how many extra bytes we've extracted: + static const boost::uint32_t masks[] = + { + 0x7Fu, + 0x7FFu, + 0xFFFFu, + 0x1FFFFFu, + }; + m_value &= masks[extra]; + // check the result: + if(m_value > 0x10FFFFu) + invalid_sequnce(); + } + BaseIterator m_position; + mutable U32Type m_value; +}; + +} // namespace boost #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP diff --git a/include/boost/regex/v4/basic_regex_parser.hpp b/include/boost/regex/v4/basic_regex_parser.hpp index 4bca85fd..d92d6d3a 100644 --- a/include/boost/regex/v4/basic_regex_parser.hpp +++ b/include/boost/regex/v4/basic_regex_parser.hpp @@ -611,6 +611,52 @@ bool basic_regex_parser::parse_extended_escape() ++m_position; this->append_state(syntax_element_restart_continue); break; + case regex_constants::escape_type_not_property: + negate = true; + // fall through: + case regex_constants::escape_type_property: + { + ++m_position; + char_class_type m; + if(m_position == m_end) + { + fail(regex_constants::error_escape, m_position - m_base); + return false; + } + // maybe have \p{ddd} + if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace) + { + const charT* base = m_position; + // skip forward until we find enclosing brace: + while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_escape, m_position - m_base); + return false; + } + m = this->m_traits.lookup_classname(++base, m_position++); + } + else + { + m = this->m_traits.lookup_classname(m_position, m_position+1); + ++m_position; + } + if(m != 0) + { + basic_char_set char_set; + if(negate) + char_set.negate(); + char_set.add_class(m); + if(0 == this->append_set(char_set)) + { + fail(regex_constants::error_ctype, m_position - m_base); + return false; + } + return true; + } + fail(regex_constants::error_ctype, m_position - m_base); + } default: this->append_literal(unescape_character()); break; @@ -948,6 +994,7 @@ bool basic_regex_parser::parse_set() if(m != 0) { char_set.add_class(m); + ++m_position; break; } } @@ -1373,6 +1420,41 @@ charT basic_regex_parser::unescape_character() } return static_cast(val); } + case regex_constants::escape_type_named_char: + { + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_escape, m_position - m_base); + return false; + } + // maybe have \N{name} + if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace) + { + const charT* base = m_position; + // skip forward until we find enclosing brace: + while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace)) + ++m_position; + if(m_position == m_end) + { + fail(regex_constants::error_escape, m_position - m_base); + return false; + } + string_type s = this->m_traits.lookup_collatename(++base, m_position++); + if(s.empty()) + { + fail(regex_constants::error_collate, m_position - m_base); + return false; + } + if(s.size() == 1) + { + return s[0]; + } + } + // fall through is a failure: + fail(regex_constants::error_escape, m_position - m_base); + return false; + } default: result = *m_position; break; diff --git a/include/boost/regex/v4/regex_format.hpp b/include/boost/regex/v4/regex_format.hpp index 9d030a07..7c1a6314 100644 --- a/include/boost/regex/v4/regex_format.hpp +++ b/include/boost/regex/v4/regex_format.hpp @@ -75,7 +75,7 @@ class basic_regex_formatter public: typedef typename traits::char_type char_type; basic_regex_formatter(OutputIterator o, const Results& r, const traits& t) - : m_traits(t), m_results(r), m_out(o), m_state(output_copy) {} + : m_traits(t), m_results(r), m_out(o), m_state(output_copy), m_have_conditional(false) {} OutputIterator format(const char_type* p1, const char_type* p2, match_flag_type f); OutputIterator format(const char_type* p1, match_flag_type f) { @@ -108,6 +108,7 @@ private: const char_type* m_end; // format string end match_flag_type m_flags; // format flags to use output_state m_state; // what to do with the next character + bool m_have_conditional; // we are parsing a conditional private: basic_regex_formatter(const basic_regex_formatter&); basic_regex_formatter& operator=(const basic_regex_formatter&); @@ -147,7 +148,10 @@ void basic_regex_formatter::format_all() if(m_flags & boost::regex_constants::format_all) { ++m_position; + bool have_conditional = m_have_conditional; + m_have_conditional = false; format_until_scope_end(); + m_have_conditional = have_conditional; if(m_position == m_end) return; BOOST_ASSERT(*m_position == static_cast(')')); @@ -158,7 +162,6 @@ void basic_regex_formatter::format_all() ++m_position; break; case ')': - case ':': if(m_flags & boost::regex_constants::format_all) { return; @@ -166,6 +169,14 @@ void basic_regex_formatter::format_all() put(*m_position); ++m_position; break; + case ':': + if((m_flags & boost::regex_constants::format_all) && m_have_conditional) + { + return; + } + put(*m_position); + ++m_position; + break; case '?': if(m_flags & boost::regex_constants::format_all) { @@ -405,7 +416,9 @@ void basic_regex_formatter::format_conditional( // output varies depending upon whether sub-expression v matched or not: if(m_results[v].matched) { + m_have_conditional = true; format_all(); + m_have_conditional = false; if((m_position != m_end) && (*m_position == static_cast(':'))) { // skip the ':': @@ -425,7 +438,9 @@ void basic_regex_formatter::format_conditional( output_state saved_state = m_state; m_state = output_none; // format until ':' or ')': + m_have_conditional = true; format_all(); + m_have_conditional = false; // restore state: m_state = saved_state; if((m_position != m_end) && (*m_position == static_cast(':'))) diff --git a/include/boost/regex/v4/syntax_type.hpp b/include/boost/regex/v4/syntax_type.hpp index 6a978a68..b79d1664 100644 --- a/include/boost/regex/v4/syntax_type.hpp +++ b/include/boost/regex/v4/syntax_type.hpp @@ -89,7 +89,11 @@ static const escape_syntax_type escape_type_C = 50; / static const escape_syntax_type escape_type_Z = 51; // for \Z static const escape_syntax_type escape_type_G = 52; // for \G -static const escape_syntax_type syntax_max = 54; +static const escape_syntax_type escape_type_property = 54; // for \p +static const escape_syntax_type escape_type_not_property = 55; // for \P +static const escape_syntax_type escape_type_named_char = 56; // for \N + +static const escape_syntax_type syntax_max = 57; } } diff --git a/src/icu.cpp b/src/icu.cpp index 5c90103b..86335c28 100644 --- a/src/icu.cpp +++ b/src/icu.cpp @@ -15,7 +15,10 @@ * VERSION see * DESCRIPTION: Unicode regular expressions on top of the ICU Library. */ +#define BOOST_REGEX_SOURCE +#include +#ifdef BOOST_HAS_ICU #include namespace boost{ @@ -64,6 +67,264 @@ const icu_regex_traits::char_class_type icu_regex_traits::mask_space = icu_regex const icu_regex_traits::char_class_type icu_regex_traits::mask_xdigit = icu_regex_traits::char_class_type(1) << offset_xdigit; const icu_regex_traits::char_class_type icu_regex_traits::mask_underscore = icu_regex_traits::char_class_type(1) << offset_underscore; const icu_regex_traits::char_class_type icu_regex_traits::mask_unicode = icu_regex_traits::char_class_type(1) << offset_unicode; +const icu_regex_traits::char_class_type icu_regex_traits::mask_any = icu_regex_traits::char_class_type(1) << offset_any; +const icu_regex_traits::char_class_type icu_regex_traits::mask_ascii = icu_regex_traits::char_class_type(1) << offset_ascii; + +icu_regex_traits::char_class_type icu_regex_traits::lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2) +{ + static const ::UChar32 prop_name_table[] = { + /* any */ 'a', 'n', 'y', + /* ascii */ 'a', 's', 'c', 'i', 'i', + /* assigned */ 'a', 's', 's', 'i', 'g', 'n', 'e', 'd', + /* c* */ 'c', '*', + /* cc */ 'c', 'c', + /* cf */ 'c', 'f', + /* closepunctuation */ 'c', 'l', 'o', 's', 'e', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* cn */ 'c', 'n', + /* co */ 'c', 'o', + /* connectorpunctuation */ 'c', 'o', 'n', 'n', 'e', 'c', 't', 'o', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* control */ 'c', 'o', 'n', 't', 'r', 'o', 'l', + /* cs */ 'c', 's', + /* currencysymbol */ 'c', 'u', 'r', 'r', 'e', 'n', 'c', 'y', 's', 'y', 'm', 'b', 'o', 'l', + /* dashpunctuation */ 'd', 'a', 's', 'h', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* decimaldigitnumber */ 'd', 'e', 'c', 'i', 'm', 'a', 'l', 'd', 'i', 'g', 'i', 't', 'n', 'u', 'm', 'b', 'e', 'r', + /* enclosingmark */ 'e', 'n', 'c', 'l', 'o', 's', 'i', 'n', 'g', 'm', 'a', 'r', 'k', + /* finalpunctuation */ 'f', 'i', 'n', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* format */ 'f', 'o', 'r', 'm', 'a', 't', + /* initialpunctuation */ 'i', 'n', 'i', 't', 'i', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* l* */ 'l', '*', + /* letter */ 'l', 'e', 't', 't', 'e', 'r', + /* letternumber */ 'l', 'e', 't', 't', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r', + /* lineseparator */ 'l', 'i', 'n', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r', + /* ll */ 'l', 'l', + /* lm */ 'l', 'm', + /* lo */ 'l', 'o', + /* lowercaseletter */ 'l', 'o', 'w', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r', + /* lt */ 'l', 't', + /* lu */ 'l', 'u', + /* m* */ 'm', '*', + /* mark */ 'm', 'a', 'r', 'k', + /* mathsymbol */ 'm', 'a', 't', 'h', 's', 'y', 'm', 'b', 'o', 'l', + /* mc */ 'm', 'c', + /* me */ 'm', 'e', + /* mn */ 'm', 'n', + /* modifierletter */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r', + /* modifiersymbol */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l', + /* n* */ 'n', '*', + /* nd */ 'n', 'd', + /* nl */ 'n', 'l', + /* no */ 'n', 'o', + /* nonspacingmark */ 'n', 'o', 'n', 's', 'p', 'a', 'c', 'i', 'n', 'g', 'm', 'a', 'r', 'k', + /* notassigned */ 'n', 'o', 't', 'a', 's', 's', 'i', 'g', 'n', 'e', 'd', + /* number */ 'n', 'u', 'm', 'b', 'e', 'r', + /* openpunctuation */ 'o', 'p', 'e', 'n', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* other */ 'o', 't', 'h', 'e', 'r', + /* otherletter */ 'o', 't', 'h', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r', + /* othernumber */ 'o', 't', 'h', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r', + /* otherpunctuation */ 'o', 't', 'h', 'e', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* othersymbol */ 'o', 't', 'h', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l', + /* p* */ 'p', '*', + /* paragraphseparator */ 'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r', + /* pc */ 'p', 'c', + /* pd */ 'p', 'd', + /* pe */ 'p', 'e', + /* pf */ 'p', 'f', + /* pi */ 'p', 'i', + /* po */ 'p', 'o', + /* privateuse */ 'p', 'r', 'i', 'v', 'a', 't', 'e', 'u', 's', 'e', + /* ps */ 'p', 's', + /* punctuation */ 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* s* */ 's', '*', + /* sc */ 's', 'c', + /* separator */ 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r', + /* sk */ 's', 'k', + /* sm */ 's', 'm', + /* so */ 's', 'o', + /* spaceseparator */ 's', 'p', 'a', 'c', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r', + /* spacingcombiningmark */ 's', 'p', 'a', 'c', 'i', 'n', 'g', 'c', 'o', 'm', 'b', 'i', 'n', 'i', 'n', 'g', 'm', 'a', 'r', 'k', + /* surrogate */ 's', 'u', 'r', 'r', 'o', 'g', 'a', 't', 'e', + /* symbol */ 's', 'y', 'm', 'b', 'o', 'l', + /* titlecase */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e', + /* titlecaseletter */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r', + /* uppercaseletter */ 'u', 'p', 'p', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r', + /* z* */ 'z', '*', + /* zl */ 'z', 'l', + /* zp */ 'z', 'p', + /* zs */ 'z', 's', + }; + + static const re_detail::character_pointer_range<::UChar32> range_data[] = { + { prop_name_table+0, prop_name_table+3, }, // any + { prop_name_table+3, prop_name_table+8, }, // ascii + { prop_name_table+8, prop_name_table+16, }, // assigned + { prop_name_table+16, prop_name_table+18, }, // c* + { prop_name_table+18, prop_name_table+20, }, // cc + { prop_name_table+20, prop_name_table+22, }, // cf + { prop_name_table+22, prop_name_table+38, }, // closepunctuation + { prop_name_table+38, prop_name_table+40, }, // cn + { prop_name_table+40, prop_name_table+42, }, // co + { prop_name_table+42, prop_name_table+62, }, // connectorpunctuation + { prop_name_table+62, prop_name_table+69, }, // control + { prop_name_table+69, prop_name_table+71, }, // cs + { prop_name_table+71, prop_name_table+85, }, // currencysymbol + { prop_name_table+85, prop_name_table+100, }, // dashpunctuation + { prop_name_table+100, prop_name_table+118, }, // decimaldigitnumber + { prop_name_table+118, prop_name_table+131, }, // enclosingmark + { prop_name_table+131, prop_name_table+147, }, // finalpunctuation + { prop_name_table+147, prop_name_table+153, }, // format + { prop_name_table+153, prop_name_table+171, }, // initialpunctuation + { prop_name_table+171, prop_name_table+173, }, // l* + { prop_name_table+173, prop_name_table+179, }, // letter + { prop_name_table+179, prop_name_table+191, }, // letternumber + { prop_name_table+191, prop_name_table+204, }, // lineseparator + { prop_name_table+204, prop_name_table+206, }, // ll + { prop_name_table+206, prop_name_table+208, }, // lm + { prop_name_table+208, prop_name_table+210, }, // lo + { prop_name_table+210, prop_name_table+225, }, // lowercaseletter + { prop_name_table+225, prop_name_table+227, }, // lt + { prop_name_table+227, prop_name_table+229, }, // lu + { prop_name_table+229, prop_name_table+231, }, // m* + { prop_name_table+231, prop_name_table+235, }, // mark + { prop_name_table+235, prop_name_table+245, }, // mathsymbol + { prop_name_table+245, prop_name_table+247, }, // mc + { prop_name_table+247, prop_name_table+249, }, // me + { prop_name_table+249, prop_name_table+251, }, // mn + { prop_name_table+251, prop_name_table+265, }, // modifierletter + { prop_name_table+265, prop_name_table+279, }, // modifiersymbol + { prop_name_table+279, prop_name_table+281, }, // n* + { prop_name_table+281, prop_name_table+283, }, // nd + { prop_name_table+283, prop_name_table+285, }, // nl + { prop_name_table+285, prop_name_table+287, }, // no + { prop_name_table+287, prop_name_table+301, }, // nonspacingmark + { prop_name_table+301, prop_name_table+312, }, // notassigned + { prop_name_table+312, prop_name_table+318, }, // number + { prop_name_table+318, prop_name_table+333, }, // openpunctuation + { prop_name_table+333, prop_name_table+338, }, // other + { prop_name_table+338, prop_name_table+349, }, // otherletter + { prop_name_table+349, prop_name_table+360, }, // othernumber + { prop_name_table+360, prop_name_table+376, }, // otherpunctuation + { prop_name_table+376, prop_name_table+387, }, // othersymbol + { prop_name_table+387, prop_name_table+389, }, // p* + { prop_name_table+389, prop_name_table+407, }, // paragraphseparator + { prop_name_table+407, prop_name_table+409, }, // pc + { prop_name_table+409, prop_name_table+411, }, // pd + { prop_name_table+411, prop_name_table+413, }, // pe + { prop_name_table+413, prop_name_table+415, }, // pf + { prop_name_table+415, prop_name_table+417, }, // pi + { prop_name_table+417, prop_name_table+419, }, // po + { prop_name_table+419, prop_name_table+429, }, // privateuse + { prop_name_table+429, prop_name_table+431, }, // ps + { prop_name_table+431, prop_name_table+442, }, // punctuation + { prop_name_table+442, prop_name_table+444, }, // s* + { prop_name_table+444, prop_name_table+446, }, // sc + { prop_name_table+446, prop_name_table+455, }, // separator + { prop_name_table+455, prop_name_table+457, }, // sk + { prop_name_table+457, prop_name_table+459, }, // sm + { prop_name_table+459, prop_name_table+461, }, // so + { prop_name_table+461, prop_name_table+475, }, // spaceseparator + { prop_name_table+475, prop_name_table+495, }, // spacingcombiningmark + { prop_name_table+495, prop_name_table+504, }, // surrogate + { prop_name_table+504, prop_name_table+510, }, // symbol + { prop_name_table+510, prop_name_table+519, }, // titlecase + { prop_name_table+519, prop_name_table+534, }, // titlecaseletter + { prop_name_table+534, prop_name_table+549, }, // uppercaseletter + { prop_name_table+549, prop_name_table+551, }, // z* + { prop_name_table+551, prop_name_table+553, }, // zl + { prop_name_table+553, prop_name_table+555, }, // zp + { prop_name_table+555, prop_name_table+557, }, // zs + }; + + static const icu_regex_traits::char_class_type icu_class_map[] = { + icu_regex_traits::mask_any, // any + icu_regex_traits::mask_ascii, // ascii + (0x3FFFFFFFu) & ~(U_GC_CN_MASK), // assigned + U_GC_C_MASK, // c* + U_GC_CC_MASK, // cc + U_GC_CF_MASK, // cf + U_GC_PE_MASK, // closepunctuation + U_GC_CN_MASK, // cn + U_GC_CO_MASK, // co + U_GC_PC_MASK, // connectorpunctuation + U_GC_CC_MASK, // control + U_GC_CS_MASK, // cs + U_GC_SC_MASK, // currencysymbol + U_GC_PD_MASK, // dashpunctuation + U_GC_ND_MASK, // decimaldigitnumber + U_GC_ME_MASK, // enclosingmark + U_GC_PF_MASK, // finalpunctuation + U_GC_CF_MASK, // format + U_GC_PI_MASK, // initialpunctuation + U_GC_L_MASK, // l* + U_GC_L_MASK, // letter + U_GC_NL_MASK, // letternumber + U_GC_ZL_MASK, // lineseparator + U_GC_LL_MASK, // ll + U_GC_LM_MASK, // lm + U_GC_LO_MASK, // lo + U_GC_LL_MASK, // lowercaseletter + U_GC_LT_MASK, // lt + U_GC_LU_MASK, // lu + U_GC_M_MASK, // m* + U_GC_M_MASK, // mark + U_GC_SM_MASK, // mathsymbol + U_GC_MC_MASK, // mc + U_GC_ME_MASK, // me + U_GC_MN_MASK, // mn + U_GC_LM_MASK, // modifierletter + U_GC_SK_MASK, // modifiersymbol + U_GC_N_MASK, // n* + U_GC_ND_MASK, // nd + U_GC_NL_MASK, // nl + U_GC_NO_MASK, // no + U_GC_MN_MASK, // nonspacingmark + U_GC_CN_MASK, // notassigned + U_GC_N_MASK, // number + U_GC_PS_MASK, // openpunctuation + U_GC_C_MASK, // other + U_GC_LO_MASK, // otherletter + U_GC_NO_MASK, // othernumber + U_GC_PO_MASK, // otherpunctuation + U_GC_SO_MASK, // othersymbol + U_GC_P_MASK, // p* + U_GC_ZP_MASK, // paragraphseparator + U_GC_PC_MASK, // pc + U_GC_PD_MASK, // pd + U_GC_PE_MASK, // pe + U_GC_PF_MASK, // pf + U_GC_PI_MASK, // pi + U_GC_PO_MASK, // po + U_GC_CO_MASK, // privateuse + U_GC_PS_MASK, // ps + U_GC_P_MASK, // punctuation + U_GC_S_MASK, // s* + U_GC_SC_MASK, // sc + U_GC_Z_MASK, // separator + U_GC_SK_MASK, // sk + U_GC_SM_MASK, // sm + U_GC_SO_MASK, // so + U_GC_ZS_MASK, // spaceseparator + U_GC_MC_MASK, // spacingcombiningmark + U_GC_CS_MASK, // surrogate + U_GC_S_MASK, // symbol + U_GC_LT_MASK, // titlecase + U_GC_LT_MASK, // titlecaseletter + U_GC_LU_MASK, // uppercaseletter + U_GC_Z_MASK, // z* + U_GC_ZL_MASK, // zl + U_GC_ZP_MASK, // zp + U_GC_ZS_MASK, // zs + }; + + + static const re_detail::character_pointer_range< ::UChar32>* ranges_begin = range_data; + static const re_detail::character_pointer_range< ::UChar32>* ranges_end = range_data + (sizeof(range_data)/sizeof(range_data[0])); + + re_detail::character_pointer_range< ::UChar32> t = { p1, p2, }; + const re_detail::character_pointer_range< ::UChar32>* p = std::lower_bound(ranges_begin, ranges_end, t); + if((p != ranges_end) && (t == *p)) + return icu_class_map[p - ranges_begin]; + return 0; +} icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_type* p1, const char_type* p2) const { @@ -92,12 +353,33 @@ icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_ }; int id = ::boost::re_detail::get_default_class_id(p1, p2); + if(id >= 0) + return masks[id+1]; + char_class_type result = lookup_icu_mask(p1, p2); + if(result != 0) + return result; + if(id < 0) { string_type s(p1, p2); - for(string_type::size_type i = 0; i < s.size(); ++i) + string_type::size_type i = 0; + while(i < s.size()) + { s[i] = static_cast((::u_tolower)(s[i])); + if(::u_isspace(s[i]) || (s[i] == '-') || (s[i] == '_')) + s.erase(s.begin()+i, s.begin()+i+1); + else + { + s[i] = static_cast((::u_tolower)(s[i])); + ++i; + } + } id = ::boost::re_detail::get_default_class_id(&*s.begin(), &*s.begin() + s.size()); + if(id >= 0) + return masks[id+1]; + result = lookup_icu_mask(&*s.begin(), &*s.begin() + s.size()); + if(result != 0) + return result; } BOOST_ASSERT(id+1 < sizeof(masks) / sizeof(masks[0])); return masks[id+1]; @@ -109,6 +391,23 @@ icu_regex_traits::string_type icu_regex_traits::lookup_collatename(const char_ty if(std::find_if(p1, p2, std::bind2nd(std::greater< ::UChar32>(), 0x7f)) == p2) { std::string s(p1, p2); + // Try Unicode name: + UErrorCode err = U_ZERO_ERROR; + UChar32 c = ::u_charFromName(U_UNICODE_CHAR_NAME, s.c_str(), &err); + if(U_SUCCESS(err)) + { + result.push_back(c); + return result; + } + // Try Unicode-extended name: + err = U_ZERO_ERROR; + c = ::u_charFromName(U_EXTENDED_CHAR_NAME, s.c_str(), &err); + if(U_SUCCESS(err)) + { + result.push_back(c); + return result; + } + // try POSIX name: s = ::boost::re_detail::lookup_default_collate_name(s); result.assign(s.begin(), s.end()); } @@ -121,21 +420,26 @@ bool icu_regex_traits::isctype(char_type c, char_class_type f) const { // check for standard catagories first: char_class_type m = char_class_type(1u << u_charType(c)); - if((m & f).any()) + if((m & f) != 0) return true; // now check for special cases: - if((f & mask_blank).any() && u_isblank(c)) + if(((f & mask_blank) != 0) && u_isblank(c)) return true; - if((f & mask_space).any() && u_isspace(c)) + if(((f & mask_space) != 0) && u_isspace(c)) return true; - if((f & mask_xdigit).any() && (u_digit(c, 16) >= 0)) + if(((f & mask_xdigit) != 0) && (u_digit(c, 16) >= 0)) return true; - if((f & mask_unicode).any() && (c >= 0x100)) + if(((f & mask_unicode) != 0) && (c >= 0x100)) return true; - if((f & mask_underscore).any() && (c == '_')) + if(((f & mask_underscore) != 0) && (c == '_')) + return true; + if(((f & mask_any) != 0) && (c <= 0x10FFFF)) + return true; + if(((f & mask_ascii) != 0) && (c <= 0x7F)) return true; return false; } } +#endif // BOOST_HAS_ICU diff --git a/src/regex_traits_defaults.cpp b/src/regex_traits_defaults.cpp index 402c8820..e48d25ce 100644 --- a/src/regex_traits_defaults.cpp +++ b/src/regex_traits_defaults.cpp @@ -96,7 +96,11 @@ BOOST_REGEX_DECL const char* BOOST_REGEX_CALL get_default_syntax(regex_constants "C", "Z", "G", - "!", }; + "!", + "p", + "P", + "N", + }; return ((n >= (sizeof(messages) / sizeof(messages[1]))) ? "" : messages[n]); } @@ -374,9 +378,9 @@ BOOST_REGEX_DECL regex_constants::escape_syntax_type BOOST_REGEX_CALL get_defaul regex_constants::escape_type_not_class, /*K*/ regex_constants::escape_type_not_class, /*L*/ regex_constants::escape_type_not_class, /*M*/ - regex_constants::escape_type_not_class, /*N*/ + regex_constants::escape_type_named_char, /*N*/ regex_constants::escape_type_not_class, /*O*/ - regex_constants::escape_type_not_class, /*P*/ + regex_constants::escape_type_not_property, /*P*/ regex_constants::escape_type_Q, /*Q*/ regex_constants::escape_type_not_class, /*R*/ regex_constants::escape_type_not_class, /*S*/ @@ -408,7 +412,7 @@ BOOST_REGEX_DECL regex_constants::escape_syntax_type BOOST_REGEX_CALL get_defaul regex_constants::escape_type_class, /*m*/ regex_constants::escape_type_control_n, /*n*/ regex_constants::escape_type_class, /*o*/ - regex_constants::escape_type_class, /*p*/ + regex_constants::escape_type_property, /*p*/ regex_constants::escape_type_class, /*q*/ regex_constants::escape_type_control_r, /*r*/ regex_constants::escape_type_class, /*s*/ diff --git a/test/Jamfile b/test/Jamfile index 4d3b3830..6e2432e9 100644 --- a/test/Jamfile +++ b/test/Jamfile @@ -26,6 +26,7 @@ test_sets.cpp test_simple_repeats.cpp test_tricky_cases.cpp test_icu.cpp +test_unicode.cpp test_overloads.cpp test_operators.cpp ; @@ -120,6 +121,17 @@ test-suite regex ../../test/build/boost_test_exec_monitor ] + [ run unicode/unicode_iterator_test.cpp ] + [ regex-test static_mutex_test + :