diff --git a/include/boost/regex/v4/icu.hpp b/include/boost/regex/v4/icu.hpp new file mode 100644 index 00000000..bd646a8a --- /dev/null +++ b/include/boost/regex/v4/icu.hpp @@ -0,0 +1,1069 @@ +/* + * + * Copyright (c) 2004 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE icu.hpp + * VERSION see + * DESCRIPTION: Unicode regular expressions on top of the ICU Library. + */ + +#ifndef BOOST_REGEX_ICU_V4_HPP +#define BOOST_REGEX_ICU_V4_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef BOOST_MSVC +#pragma warning (push) +#pragma warning (disable: 4251) +#endif + +namespace boost { + + namespace BOOST_REGEX_DETAIL_NS { + + // + // Implementation details: + // + class BOOST_REGEX_DECL icu_regex_traits_implementation + { + typedef UChar32 char_type; + typedef std::size_t size_type; + typedef std::vector string_type; + typedef U_NAMESPACE_QUALIFIER Locale locale_type; + typedef boost::uint_least32_t char_class_type; + public: + icu_regex_traits_implementation(const U_NAMESPACE_QUALIFIER Locale& l) + : m_locale(l) + { + UErrorCode success = U_ZERO_ERROR; + m_collator.reset(U_NAMESPACE_QUALIFIER Collator::createInstance(l, success)); + if (U_SUCCESS(success) == 0) + init_error(); + m_collator->setStrength(U_NAMESPACE_QUALIFIER Collator::IDENTICAL); + success = U_ZERO_ERROR; + m_primary_collator.reset(U_NAMESPACE_QUALIFIER Collator::createInstance(l, success)); + if (U_SUCCESS(success) == 0) + init_error(); + m_primary_collator->setStrength(U_NAMESPACE_QUALIFIER Collator::PRIMARY); + } + U_NAMESPACE_QUALIFIER Locale getloc()const + { + return m_locale; + } + string_type do_transform(const char_type* p1, const char_type* p2, const U_NAMESPACE_QUALIFIER Collator* pcoll) const; + string_type transform(const char_type* p1, const char_type* p2) const + { + return do_transform(p1, p2, m_collator.get()); + } + string_type transform_primary(const char_type* p1, const char_type* p2) const + { + return do_transform(p1, p2, m_primary_collator.get()); + } + private: + void init_error() + { + std::runtime_error e("Could not initialize ICU resources"); + boost::throw_exception(e); + } + U_NAMESPACE_QUALIFIER Locale m_locale; // The ICU locale that we're using + boost::scoped_ptr< U_NAMESPACE_QUALIFIER Collator> m_collator; // The full collation object + boost::scoped_ptr< U_NAMESPACE_QUALIFIER Collator> m_primary_collator; // The primary collation object + }; + + inline boost::shared_ptr get_icu_regex_traits_implementation(const U_NAMESPACE_QUALIFIER Locale& loc) + { + return boost::shared_ptr(new icu_regex_traits_implementation(loc)); + } + + } + + class BOOST_REGEX_DECL icu_regex_traits + { + public: + typedef UChar32 char_type; + typedef std::size_t size_type; + typedef std::vector string_type; + typedef U_NAMESPACE_QUALIFIER Locale locale_type; +#ifdef BOOST_NO_INT64_T + typedef std::bitset<64> char_class_type; +#else + typedef boost::uint64_t char_class_type; +#endif + + struct boost_extensions_tag {}; + + icu_regex_traits() + : m_pimpl(BOOST_REGEX_DETAIL_NS::get_icu_regex_traits_implementation(U_NAMESPACE_QUALIFIER Locale())) + { + } + static size_type length(const char_type* p); + + ::boost::regex_constants::syntax_type syntax_type(char_type c)const + { + return ((c < 0x7f) && (c > 0)) ? BOOST_REGEX_DETAIL_NS::get_default_syntax_type(static_cast(c)) : regex_constants::syntax_char; + } + ::boost::regex_constants::escape_syntax_type escape_syntax_type(char_type c) const + { + return ((c < 0x7f) && (c > 0)) ? BOOST_REGEX_DETAIL_NS::get_default_escape_syntax_type(static_cast(c)) : regex_constants::syntax_char; + } + char_type translate(char_type c) const + { + return c; + } + char_type translate_nocase(char_type c) const + { + return ::u_tolower(c); + } + char_type translate(char_type c, bool icase) const + { + return icase ? translate_nocase(c) : translate(c); + } + char_type tolower(char_type c) const + { + return ::u_tolower(c); + } + char_type toupper(char_type c) const + { + return ::u_toupper(c); + } + string_type transform(const char_type* p1, const char_type* p2) const + { + return m_pimpl->transform(p1, p2); + } + string_type transform_primary(const char_type* p1, const char_type* p2) const + { + return m_pimpl->transform_primary(p1, p2); + } + char_class_type lookup_classname(const char_type* p1, const char_type* p2) const; + string_type lookup_collatename(const char_type* p1, const char_type* p2) const; + bool isctype(char_type c, char_class_type f) const; + boost::intmax_t toi(const char_type*& p1, const char_type* p2, int radix)const + { + return BOOST_REGEX_DETAIL_NS::global_toi(p1, p2, radix, *this); + } + int value(char_type c, int radix)const + { + return u_digit(c, static_cast<::int8_t>(radix)); + } + locale_type imbue(locale_type l) + { + locale_type result(m_pimpl->getloc()); + m_pimpl = BOOST_REGEX_DETAIL_NS::get_icu_regex_traits_implementation(l); + return result; + } + locale_type getloc()const + { + return locale_type(); + } + std::string error_string(::boost::regex_constants::error_type n) const + { + return BOOST_REGEX_DETAIL_NS::get_default_error_string(n); + } + private: + icu_regex_traits(const icu_regex_traits&); + icu_regex_traits& operator=(const icu_regex_traits&); + + // + // define the bitmasks offsets we need for additional character properties: + // + enum { + offset_blank = U_CHAR_CATEGORY_COUNT, + offset_space = U_CHAR_CATEGORY_COUNT + 1, + offset_xdigit = U_CHAR_CATEGORY_COUNT + 2, + offset_underscore = U_CHAR_CATEGORY_COUNT + 3, + offset_unicode = U_CHAR_CATEGORY_COUNT + 4, + offset_any = U_CHAR_CATEGORY_COUNT + 5, + offset_ascii = U_CHAR_CATEGORY_COUNT + 6, + offset_horizontal = U_CHAR_CATEGORY_COUNT + 7, + offset_vertical = U_CHAR_CATEGORY_COUNT + 8 + }; + + // + // and now the masks: + // + static const char_class_type mask_blank; + static const char_class_type mask_space; + static const char_class_type mask_xdigit; + static const char_class_type mask_underscore; + static const char_class_type mask_unicode; + static const char_class_type mask_any; + static const char_class_type mask_ascii; + static const char_class_type mask_horizontal; + static const char_class_type mask_vertical; + + static char_class_type lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2); + + boost::shared_ptr< ::boost::BOOST_REGEX_DETAIL_NS::icu_regex_traits_implementation> m_pimpl; + }; + +} // namespace boost + +// +// template instances: +// +#define BOOST_REGEX_CHAR_T UChar32 +#undef BOOST_REGEX_TRAITS_T +#define BOOST_REGEX_TRAITS_T , icu_regex_traits +#define BOOST_REGEX_ICU_INSTANCES +#ifdef BOOST_REGEX_ICU_INSTANTIATE +# define BOOST_REGEX_INSTANTIATE +#endif +#include +#undef BOOST_REGEX_CHAR_T +#undef BOOST_REGEX_TRAITS_T +#undef BOOST_REGEX_ICU_INSTANCES +#ifdef BOOST_REGEX_INSTANTIATE +# undef BOOST_REGEX_INSTANTIATE +#endif + +namespace boost { + + // types: + typedef basic_regex< ::UChar32, icu_regex_traits> u32regex; + typedef match_results u32match; + typedef match_results u16match; + + // + // Construction of 32-bit regex types from UTF-8 and UTF-16 primitives: + // + namespace BOOST_REGEX_DETAIL_NS { + +#if !defined(BOOST_NO_MEMBER_TEMPLATES) && !defined(__IBMCPP__) + template + inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const boost::mpl::int_<1>*) + { + typedef boost::u8_to_u32_iterator conv_type; + return u32regex(conv_type(i, i, j), conv_type(j, i, j), opt); + } + + template + inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const boost::mpl::int_<2>*) + { + typedef boost::u16_to_u32_iterator conv_type; + return u32regex(conv_type(i, i, j), conv_type(j, i, j), opt); + } + + template + inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const boost::mpl::int_<4>*) + { + return u32regex(i, j, opt); + } +#else + template + inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const boost::mpl::int_<1>*) + { + typedef boost::u8_to_u32_iterator conv_type; + typedef std::vector vector_type; + vector_type v; + conv_type a(i, i, j), b(j, i, j); + while (a != b) + { + v.push_back(*a); + ++a; + } + if (v.size()) + return u32regex(&*v.begin(), v.size(), opt); + return u32regex(static_cast(0), static_cast(0), opt); + } + + template + inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const boost::mpl::int_<2>*) + { + typedef boost::u16_to_u32_iterator conv_type; + typedef std::vector vector_type; + vector_type v; + conv_type a(i, i, j), b(j, i, j); + while (a != b) + { + v.push_back(*a); + ++a; + } + if (v.size()) + return u32regex(&*v.begin(), v.size(), opt); + return u32regex(static_cast(0), static_cast(0), opt); + } + + template + inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const boost::mpl::int_<4>*) + { + typedef std::vector vector_type; + vector_type v; + while (i != j) + { + v.push_back((UChar32)(*i)); + ++i; + } + if (v.size()) + return u32regex(&*v.begin(), v.size(), opt); + return u32regex(static_cast(0), static_cast(0), opt); + } +#endif + } + + // BOOST_REGEX_UCHAR_IS_WCHAR_T + // + // Source inspection of unicode/umachine.h in ICU version 59 indicates that: + // + // On version 59, UChar is always char16_t in C++ mode (and uint16_t in C mode) + // + // On earlier versions, the logic is + // + // #if U_SIZEOF_WCHAR_T==2 + // typedef wchar_t OldUChar; + // #elif defined(__CHAR16_TYPE__) + // typedef __CHAR16_TYPE__ OldUChar; + // #else + // typedef uint16_t OldUChar; + // #endif + // + // That is, UChar is wchar_t only on versions below 59, when U_SIZEOF_WCHAR_T==2 + // + // Hence, + +#define BOOST_REGEX_UCHAR_IS_WCHAR_T (U_ICU_VERSION_MAJOR_NUM < 59 && U_SIZEOF_WCHAR_T == 2) + +#if BOOST_REGEX_UCHAR_IS_WCHAR_T + BOOST_STATIC_ASSERT((boost::is_same::value)); +#else + BOOST_STATIC_ASSERT(!(boost::is_same::value)); +#endif + + // + // Construction from an iterator pair: + // + template + inline u32regex make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt) + { + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(i, j, opt, static_cast const*>(0)); + } + // + // construction from UTF-8 nul-terminated strings: + // + inline u32regex make_u32regex(const char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) + { + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(p, p + std::strlen(p), opt, static_cast const*>(0)); + } + inline u32regex make_u32regex(const unsigned char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) + { + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(p, p + std::strlen(reinterpret_cast(p)), opt, static_cast const*>(0)); + } + // + // construction from UTF-16 nul-terminated strings: + // +#ifndef BOOST_NO_WREGEX + inline u32regex make_u32regex(const wchar_t* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) + { + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(p, p + std::wcslen(p), opt, static_cast const*>(0)); + } +#endif +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T + inline u32regex make_u32regex(const UChar* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) + { + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(p, p + u_strlen(p), opt, static_cast const*>(0)); + } +#endif + // + // construction from basic_string class-template: + // + template + inline u32regex make_u32regex(const std::basic_string& s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) + { + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(s.begin(), s.end(), opt, static_cast const*>(0)); + } + // + // Construction from ICU string type: + // + inline u32regex make_u32regex(const U_NAMESPACE_QUALIFIER UnicodeString& s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) + { + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(s.getBuffer(), s.getBuffer() + s.length(), opt, static_cast const*>(0)); + } + + // + // regex_match overloads that widen the character type as appropriate: + // + namespace BOOST_REGEX_DETAIL_NS { + template + void copy_results(MR1& out, MR2 const& in, NSubs named_subs) + { + // copy results from an adapted MR2 match_results: + out.set_size(in.size(), in.prefix().first.base(), in.suffix().second.base()); + out.set_base(in.base().base()); + out.set_named_subs(named_subs); + for (int i = 0; i < (int)in.size(); ++i) + { + if (in[i].matched || !i) + { + out.set_first(in[i].first.base(), i); + out.set_second(in[i].second.base(), i, in[i].matched); + } + } +#ifdef BOOST_REGEX_MATCH_EXTRA + // Copy full capture info as well: + for (int i = 0; i < (int)in.size(); ++i) + { + if (in[i].captures().size()) + { + out[i].get_captures().assign(in[i].captures().size(), typename MR1::value_type()); + for (int j = 0; j < (int)out[i].captures().size(); ++j) + { + out[i].get_captures()[j].first = in[i].captures()[j].first.base(); + out[i].get_captures()[j].second = in[i].captures()[j].second.base(); + out[i].get_captures()[j].matched = in[i].captures()[j].matched; + } + } + } +#endif + } + + template + inline bool do_regex_match(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + boost::mpl::int_<4> const*) + { + return ::boost::regex_match(first, last, m, e, flags); + } + template + bool do_regex_match(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + boost::mpl::int_<2> const*) + { + typedef u16_to_u32_iterator conv_type; + typedef match_results match_type; + //typedef typename match_type::allocator_type alloc_type; + match_type what; + bool result = ::boost::regex_match(conv_type(first, first, last), conv_type(last, first, last), what, e, flags); + // copy results across to m: + if (result) copy_results(m, what, e.get_named_subs()); + return result; + } + template + bool do_regex_match(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + boost::mpl::int_<1> const*) + { + typedef u8_to_u32_iterator conv_type; + typedef match_results match_type; + //typedef typename match_type::allocator_type alloc_type; + match_type what; + bool result = ::boost::regex_match(conv_type(first, first, last), conv_type(last, first, last), what, e, flags); + // copy results across to m: + if (result) copy_results(m, what, e.get_named_subs()); + return result; + } + } // namespace BOOST_REGEX_DETAIL_NS + + template + inline bool u32regex_match(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_match(first, last, m, e, flags, static_cast const*>(0)); + } + inline bool u32regex_match(const UChar* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p + u_strlen(p), m, e, flags, static_cast const*>(0)); + } +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T && !defined(BOOST_NO_WREGEX) + inline bool u32regex_match(const wchar_t* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p + std::wcslen(p), m, e, flags, static_cast const*>(0)); + } +#endif + inline bool u32regex_match(const char* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p + std::strlen(p), m, e, flags, static_cast const*>(0)); + } + inline bool u32regex_match(const unsigned char* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p + std::strlen((const char*)p), m, e, flags, static_cast const*>(0)); + } + inline bool u32regex_match(const std::string& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.begin(), s.end(), m, e, flags, static_cast const*>(0)); + } +#ifndef BOOST_NO_STD_WSTRING + inline bool u32regex_match(const std::wstring& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.begin(), s.end(), m, e, flags, static_cast const*>(0)); + } +#endif + inline bool u32regex_match(const U_NAMESPACE_QUALIFIER UnicodeString& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.getBuffer(), s.getBuffer() + s.length(), m, e, flags, static_cast const*>(0)); + } + // + // regex_match overloads that do not return what matched: + // + template + inline bool u32regex_match(BidiIterator first, BidiIterator last, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(first, last, m, e, flags, static_cast const*>(0)); + } + inline bool u32regex_match(const UChar* p, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p + u_strlen(p), m, e, flags, static_cast const*>(0)); + } +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T && !defined(BOOST_NO_WREGEX) + inline bool u32regex_match(const wchar_t* p, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p + std::wcslen(p), m, e, flags, static_cast const*>(0)); + } +#endif + inline bool u32regex_match(const char* p, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p + std::strlen(p), m, e, flags, static_cast const*>(0)); + } + inline bool u32regex_match(const unsigned char* p, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p + std::strlen((const char*)p), m, e, flags, static_cast const*>(0)); + } + inline bool u32regex_match(const std::string& s, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.begin(), s.end(), m, e, flags, static_cast const*>(0)); + } +#ifndef BOOST_NO_STD_WSTRING + inline bool u32regex_match(const std::wstring& s, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.begin(), s.end(), m, e, flags, static_cast const*>(0)); + } +#endif + inline bool u32regex_match(const U_NAMESPACE_QUALIFIER UnicodeString& s, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.getBuffer(), s.getBuffer() + s.length(), m, e, flags, static_cast const*>(0)); + } + + // + // regex_search overloads that widen the character type as appropriate: + // + namespace BOOST_REGEX_DETAIL_NS { + template + inline bool do_regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + BidiIterator base, + boost::mpl::int_<4> const*) + { + return ::boost::regex_search(first, last, m, e, flags, base); + } + template + bool do_regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + BidiIterator base, + boost::mpl::int_<2> const*) + { + typedef u16_to_u32_iterator conv_type; + typedef match_results match_type; + //typedef typename match_type::allocator_type alloc_type; + match_type what; + bool result = ::boost::regex_search(conv_type(first, first, last), conv_type(last, first, last), what, e, flags, conv_type(base)); + // copy results across to m: + if (result) copy_results(m, what, e.get_named_subs()); + return result; + } + template + bool do_regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + BidiIterator base, + boost::mpl::int_<1> const*) + { + typedef u8_to_u32_iterator conv_type; + typedef match_results match_type; + //typedef typename match_type::allocator_type alloc_type; + match_type what; + bool result = ::boost::regex_search(conv_type(first, first, last), conv_type(last, first, last), what, e, flags, conv_type(base)); + // copy results across to m: + if (result) copy_results(m, what, e.get_named_subs()); + return result; + } + } + + template + inline bool u32regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(first, last, m, e, flags, first, static_cast const*>(0)); + } + template + inline bool u32regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + BidiIterator base) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(first, last, m, e, flags, base, static_cast const*>(0)); + } + inline bool u32regex_search(const UChar* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p + u_strlen(p), m, e, flags, p, static_cast const*>(0)); + } +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T && !defined(BOOST_NO_WREGEX) + inline bool u32regex_search(const wchar_t* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p + std::wcslen(p), m, e, flags, p, static_cast const*>(0)); + } +#endif + inline bool u32regex_search(const char* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p + std::strlen(p), m, e, flags, p, static_cast const*>(0)); + } + inline bool u32regex_search(const unsigned char* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p + std::strlen((const char*)p), m, e, flags, p, static_cast const*>(0)); + } + inline bool u32regex_search(const std::string& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.begin(), s.end(), m, e, flags, s.begin(), static_cast const*>(0)); + } +#ifndef BOOST_NO_STD_WSTRING + inline bool u32regex_search(const std::wstring& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.begin(), s.end(), m, e, flags, s.begin(), static_cast const*>(0)); + } +#endif + inline bool u32regex_search(const U_NAMESPACE_QUALIFIER UnicodeString& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.getBuffer(), s.getBuffer() + s.length(), m, e, flags, s.getBuffer(), static_cast const*>(0)); + } + template + inline bool u32regex_search(BidiIterator first, BidiIterator last, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(first, last, m, e, flags, first, static_cast const*>(0)); + } + inline bool u32regex_search(const UChar* p, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p + u_strlen(p), m, e, flags, p, static_cast const*>(0)); + } +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T && !defined(BOOST_NO_WREGEX) + inline bool u32regex_search(const wchar_t* p, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p + std::wcslen(p), m, e, flags, p, static_cast const*>(0)); + } +#endif + inline bool u32regex_search(const char* p, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p + std::strlen(p), m, e, flags, p, static_cast const*>(0)); + } + inline bool u32regex_search(const unsigned char* p, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p + std::strlen((const char*)p), m, e, flags, p, static_cast const*>(0)); + } + inline bool u32regex_search(const std::string& s, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.begin(), s.end(), m, e, flags, s.begin(), static_cast const*>(0)); + } +#ifndef BOOST_NO_STD_WSTRING + inline bool u32regex_search(const std::wstring& s, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.begin(), s.end(), m, e, flags, s.begin(), static_cast const*>(0)); + } +#endif + inline bool u32regex_search(const U_NAMESPACE_QUALIFIER UnicodeString& s, + const u32regex& e, + match_flag_type flags = match_default) + { + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.getBuffer(), s.getBuffer() + s.length(), m, e, flags, s.getBuffer(), static_cast const*>(0)); + } + + // + // overloads for regex_replace with utf-8 and utf-16 data types: + // + namespace BOOST_REGEX_DETAIL_NS { + template + inline std::pair< boost::u8_to_u32_iterator, boost::u8_to_u32_iterator > + make_utf32_seq(I i, I j, mpl::int_<1> const*) + { + return std::pair< boost::u8_to_u32_iterator, boost::u8_to_u32_iterator >(boost::u8_to_u32_iterator(i, i, j), boost::u8_to_u32_iterator(j, i, j)); + } + template + inline std::pair< boost::u16_to_u32_iterator, boost::u16_to_u32_iterator > + make_utf32_seq(I i, I j, mpl::int_<2> const*) + { + return std::pair< boost::u16_to_u32_iterator, boost::u16_to_u32_iterator >(boost::u16_to_u32_iterator(i, i, j), boost::u16_to_u32_iterator(j, i, j)); + } + template + inline std::pair< I, I > + make_utf32_seq(I i, I j, mpl::int_<4> const*) + { + return std::pair< I, I >(i, j); + } + template + inline std::pair< boost::u8_to_u32_iterator, boost::u8_to_u32_iterator > + make_utf32_seq(const charT* p, mpl::int_<1> const*) + { + std::size_t len = std::strlen((const char*)p); + return std::pair< boost::u8_to_u32_iterator, boost::u8_to_u32_iterator >(boost::u8_to_u32_iterator(p, p, p + len), boost::u8_to_u32_iterator(p + len, p, p + len)); + } + template + inline std::pair< boost::u16_to_u32_iterator, boost::u16_to_u32_iterator > + make_utf32_seq(const charT* p, mpl::int_<2> const*) + { + std::size_t len = u_strlen((const UChar*)p); + return std::pair< boost::u16_to_u32_iterator, boost::u16_to_u32_iterator >(boost::u16_to_u32_iterator(p, p, p + len), boost::u16_to_u32_iterator(p + len, p, p + len)); + } + template + inline std::pair< const charT*, const charT* > + make_utf32_seq(const charT* p, mpl::int_<4> const*) + { + return std::pair< const charT*, const charT* >(p, p + icu_regex_traits::length((UChar32 const*)p)); + } + template + inline OutputIterator make_utf32_out(OutputIterator o, mpl::int_<4> const*) + { + return o; + } + template + inline utf16_output_iterator make_utf32_out(OutputIterator o, mpl::int_<2> const*) + { + return o; + } + template + inline utf8_output_iterator make_utf32_out(OutputIterator o, mpl::int_<1> const*) + { + return o; + } + + template + OutputIterator do_regex_replace(OutputIterator out, + std::pair const& in, + const u32regex& e, + const std::pair& fmt, + match_flag_type flags + ) + { + // unfortunately we have to copy the format string in order to pass in onward: + std::vector f; +#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS + f.assign(fmt.first, fmt.second); +#else + f.clear(); + I2 pos = fmt.first; + while (pos != fmt.second) + f.push_back(*pos++); +#endif + + regex_iterator i(in.first, in.second, e, flags); + regex_iterator j; + if (i == j) + { + if (!(flags & regex_constants::format_no_copy)) + out = BOOST_REGEX_DETAIL_NS::copy(in.first, in.second, out); + } + else + { + I1 last_m = in.first; + while (i != j) + { + if (!(flags & regex_constants::format_no_copy)) + out = BOOST_REGEX_DETAIL_NS::copy(i->prefix().first, i->prefix().second, out); + if (!f.empty()) + out = ::boost::BOOST_REGEX_DETAIL_NS::regex_format_imp(out, *i, &*f.begin(), &*f.begin() + f.size(), flags, e.get_traits()); + else + out = ::boost::BOOST_REGEX_DETAIL_NS::regex_format_imp(out, *i, static_cast(0), static_cast(0), flags, e.get_traits()); + last_m = (*i)[0].second; + if (flags & regex_constants::format_first_only) + break; + ++i; + } + if (!(flags & regex_constants::format_no_copy)) + out = BOOST_REGEX_DETAIL_NS::copy(last_m, in.second, out); + } + return out; + } + template + inline const BaseIterator& extract_output_base(const BaseIterator& b) + { + return b; + } + template + inline BaseIterator extract_output_base(const utf8_output_iterator& b) + { + return b.base(); + } + template + inline BaseIterator extract_output_base(const utf16_output_iterator& b) + { + return b.base(); + } + } // BOOST_REGEX_DETAIL_NS + + template + inline OutputIterator u32regex_replace(OutputIterator out, + BidirectionalIterator first, + BidirectionalIterator last, + const u32regex& e, + const charT* fmt, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::extract_output_base + ( + BOOST_REGEX_DETAIL_NS::do_regex_replace( + BOOST_REGEX_DETAIL_NS::make_utf32_out(out, static_cast const*>(0)), + BOOST_REGEX_DETAIL_NS::make_utf32_seq(first, last, static_cast const*>(0)), + e, + BOOST_REGEX_DETAIL_NS::make_utf32_seq(fmt, static_cast const*>(0)), + flags) + ); + } + + template + inline OutputIterator u32regex_replace(OutputIterator out, + Iterator first, + Iterator last, + const u32regex& e, + const std::basic_string& fmt, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::extract_output_base + ( + BOOST_REGEX_DETAIL_NS::do_regex_replace( + BOOST_REGEX_DETAIL_NS::make_utf32_out(out, static_cast const*>(0)), + BOOST_REGEX_DETAIL_NS::make_utf32_seq(first, last, static_cast const*>(0)), + e, + BOOST_REGEX_DETAIL_NS::make_utf32_seq(fmt.begin(), fmt.end(), static_cast const*>(0)), + flags) + ); + } + + template + inline OutputIterator u32regex_replace(OutputIterator out, + Iterator first, + Iterator last, + const u32regex& e, + const U_NAMESPACE_QUALIFIER UnicodeString& fmt, + match_flag_type flags = match_default) + { + return BOOST_REGEX_DETAIL_NS::extract_output_base + ( + BOOST_REGEX_DETAIL_NS::do_regex_replace( + BOOST_REGEX_DETAIL_NS::make_utf32_out(out, static_cast const*>(0)), + BOOST_REGEX_DETAIL_NS::make_utf32_seq(first, last, static_cast const*>(0)), + e, + BOOST_REGEX_DETAIL_NS::make_utf32_seq(fmt.getBuffer(), fmt.getBuffer() + fmt.length(), static_cast const*>(0)), + flags) + ); + } + + template + std::basic_string u32regex_replace(const std::basic_string& s, + const u32regex& e, + const charT* fmt, + match_flag_type flags = match_default) + { + std::basic_string result; + BOOST_REGEX_DETAIL_NS::string_out_iterator > i(result); + u32regex_replace(i, s.begin(), s.end(), e, fmt, flags); + return result; + } + + template + std::basic_string u32regex_replace(const std::basic_string& s, + const u32regex& e, + const std::basic_string& fmt, + match_flag_type flags = match_default) + { + std::basic_string result; + BOOST_REGEX_DETAIL_NS::string_out_iterator > i(result); + u32regex_replace(i, s.begin(), s.end(), e, fmt.c_str(), flags); + return result; + } + + namespace BOOST_REGEX_DETAIL_NS { + + class unicode_string_out_iterator + { + U_NAMESPACE_QUALIFIER UnicodeString* out; + public: + unicode_string_out_iterator(U_NAMESPACE_QUALIFIER UnicodeString& s) : out(&s) {} + unicode_string_out_iterator& operator++() { return *this; } + unicode_string_out_iterator& operator++(int) { return *this; } + unicode_string_out_iterator& operator*() { return *this; } + unicode_string_out_iterator& operator=(UChar v) + { + *out += v; + return *this; + } + typedef std::ptrdiff_t difference_type; + typedef UChar value_type; + typedef value_type* pointer; + typedef value_type& reference; + typedef std::output_iterator_tag iterator_category; + }; + + } + + inline U_NAMESPACE_QUALIFIER UnicodeString u32regex_replace(const U_NAMESPACE_QUALIFIER UnicodeString& s, + const u32regex& e, + const UChar* fmt, + match_flag_type flags = match_default) + { + U_NAMESPACE_QUALIFIER UnicodeString result; + BOOST_REGEX_DETAIL_NS::unicode_string_out_iterator i(result); + u32regex_replace(i, s.getBuffer(), s.getBuffer() + s.length(), e, fmt, flags); + return result; + } + + inline U_NAMESPACE_QUALIFIER UnicodeString u32regex_replace(const U_NAMESPACE_QUALIFIER UnicodeString& s, + const u32regex& e, + const U_NAMESPACE_QUALIFIER UnicodeString& fmt, + match_flag_type flags = match_default) + { + U_NAMESPACE_QUALIFIER UnicodeString result; + BOOST_REGEX_DETAIL_NS::unicode_string_out_iterator i(result); + BOOST_REGEX_DETAIL_NS::do_regex_replace( + BOOST_REGEX_DETAIL_NS::make_utf32_out(i, static_cast const*>(0)), + BOOST_REGEX_DETAIL_NS::make_utf32_seq(s.getBuffer(), s.getBuffer() + s.length(), static_cast const*>(0)), + e, + BOOST_REGEX_DETAIL_NS::make_utf32_seq(fmt.getBuffer(), fmt.getBuffer() + fmt.length(), static_cast const*>(0)), + flags); + return result; + } + +} // namespace boost. + +#ifdef BOOST_MSVC +#pragma warning (pop) +#endif + +#include +#include + +#endif diff --git a/include/boost/regex/v4/object_cache.hpp b/include/boost/regex/v4/object_cache.hpp new file mode 100644 index 00000000..98ba4c19 --- /dev/null +++ b/include/boost/regex/v4/object_cache.hpp @@ -0,0 +1,171 @@ +/* + * + * Copyright (c) 2004 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE object_cache.hpp + * VERSION see + * DESCRIPTION: Implements a generic object cache. + */ + +#ifndef BOOST_REGEX_OBJECT_CACHE_HPP +#define BOOST_REGEX_OBJECT_CACHE_HPP + +#include +#include +#include +#include +#include +#include +#ifdef BOOST_HAS_THREADS +#include +#endif + +namespace boost{ + +template +class object_cache +{ +public: + typedef std::pair< ::boost::shared_ptr, Key const*> value_type; + typedef std::list list_type; + typedef typename list_type::iterator list_iterator; + typedef std::map map_type; + typedef typename map_type::iterator map_iterator; + typedef typename list_type::size_type size_type; + static boost::shared_ptr get(const Key& k, size_type l_max_cache_size); + +private: + static boost::shared_ptr do_get(const Key& k, size_type l_max_cache_size); + + struct data + { + list_type cont; + map_type index; + }; + + // Needed by compilers not implementing the resolution to DR45. For reference, + // see http://www.open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#45. + friend struct data; +}; + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable: 4702) +#endif +template +boost::shared_ptr object_cache::get(const Key& k, size_type l_max_cache_size) +{ +#ifdef BOOST_HAS_THREADS + static boost::static_mutex mut = BOOST_STATIC_MUTEX_INIT; + boost::static_mutex::scoped_lock l(mut); + if (l) + { + return do_get(k, l_max_cache_size); + } + // + // what do we do if the lock fails? + // for now just throw, but we should never really get here... + // + ::boost::throw_exception(std::runtime_error("Error in thread safety code: could not acquire a lock")); +#if defined(BOOST_NO_UNREACHABLE_RETURN_DETECTION) || defined(BOOST_NO_EXCEPTIONS) + return boost::shared_ptr(); +#endif +#else + return do_get(k, l_max_cache_size); +#endif +} +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +template +boost::shared_ptr object_cache::do_get(const Key& k, size_type l_max_cache_size) +{ + typedef typename object_cache::data object_data; + typedef typename map_type::size_type map_size_type; + static object_data s_data; + + // + // see if the object is already in the cache: + // + map_iterator mpos = s_data.index.find(k); + if(mpos != s_data.index.end()) + { + // + // Eureka! + // We have a cached item, bump it up the list and return it: + // + if(--(s_data.cont.end()) != mpos->second) + { + // splice out the item we want to move: + list_type temp; + temp.splice(temp.end(), s_data.cont, mpos->second); + // and now place it at the end of the list: + s_data.cont.splice(s_data.cont.end(), temp, temp.begin()); + BOOST_REGEX_ASSERT(*(s_data.cont.back().second) == k); + // update index with new position: + mpos->second = --(s_data.cont.end()); + BOOST_REGEX_ASSERT(&(mpos->first) == mpos->second->second); + BOOST_REGEX_ASSERT(&(mpos->first) == s_data.cont.back().second); + } + return s_data.cont.back().first; + } + // + // if we get here then the item is not in the cache, + // so create it: + // + boost::shared_ptr result(new Object(k)); + // + // Add it to the list, and index it: + // + s_data.cont.push_back(value_type(result, static_cast(0))); + s_data.index.insert(std::make_pair(k, --(s_data.cont.end()))); + s_data.cont.back().second = &(s_data.index.find(k)->first); + map_size_type s = s_data.index.size(); + BOOST_REGEX_ASSERT(s_data.index[k]->first.get() == result.get()); + BOOST_REGEX_ASSERT(&(s_data.index.find(k)->first) == s_data.cont.back().second); + BOOST_REGEX_ASSERT(s_data.index.find(k)->first == k); + if(s > l_max_cache_size) + { + // + // We have too many items in the list, so we need to start + // popping them off the back of the list, but only if they're + // being held uniquely by us: + // + list_iterator pos = s_data.cont.begin(); + list_iterator last = s_data.cont.end(); + while((pos != last) && (s > l_max_cache_size)) + { + if(pos->first.unique()) + { + list_iterator condemmed(pos); + ++pos; + // now remove the items from our containers, + // then order has to be as follows: + BOOST_REGEX_ASSERT(s_data.index.find(*(condemmed->second)) != s_data.index.end()); + s_data.index.erase(*(condemmed->second)); + s_data.cont.erase(condemmed); + --s; + } + else + ++pos; + } + BOOST_REGEX_ASSERT(s_data.index[k]->first.get() == result.get()); + BOOST_REGEX_ASSERT(&(s_data.index.find(k)->first) == s_data.cont.back().second); + BOOST_REGEX_ASSERT(s_data.index.find(k)->first == k); + } + return result; +} + +} + +#endif diff --git a/include/boost/regex/v4/pattern_except.hpp b/include/boost/regex/v4/pattern_except.hpp new file mode 100644 index 00000000..56e83cc9 --- /dev/null +++ b/include/boost/regex/v4/pattern_except.hpp @@ -0,0 +1,127 @@ +/* + * + * Copyright (c) 1998-2002 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE pattern_except.hpp + * VERSION see + * DESCRIPTION: Declares pattern-matching exception classes. + */ + +#ifndef BOOST_RE_V4_PAT_EXCEPT_HPP +#define BOOST_RE_V4_PAT_EXCEPT_HPP + +#ifndef BOOST_REGEX_CONFIG_HPP +#include +#endif + +#include +#include +#include +#include + +namespace boost{ + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable: 4103) +#endif +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_PREFIX +#endif +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable : 4275) +#if BOOST_MSVC >= 1800 +#pragma warning(disable : 26812) +#endif +#endif +class regex_error : public std::runtime_error +{ +public: + explicit regex_error(const std::string& s, regex_constants::error_type err = regex_constants::error_unknown, std::ptrdiff_t pos = 0) + : std::runtime_error(s) + , m_error_code(err) + , m_position(pos) + { + } + explicit regex_error(regex_constants::error_type err) + : std::runtime_error(::boost::BOOST_REGEX_DETAIL_NS::get_default_error_string(err)) + , m_error_code(err) + , m_position(0) + { + } + ~regex_error() BOOST_NOEXCEPT_OR_NOTHROW BOOST_OVERRIDE {} + regex_constants::error_type code()const + { return m_error_code; } + std::ptrdiff_t position()const + { return m_position; } + void raise()const + { +#ifndef BOOST_NO_EXCEPTIONS +#ifndef BOOST_REGEX_STANDALONE + ::boost::throw_exception(*this); +#else + throw* this; +#endif +#endif + } +private: + regex_constants::error_type m_error_code; + std::ptrdiff_t m_position; +}; + +typedef regex_error bad_pattern; +typedef regex_error bad_expression; + +namespace BOOST_REGEX_DETAIL_NS{ + +inline void BOOST_REGEX_CALL raise_runtime_error(const std::runtime_error& ex) +{ +#ifndef BOOST_REGEX_STANDALONE + ::boost::throw_exception(ex); +#else + throw ex; +#endif +} + +template +void raise_error(const traits& t, regex_constants::error_type code) +{ + (void)t; // warning suppression + std::runtime_error e(t.error_string(code)); + ::boost::BOOST_REGEX_DETAIL_NS::raise_runtime_error(e); +} + +} + +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable: 4103) +#endif +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_SUFFIX +#endif +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +} // namespace boost + +#endif diff --git a/include/boost/regex/v4/unicode_iterator.hpp b/include/boost/regex/v4/unicode_iterator.hpp new file mode 100644 index 00000000..37fa5102 --- /dev/null +++ b/include/boost/regex/v4/unicode_iterator.hpp @@ -0,0 +1,790 @@ +/* + * + * Copyright (c) 2004 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE unicode_iterator.hpp + * VERSION see + * DESCRIPTION: Iterator adapters for converting between different Unicode encodings. + */ + +/**************************************************************************** + +Contents: +~~~~~~~~~ + +1) Read Only, Input Adapters: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +class u32_to_u8_iterator; + +Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8. + +template +class u8_to_u32_iterator; + +Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32. + +template +class u32_to_u16_iterator; + +Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16. + +template +class u16_to_u32_iterator; + +Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32. + +2) Single pass output iterator adapters: + +template +class utf8_output_iterator; + +Accepts UTF-32 code points and forwards them on as UTF-8 code points. + +template +class utf16_output_iterator; + +Accepts UTF-32 code points and forwards them on as UTF-16 code points. + +****************************************************************************/ + +#ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP +#define BOOST_REGEX_UNICODE_ITERATOR_HPP +#include +#include +#include +#include +#include +#include +#ifndef BOOST_NO_STD_LOCALE +#include +#include +#endif +#include // CHAR_BIT + +#ifdef BOOST_REGEX_CXX03 + +#else +#endif + +namespace boost{ + +namespace detail{ + +static const ::boost::uint16_t high_surrogate_base = 0xD7C0u; +static const ::boost::uint16_t low_surrogate_base = 0xDC00u; +static const ::boost::uint32_t ten_bit_mask = 0x3FFu; + +inline bool is_high_surrogate(::boost::uint16_t v) +{ + return (v & 0xFFFFFC00u) == 0xd800u; +} +inline bool is_low_surrogate(::boost::uint16_t v) +{ + return (v & 0xFFFFFC00u) == 0xdc00u; +} +template +inline bool is_surrogate(T v) +{ + return (v & 0xFFFFF800u) == 0xd800; +} + +inline unsigned utf8_byte_count(boost::uint8_t c) +{ + // if the most significant bit with a zero in it is in position + // 8-N then there are N bytes in this UTF-8 sequence: + boost::uint8_t mask = 0x80u; + unsigned result = 0; + while(c & mask) + { + ++result; + mask >>= 1; + } + return (result == 0) ? 1 : ((result > 4) ? 4 : result); +} + +inline unsigned utf8_trailing_byte_count(boost::uint8_t c) +{ + return utf8_byte_count(c) - 1; +} + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable:4100) +#endif +#ifndef BOOST_NO_EXCEPTIONS +BOOST_NORETURN +#endif +inline void invalid_utf32_code_point(::boost::uint32_t val) +{ +#ifndef BOOST_NO_STD_LOCALE + std::stringstream ss; + ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence"; + std::out_of_range e(ss.str()); +#else + std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence"); +#endif + boost::throw_exception(e); +} +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + + +} // namespace detail + +template +class u32_to_u16_iterator + : public boost::iterator_facade, U16Type, std::bidirectional_iterator_tag, const U16Type> +{ + typedef boost::iterator_facade, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type; + +#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) + typedef typename std::iterator_traits::value_type base_value_type; + + BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32); + BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16); +#endif + +public: + typename base_type::reference + dereference()const + { + if(m_current == 2) + extract_current(); + return m_values[m_current]; + } + bool equal(const u32_to_u16_iterator& that)const + { + if(m_position == that.m_position) + { + // Both m_currents must be equal, or both even + // this is the same as saying their sum must be even: + return (m_current + that.m_current) & 1u ? false : true; + } + return false; + } + void increment() + { + // if we have a pending read then read now, so that we know whether + // to skip a position, or move to a low-surrogate: + if(m_current == 2) + { + // pending read: + extract_current(); + } + // move to the next surrogate position: + ++m_current; + // if we've reached the end skip a position: + if(m_values[m_current] == 0) + { + m_current = 2; + ++m_position; + } + } + void decrement() + { + if(m_current != 1) + { + // decrementing an iterator always leads to a valid position: + --m_position; + extract_current(); + m_current = m_values[1] ? 1 : 0; + } + else + { + m_current = 0; + } + } + BaseIterator base()const + { + return m_position; + } + // construct: + u32_to_u16_iterator() : m_position(), m_current(0) + { + m_values[0] = 0; + m_values[1] = 0; + m_values[2] = 0; + } + u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2) + { + m_values[0] = 0; + m_values[1] = 0; + m_values[2] = 0; + } +private: + + void extract_current()const + { + // begin by checking for a code point out of range: + ::boost::uint32_t v = *m_position; + if(v >= 0x10000u) + { + if(v > 0x10FFFFu) + detail::invalid_utf32_code_point(*m_position); + // split into two surrogates: + m_values[0] = static_cast(v >> 10) + detail::high_surrogate_base; + m_values[1] = static_cast(v & detail::ten_bit_mask) + detail::low_surrogate_base; + m_current = 0; + BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0])); + BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1])); + } + else + { + // 16-bit code point: + m_values[0] = static_cast(*m_position); + m_values[1] = 0; + m_current = 0; + // value must not be a surrogate: + if(detail::is_surrogate(m_values[0])) + detail::invalid_utf32_code_point(*m_position); + } + } + BaseIterator m_position; + mutable U16Type m_values[3]; + mutable unsigned m_current; +}; + +template +class u16_to_u32_iterator + : public boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> +{ + typedef boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type; + // special values for pending iterator reads: + BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu); + +#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) + typedef typename std::iterator_traits::value_type base_value_type; + + BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16); + BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32); +#endif + +public: + typename base_type::reference + dereference()const + { + if(m_value == pending_read) + extract_current(); + return m_value; + } + bool equal(const u16_to_u32_iterator& that)const + { + return m_position == that.m_position; + } + void increment() + { + // skip high surrogate first if there is one: + if(detail::is_high_surrogate(*m_position)) ++m_position; + ++m_position; + m_value = pending_read; + } + void decrement() + { + --m_position; + // if we have a low surrogate then go back one more: + if(detail::is_low_surrogate(*m_position)) + --m_position; + m_value = pending_read; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u16_to_u32_iterator() : m_position() + { + m_value = pending_read; + } + u16_to_u32_iterator(BaseIterator b) : m_position(b) + { + m_value = pending_read; + } + // + // Range checked version: + // + u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b) + { + m_value = pending_read; + // + // The range must not start with a low surrogate, or end in a high surrogate, + // otherwise we run the risk of running outside the underlying input range. + // Likewise b must not be located at a low surrogate. + // + boost::uint16_t val; + if(start != end) + { + if((b != start) && (b != end)) + { + val = *b; + if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u)) + invalid_code_point(val); + } + val = *start; + if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u)) + invalid_code_point(val); + val = *--end; + if(detail::is_high_surrogate(val)) + invalid_code_point(val); + } + } +private: + static void invalid_code_point(::boost::uint16_t val) + { +#ifndef BOOST_NO_STD_LOCALE + std::stringstream ss; + ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence"; + std::out_of_range e(ss.str()); +#else + std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence"); +#endif + boost::throw_exception(e); + } + void extract_current()const + { + m_value = static_cast(static_cast< ::boost::uint16_t>(*m_position)); + // if the last value is a high surrogate then adjust m_position and m_value as needed: + if(detail::is_high_surrogate(*m_position)) + { + // precondition; next value must have be a low-surrogate: + BaseIterator next(m_position); + ::boost::uint16_t t = *++next; + if((t & 0xFC00u) != 0xDC00u) + invalid_code_point(t); + m_value = (m_value - detail::high_surrogate_base) << 10; + m_value |= (static_cast(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask); + } + // postcondition; result must not be a surrogate: + if(detail::is_surrogate(m_value)) + invalid_code_point(static_cast< ::boost::uint16_t>(m_value)); + } + BaseIterator m_position; + mutable U32Type m_value; +}; + +template +class u32_to_u8_iterator + : public boost::iterator_facade, U8Type, std::bidirectional_iterator_tag, const U8Type> +{ + typedef boost::iterator_facade, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type; + +#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) + typedef typename std::iterator_traits::value_type base_value_type; + + BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32); + BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8); +#endif + +public: + typename base_type::reference + dereference()const + { + if(m_current == 4) + extract_current(); + return m_values[m_current]; + } + bool equal(const u32_to_u8_iterator& that)const + { + if(m_position == that.m_position) + { + // either the m_current's must be equal, or one must be 0 and + // the other 4: which means neither must have bits 1 or 2 set: + return (m_current == that.m_current) + || (((m_current | that.m_current) & 3) == 0); + } + return false; + } + void increment() + { + // if we have a pending read then read now, so that we know whether + // to skip a position, or move to a low-surrogate: + if(m_current == 4) + { + // pending read: + extract_current(); + } + // move to the next surrogate position: + ++m_current; + // if we've reached the end skip a position: + if(m_values[m_current] == 0) + { + m_current = 4; + ++m_position; + } + } + void decrement() + { + if((m_current & 3) == 0) + { + --m_position; + extract_current(); + m_current = 3; + while(m_current && (m_values[m_current] == 0)) + --m_current; + } + else + --m_current; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u32_to_u8_iterator() : m_position(), m_current(0) + { + m_values[0] = 0; + m_values[1] = 0; + m_values[2] = 0; + m_values[3] = 0; + m_values[4] = 0; + } + u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4) + { + m_values[0] = 0; + m_values[1] = 0; + m_values[2] = 0; + m_values[3] = 0; + m_values[4] = 0; + } +private: + + void extract_current()const + { + boost::uint32_t c = *m_position; + if(c > 0x10FFFFu) + detail::invalid_utf32_code_point(c); + if(c < 0x80u) + { + m_values[0] = static_cast(c); + m_values[1] = static_cast(0u); + m_values[2] = static_cast(0u); + m_values[3] = static_cast(0u); + } + else if(c < 0x800u) + { + m_values[0] = static_cast(0xC0u + (c >> 6)); + m_values[1] = static_cast(0x80u + (c & 0x3Fu)); + m_values[2] = static_cast(0u); + m_values[3] = static_cast(0u); + } + else if(c < 0x10000u) + { + m_values[0] = static_cast(0xE0u + (c >> 12)); + m_values[1] = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + m_values[2] = static_cast(0x80u + (c & 0x3Fu)); + m_values[3] = static_cast(0u); + } + else + { + m_values[0] = static_cast(0xF0u + (c >> 18)); + m_values[1] = static_cast(0x80u + ((c >> 12) & 0x3Fu)); + m_values[2] = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + m_values[3] = static_cast(0x80u + (c & 0x3Fu)); + } + m_current= 0; + } + BaseIterator m_position; + mutable U8Type m_values[5]; + mutable unsigned m_current; +}; + +template +class u8_to_u32_iterator + : public boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> +{ + typedef boost::iterator_facade, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type; + // special values for pending iterator reads: + BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu); + +#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) + typedef typename std::iterator_traits::value_type base_value_type; + + BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8); + BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32); +#endif + +public: + typename base_type::reference + dereference()const + { + if(m_value == pending_read) + extract_current(); + return m_value; + } + bool equal(const u8_to_u32_iterator& that)const + { + return m_position == that.m_position; + } + void increment() + { + // We must not start with a continuation character: + if((static_cast(*m_position) & 0xC0) == 0x80) + invalid_sequence(); + // skip high surrogate first if there is one: + unsigned c = detail::utf8_byte_count(*m_position); + if(m_value == pending_read) + { + // Since we haven't read in a value, we need to validate the code points: + for(unsigned i = 0; i < c; ++i) + { + ++m_position; + // We must have a continuation byte: + if((i != c - 1) && ((static_cast(*m_position) & 0xC0) != 0x80)) + invalid_sequence(); + } + } + else + { + std::advance(m_position, c); + } + m_value = pending_read; + } + void decrement() + { + // Keep backtracking until we don't have a trailing character: + unsigned count = 0; + while((*--m_position & 0xC0u) == 0x80u) ++count; + // now check that the sequence was valid: + if(count != detail::utf8_trailing_byte_count(*m_position)) + invalid_sequence(); + m_value = pending_read; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u8_to_u32_iterator() : m_position() + { + m_value = pending_read; + } + u8_to_u32_iterator(BaseIterator b) : m_position(b) + { + m_value = pending_read; + } + // + // Checked constructor: + // + u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b) + { + m_value = pending_read; + // + // We must not start with a continuation character, or end with a + // truncated UTF-8 sequence otherwise we run the risk of going past + // the start/end of the underlying sequence: + // + if(start != end) + { + unsigned char v = *start; + if((v & 0xC0u) == 0x80u) + invalid_sequence(); + if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u)) + invalid_sequence(); + BaseIterator pos = end; + do + { + v = *--pos; + } + while((start != pos) && ((v & 0xC0u) == 0x80u)); + std::ptrdiff_t extra = detail::utf8_byte_count(v); + if(std::distance(pos, end) < extra) + invalid_sequence(); + } + } +private: + static void invalid_sequence() + { + std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character"); + boost::throw_exception(e); + } + void extract_current()const + { + m_value = static_cast(static_cast< ::boost::uint8_t>(*m_position)); + // we must not have a continuation character: + if((m_value & 0xC0u) == 0x80u) + invalid_sequence(); + // see how many extra bytes we have: + unsigned extra = detail::utf8_trailing_byte_count(*m_position); + // extract the extra bits, 6 from each extra byte: + BaseIterator next(m_position); + for(unsigned c = 0; c < extra; ++c) + { + ++next; + m_value <<= 6; + // We must have a continuation byte: + if((static_cast(*next) & 0xC0) != 0x80) + invalid_sequence(); + m_value += static_cast(*next) & 0x3Fu; + } + // we now need to remove a few of the leftmost bits, but how many depends + // upon how many extra bytes we've extracted: + static const boost::uint32_t masks[4] = + { + 0x7Fu, + 0x7FFu, + 0xFFFFu, + 0x1FFFFFu, + }; + m_value &= masks[extra]; + // check the result is in range: + if(m_value > static_cast(0x10FFFFu)) + invalid_sequence(); + // The result must not be a surrogate: + if((m_value >= static_cast(0xD800)) && (m_value <= static_cast(0xDFFF))) + invalid_sequence(); + // We should not have had an invalidly encoded UTF8 sequence: + if((extra > 0) && (m_value <= static_cast(masks[extra - 1]))) + invalid_sequence(); + } + BaseIterator m_position; + mutable U32Type m_value; +}; + +template +class utf16_output_iterator +{ +public: + typedef void difference_type; + typedef void value_type; + typedef boost::uint32_t* pointer; + typedef boost::uint32_t& reference; + typedef std::output_iterator_tag iterator_category; + + utf16_output_iterator(const BaseIterator& b) + : m_position(b){} + utf16_output_iterator(const utf16_output_iterator& that) + : m_position(that.m_position){} + utf16_output_iterator& operator=(const utf16_output_iterator& that) + { + m_position = that.m_position; + return *this; + } + const utf16_output_iterator& operator*()const + { + return *this; + } + void operator=(boost::uint32_t val)const + { + push(val); + } + utf16_output_iterator& operator++() + { + return *this; + } + utf16_output_iterator& operator++(int) + { + return *this; + } + BaseIterator base()const + { + return m_position; + } +private: + void push(boost::uint32_t v)const + { + if(v >= 0x10000u) + { + // begin by checking for a code point out of range: + if(v > 0x10FFFFu) + detail::invalid_utf32_code_point(v); + // split into two surrogates: + *m_position++ = static_cast(v >> 10) + detail::high_surrogate_base; + *m_position++ = static_cast(v & detail::ten_bit_mask) + detail::low_surrogate_base; + } + else + { + // 16-bit code point: + // value must not be a surrogate: + if(detail::is_surrogate(v)) + detail::invalid_utf32_code_point(v); + *m_position++ = static_cast(v); + } + } + mutable BaseIterator m_position; +}; + +template +class utf8_output_iterator +{ +public: + typedef void difference_type; + typedef void value_type; + typedef boost::uint32_t* pointer; + typedef boost::uint32_t& reference; + typedef std::output_iterator_tag iterator_category; + + utf8_output_iterator(const BaseIterator& b) + : m_position(b){} + utf8_output_iterator(const utf8_output_iterator& that) + : m_position(that.m_position){} + utf8_output_iterator& operator=(const utf8_output_iterator& that) + { + m_position = that.m_position; + return *this; + } + const utf8_output_iterator& operator*()const + { + return *this; + } + void operator=(boost::uint32_t val)const + { + push(val); + } + utf8_output_iterator& operator++() + { + return *this; + } + utf8_output_iterator& operator++(int) + { + return *this; + } + BaseIterator base()const + { + return m_position; + } +private: + void push(boost::uint32_t c)const + { + if(c > 0x10FFFFu) + detail::invalid_utf32_code_point(c); + if(c < 0x80u) + { + *m_position++ = static_cast(c); + } + else if(c < 0x800u) + { + *m_position++ = static_cast(0xC0u + (c >> 6)); + *m_position++ = static_cast(0x80u + (c & 0x3Fu)); + } + else if(c < 0x10000u) + { + *m_position++ = static_cast(0xE0u + (c >> 12)); + *m_position++ = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + *m_position++ = static_cast(0x80u + (c & 0x3Fu)); + } + else + { + *m_position++ = static_cast(0xF0u + (c >> 18)); + *m_position++ = static_cast(0x80u + ((c >> 12) & 0x3Fu)); + *m_position++ = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + *m_position++ = static_cast(0x80u + (c & 0x3Fu)); + } + } + mutable BaseIterator m_position; +}; + +} // namespace boost + +#endif // BOOST_REGEX_UNICODE_ITERATOR_HPP + diff --git a/include/boost/regex/v5/icu.hpp b/include/boost/regex/v5/icu.hpp new file mode 100644 index 00000000..bc21849b --- /dev/null +++ b/include/boost/regex/v5/icu.hpp @@ -0,0 +1,1402 @@ +/* + * + * Copyright (c) 2004 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE icu.hpp + * VERSION see + * DESCRIPTION: Unicode regular expressions on top of the ICU Library. + */ + +#ifndef BOOST_REGEX_ICU_V5_HPP +#define BOOST_REGEX_ICU_V5_HPP + +#include +#include +#include +#include +#include +#include +#include + +#ifdef BOOST_MSVC +#pragma warning (push) +#pragma warning (disable: 4251) +#endif + +namespace boost{ + +namespace BOOST_REGEX_DETAIL_NS{ + +// +// Implementation details: +// +class icu_regex_traits_implementation +{ + typedef UChar32 char_type; + typedef std::size_t size_type; + typedef std::vector string_type; + typedef U_NAMESPACE_QUALIFIER Locale locale_type; + typedef std::uint_least32_t char_class_type; +public: + icu_regex_traits_implementation(const U_NAMESPACE_QUALIFIER Locale& l) + : m_locale(l) + { + UErrorCode success = U_ZERO_ERROR; + m_collator.reset(U_NAMESPACE_QUALIFIER Collator::createInstance(l, success)); + if(U_SUCCESS(success) == 0) + init_error(); + m_collator->setStrength(U_NAMESPACE_QUALIFIER Collator::IDENTICAL); + success = U_ZERO_ERROR; + m_primary_collator.reset(U_NAMESPACE_QUALIFIER Collator::createInstance(l, success)); + if(U_SUCCESS(success) == 0) + init_error(); + m_primary_collator->setStrength(U_NAMESPACE_QUALIFIER Collator::PRIMARY); + } + U_NAMESPACE_QUALIFIER Locale getloc()const + { + return m_locale; + } + string_type do_transform(const char_type* p1, const char_type* p2, const U_NAMESPACE_QUALIFIER Collator* pcoll) const + { + // TODO make thread safe!!!! : + typedef u32_to_u16_iterator itt; + itt i(p1), j(p2); + std::vector< ::UChar> t(i, j); + std::uint8_t result[100]; + std::int32_t len; + if (!t.empty()) + len = pcoll->getSortKey(&*t.begin(), static_cast(t.size()), result, sizeof(result)); + else + len = pcoll->getSortKey(static_cast(0), static_cast(0), result, sizeof(result)); + if (std::size_t(len) > sizeof(result)) + { + std::unique_ptr< std::uint8_t[]> presult(new ::uint8_t[len + 1]); + if (!t.empty()) + len = pcoll->getSortKey(&*t.begin(), static_cast(t.size()), presult.get(), len + 1); + else + len = pcoll->getSortKey(static_cast(0), static_cast(0), presult.get(), len + 1); + if ((0 == presult[len - 1]) && (len > 1)) + --len; + return string_type(presult.get(), presult.get() + len); + } + if ((0 == result[len - 1]) && (len > 1)) + --len; + return string_type(result, result + len); + } + string_type transform(const char_type* p1, const char_type* p2) const + { + return do_transform(p1, p2, m_collator.get()); + } + string_type transform_primary(const char_type* p1, const char_type* p2) const + { + return do_transform(p1, p2, m_primary_collator.get()); + } +private: + void init_error() + { + std::runtime_error e("Could not initialize ICU resources"); +#ifndef BOOST_REGEX_STANDALONE + boost::throw_exception(e); +#else + throw e; +#endif + } + U_NAMESPACE_QUALIFIER Locale m_locale; // The ICU locale that we're using + std::unique_ptr< U_NAMESPACE_QUALIFIER Collator> m_collator; // The full collation object + std::unique_ptr< U_NAMESPACE_QUALIFIER Collator> m_primary_collator; // The primary collation object +}; +inline std::shared_ptr get_icu_regex_traits_implementation(const U_NAMESPACE_QUALIFIER Locale& loc) +{ + return std::shared_ptr(new icu_regex_traits_implementation(loc)); +} + +} + +class icu_regex_traits +{ +public: + typedef UChar32 char_type; + typedef std::size_t size_type; + typedef std::vector string_type; + typedef U_NAMESPACE_QUALIFIER Locale locale_type; + typedef std::uint64_t char_class_type; + + struct boost_extensions_tag{}; + + icu_regex_traits() + : m_pimpl(BOOST_REGEX_DETAIL_NS::get_icu_regex_traits_implementation(U_NAMESPACE_QUALIFIER Locale())) + { + } + static size_type length(const char_type* p) + { + size_type result = 0; + while (*p) + { + ++p; + ++result; + } + return result; + } + + ::boost::regex_constants::syntax_type syntax_type(char_type c)const + { + return ((c < 0x7f) && (c > 0)) ? BOOST_REGEX_DETAIL_NS::get_default_syntax_type(static_cast(c)) : regex_constants::syntax_char; + } + ::boost::regex_constants::escape_syntax_type escape_syntax_type(char_type c) const + { + return ((c < 0x7f) && (c > 0)) ? BOOST_REGEX_DETAIL_NS::get_default_escape_syntax_type(static_cast(c)) : regex_constants::syntax_char; + } + char_type translate(char_type c) const + { + return c; + } + char_type translate_nocase(char_type c) const + { + return ::u_tolower(c); + } + char_type translate(char_type c, bool icase) const + { + return icase ? translate_nocase(c) : translate(c); + } + char_type tolower(char_type c) const + { + return ::u_tolower(c); + } + char_type toupper(char_type c) const + { + return ::u_toupper(c); + } + string_type transform(const char_type* p1, const char_type* p2) const + { + return m_pimpl->transform(p1, p2); + } + string_type transform_primary(const char_type* p1, const char_type* p2) const + { + return m_pimpl->transform_primary(p1, p2); + } + char_class_type lookup_classname(const char_type* p1, const char_type* p2) const + { + constexpr char_class_type mask_blank = char_class_type(1) << offset_blank; + constexpr char_class_type mask_space = char_class_type(1) << offset_space; + constexpr char_class_type mask_xdigit = char_class_type(1) << offset_xdigit; + constexpr char_class_type mask_underscore = char_class_type(1) << offset_underscore; + constexpr char_class_type mask_unicode = char_class_type(1) << offset_unicode; + constexpr char_class_type mask_any = char_class_type(1) << offset_any; + constexpr char_class_type mask_ascii = char_class_type(1) << offset_ascii; + constexpr char_class_type mask_horizontal = char_class_type(1) << offset_horizontal; + constexpr char_class_type mask_vertical = char_class_type(1) << offset_vertical; + + static const char_class_type masks[] = + { + 0, + U_GC_L_MASK | U_GC_ND_MASK, + U_GC_L_MASK, + mask_blank, + U_GC_CC_MASK | U_GC_CF_MASK | U_GC_ZL_MASK | U_GC_ZP_MASK, + U_GC_ND_MASK, + U_GC_ND_MASK, + (0x3FFFFFFFu) & ~(U_GC_CC_MASK | U_GC_CF_MASK | U_GC_CS_MASK | U_GC_CN_MASK | U_GC_Z_MASK), + mask_horizontal, + U_GC_LL_MASK, + U_GC_LL_MASK, + ~(U_GC_C_MASK), + U_GC_P_MASK, + char_class_type(U_GC_Z_MASK) | mask_space, + char_class_type(U_GC_Z_MASK) | mask_space, + U_GC_LU_MASK, + mask_unicode, + U_GC_LU_MASK, + mask_vertical, + char_class_type(U_GC_L_MASK | U_GC_ND_MASK | U_GC_MN_MASK) | mask_underscore, + char_class_type(U_GC_L_MASK | U_GC_ND_MASK | U_GC_MN_MASK) | mask_underscore, + char_class_type(U_GC_ND_MASK) | mask_xdigit, + }; + + int idx = ::boost::BOOST_REGEX_DETAIL_NS::get_default_class_id(p1, p2); + if (idx >= 0) + return masks[idx + 1]; + char_class_type result = lookup_icu_mask(p1, p2); + if (result != 0) + return result; + + if (idx < 0) + { + string_type s(p1, p2); + string_type::size_type i = 0; + while (i < s.size()) + { + s[i] = static_cast((::u_tolower)(s[i])); + if (::u_isspace(s[i]) || (s[i] == '-') || (s[i] == '_')) + s.erase(s.begin() + i, s.begin() + i + 1); + else + { + s[i] = static_cast((::u_tolower)(s[i])); + ++i; + } + } + if (!s.empty()) + idx = ::boost::BOOST_REGEX_DETAIL_NS::get_default_class_id(&*s.begin(), &*s.begin() + s.size()); + if (idx >= 0) + return masks[idx + 1]; + if (!s.empty()) + result = lookup_icu_mask(&*s.begin(), &*s.begin() + s.size()); + if (result != 0) + return result; + } + BOOST_REGEX_ASSERT(std::size_t(idx + 1) < sizeof(masks) / sizeof(masks[0])); + return masks[idx + 1]; + } + string_type lookup_collatename(const char_type* p1, const char_type* p2) const + { + string_type result; + if (std::find_if(p1, p2, std::bind(std::greater< ::UChar32>(), std::placeholders::_1, 0x7f)) == p2) + { + std::string s(p1, p2); + // Try Unicode name: + UErrorCode err = U_ZERO_ERROR; + UChar32 c = ::u_charFromName(U_UNICODE_CHAR_NAME, s.c_str(), &err); + if (U_SUCCESS(err)) + { + result.push_back(c); + return result; + } + // Try Unicode-extended name: + err = U_ZERO_ERROR; + c = ::u_charFromName(U_EXTENDED_CHAR_NAME, s.c_str(), &err); + if (U_SUCCESS(err)) + { + result.push_back(c); + return result; + } + // try POSIX name: + s = ::boost::BOOST_REGEX_DETAIL_NS::lookup_default_collate_name(s); + result.assign(s.begin(), s.end()); + } + if (result.empty() && (p2 - p1 == 1)) + result.push_back(*p1); + return result; + } + bool isctype(char_type c, char_class_type f) const + { + constexpr char_class_type mask_blank = char_class_type(1) << offset_blank; + constexpr char_class_type mask_space = char_class_type(1) << offset_space; + constexpr char_class_type mask_xdigit = char_class_type(1) << offset_xdigit; + constexpr char_class_type mask_underscore = char_class_type(1) << offset_underscore; + constexpr char_class_type mask_unicode = char_class_type(1) << offset_unicode; + constexpr char_class_type mask_any = char_class_type(1) << offset_any; + constexpr char_class_type mask_ascii = char_class_type(1) << offset_ascii; + constexpr char_class_type mask_horizontal = char_class_type(1) << offset_horizontal; + constexpr char_class_type mask_vertical = char_class_type(1) << offset_vertical; + + // check for standard catagories first: + char_class_type m = char_class_type(static_cast(1) << u_charType(c)); + if ((m & f) != 0) + return true; + // now check for special cases: + if (((f & mask_blank) != 0) && u_isblank(c)) + return true; + if (((f & mask_space) != 0) && u_isspace(c)) + return true; + if (((f & mask_xdigit) != 0) && (u_digit(c, 16) >= 0)) + return true; + if (((f & mask_unicode) != 0) && (c >= 0x100)) + return true; + if (((f & mask_underscore) != 0) && (c == '_')) + return true; + if (((f & mask_any) != 0) && (c <= 0x10FFFF)) + return true; + if (((f & mask_ascii) != 0) && (c <= 0x7F)) + return true; + if (((f & mask_vertical) != 0) && (::boost::BOOST_REGEX_DETAIL_NS::is_separator(c) || (c == static_cast('\v')) || (m == U_GC_ZL_MASK) || (m == U_GC_ZP_MASK))) + return true; + if (((f & mask_horizontal) != 0) && !::boost::BOOST_REGEX_DETAIL_NS::is_separator(c) && u_isspace(c) && (c != static_cast('\v'))) + return true; + return false; + } + std::intmax_t toi(const char_type*& p1, const char_type* p2, int radix)const + { + return BOOST_REGEX_DETAIL_NS::global_toi(p1, p2, radix, *this); + } + int value(char_type c, int radix)const + { + return u_digit(c, static_cast< std::int8_t>(radix)); + } + locale_type imbue(locale_type l) + { + locale_type result(m_pimpl->getloc()); + m_pimpl = BOOST_REGEX_DETAIL_NS::get_icu_regex_traits_implementation(l); + return result; + } + locale_type getloc()const + { + return locale_type(); + } + std::string error_string(::boost::regex_constants::error_type n) const + { + return BOOST_REGEX_DETAIL_NS::get_default_error_string(n); + } +private: + icu_regex_traits(const icu_regex_traits&); + icu_regex_traits& operator=(const icu_regex_traits&); + + // + // define the bitmasks offsets we need for additional character properties: + // + enum{ + offset_blank = U_CHAR_CATEGORY_COUNT, + offset_space = U_CHAR_CATEGORY_COUNT+1, + offset_xdigit = U_CHAR_CATEGORY_COUNT+2, + offset_underscore = U_CHAR_CATEGORY_COUNT+3, + offset_unicode = U_CHAR_CATEGORY_COUNT+4, + offset_any = U_CHAR_CATEGORY_COUNT+5, + offset_ascii = U_CHAR_CATEGORY_COUNT+6, + offset_horizontal = U_CHAR_CATEGORY_COUNT+7, + offset_vertical = U_CHAR_CATEGORY_COUNT+8 + }; + + static char_class_type lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2) + { + constexpr char_class_type mask_blank = char_class_type(1) << offset_blank; + constexpr char_class_type mask_space = char_class_type(1) << offset_space; + constexpr char_class_type mask_xdigit = char_class_type(1) << offset_xdigit; + constexpr char_class_type mask_underscore = char_class_type(1) << offset_underscore; + constexpr char_class_type mask_unicode = char_class_type(1) << offset_unicode; + constexpr char_class_type mask_any = char_class_type(1) << offset_any; + constexpr char_class_type mask_ascii = char_class_type(1) << offset_ascii; + constexpr char_class_type mask_horizontal = char_class_type(1) << offset_horizontal; + constexpr char_class_type mask_vertical = char_class_type(1) << offset_vertical; + + static const ::UChar32 prop_name_table[] = { + /* any */ 'a', 'n', 'y', + /* ascii */ 'a', 's', 'c', 'i', 'i', + /* assigned */ 'a', 's', 's', 'i', 'g', 'n', 'e', 'd', + /* c* */ 'c', '*', + /* cc */ 'c', 'c', + /* cf */ 'c', 'f', + /* closepunctuation */ 'c', 'l', 'o', 's', 'e', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* cn */ 'c', 'n', + /* co */ 'c', 'o', + /* connectorpunctuation */ 'c', 'o', 'n', 'n', 'e', 'c', 't', 'o', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* control */ 'c', 'o', 'n', 't', 'r', 'o', 'l', + /* cs */ 'c', 's', + /* currencysymbol */ 'c', 'u', 'r', 'r', 'e', 'n', 'c', 'y', 's', 'y', 'm', 'b', 'o', 'l', + /* dashpunctuation */ 'd', 'a', 's', 'h', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* decimaldigitnumber */ 'd', 'e', 'c', 'i', 'm', 'a', 'l', 'd', 'i', 'g', 'i', 't', 'n', 'u', 'm', 'b', 'e', 'r', + /* enclosingmark */ 'e', 'n', 'c', 'l', 'o', 's', 'i', 'n', 'g', 'm', 'a', 'r', 'k', + /* finalpunctuation */ 'f', 'i', 'n', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* format */ 'f', 'o', 'r', 'm', 'a', 't', + /* initialpunctuation */ 'i', 'n', 'i', 't', 'i', 'a', 'l', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* l* */ 'l', '*', + /* letter */ 'l', 'e', 't', 't', 'e', 'r', + /* letternumber */ 'l', 'e', 't', 't', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r', + /* lineseparator */ 'l', 'i', 'n', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r', + /* ll */ 'l', 'l', + /* lm */ 'l', 'm', + /* lo */ 'l', 'o', + /* lowercaseletter */ 'l', 'o', 'w', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r', + /* lt */ 'l', 't', + /* lu */ 'l', 'u', + /* m* */ 'm', '*', + /* mark */ 'm', 'a', 'r', 'k', + /* mathsymbol */ 'm', 'a', 't', 'h', 's', 'y', 'm', 'b', 'o', 'l', + /* mc */ 'm', 'c', + /* me */ 'm', 'e', + /* mn */ 'm', 'n', + /* modifierletter */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r', + /* modifiersymbol */ 'm', 'o', 'd', 'i', 'f', 'i', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l', + /* n* */ 'n', '*', + /* nd */ 'n', 'd', + /* nl */ 'n', 'l', + /* no */ 'n', 'o', + /* nonspacingmark */ 'n', 'o', 'n', 's', 'p', 'a', 'c', 'i', 'n', 'g', 'm', 'a', 'r', 'k', + /* notassigned */ 'n', 'o', 't', 'a', 's', 's', 'i', 'g', 'n', 'e', 'd', + /* number */ 'n', 'u', 'm', 'b', 'e', 'r', + /* openpunctuation */ 'o', 'p', 'e', 'n', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* other */ 'o', 't', 'h', 'e', 'r', + /* otherletter */ 'o', 't', 'h', 'e', 'r', 'l', 'e', 't', 't', 'e', 'r', + /* othernumber */ 'o', 't', 'h', 'e', 'r', 'n', 'u', 'm', 'b', 'e', 'r', + /* otherpunctuation */ 'o', 't', 'h', 'e', 'r', 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* othersymbol */ 'o', 't', 'h', 'e', 'r', 's', 'y', 'm', 'b', 'o', 'l', + /* p* */ 'p', '*', + /* paragraphseparator */ 'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r', + /* pc */ 'p', 'c', + /* pd */ 'p', 'd', + /* pe */ 'p', 'e', + /* pf */ 'p', 'f', + /* pi */ 'p', 'i', + /* po */ 'p', 'o', + /* privateuse */ 'p', 'r', 'i', 'v', 'a', 't', 'e', 'u', 's', 'e', + /* ps */ 'p', 's', + /* punctuation */ 'p', 'u', 'n', 'c', 't', 'u', 'a', 't', 'i', 'o', 'n', + /* s* */ 's', '*', + /* sc */ 's', 'c', + /* separator */ 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r', + /* sk */ 's', 'k', + /* sm */ 's', 'm', + /* so */ 's', 'o', + /* spaceseparator */ 's', 'p', 'a', 'c', 'e', 's', 'e', 'p', 'a', 'r', 'a', 't', 'o', 'r', + /* spacingcombiningmark */ 's', 'p', 'a', 'c', 'i', 'n', 'g', 'c', 'o', 'm', 'b', 'i', 'n', 'i', 'n', 'g', 'm', 'a', 'r', 'k', + /* surrogate */ 's', 'u', 'r', 'r', 'o', 'g', 'a', 't', 'e', + /* symbol */ 's', 'y', 'm', 'b', 'o', 'l', + /* titlecase */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e', + /* titlecaseletter */ 't', 'i', 't', 'l', 'e', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r', + /* uppercaseletter */ 'u', 'p', 'p', 'e', 'r', 'c', 'a', 's', 'e', 'l', 'e', 't', 't', 'e', 'r', + /* z* */ 'z', '*', + /* zl */ 'z', 'l', + /* zp */ 'z', 'p', + /* zs */ 'z', 's', + }; + + static const BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32> range_data[] = { + { prop_name_table + 0, prop_name_table + 3, }, // any + { prop_name_table + 3, prop_name_table + 8, }, // ascii + { prop_name_table + 8, prop_name_table + 16, }, // assigned + { prop_name_table + 16, prop_name_table + 18, }, // c* + { prop_name_table + 18, prop_name_table + 20, }, // cc + { prop_name_table + 20, prop_name_table + 22, }, // cf + { prop_name_table + 22, prop_name_table + 38, }, // closepunctuation + { prop_name_table + 38, prop_name_table + 40, }, // cn + { prop_name_table + 40, prop_name_table + 42, }, // co + { prop_name_table + 42, prop_name_table + 62, }, // connectorpunctuation + { prop_name_table + 62, prop_name_table + 69, }, // control + { prop_name_table + 69, prop_name_table + 71, }, // cs + { prop_name_table + 71, prop_name_table + 85, }, // currencysymbol + { prop_name_table + 85, prop_name_table + 100, }, // dashpunctuation + { prop_name_table + 100, prop_name_table + 118, }, // decimaldigitnumber + { prop_name_table + 118, prop_name_table + 131, }, // enclosingmark + { prop_name_table + 131, prop_name_table + 147, }, // finalpunctuation + { prop_name_table + 147, prop_name_table + 153, }, // format + { prop_name_table + 153, prop_name_table + 171, }, // initialpunctuation + { prop_name_table + 171, prop_name_table + 173, }, // l* + { prop_name_table + 173, prop_name_table + 179, }, // letter + { prop_name_table + 179, prop_name_table + 191, }, // letternumber + { prop_name_table + 191, prop_name_table + 204, }, // lineseparator + { prop_name_table + 204, prop_name_table + 206, }, // ll + { prop_name_table + 206, prop_name_table + 208, }, // lm + { prop_name_table + 208, prop_name_table + 210, }, // lo + { prop_name_table + 210, prop_name_table + 225, }, // lowercaseletter + { prop_name_table + 225, prop_name_table + 227, }, // lt + { prop_name_table + 227, prop_name_table + 229, }, // lu + { prop_name_table + 229, prop_name_table + 231, }, // m* + { prop_name_table + 231, prop_name_table + 235, }, // mark + { prop_name_table + 235, prop_name_table + 245, }, // mathsymbol + { prop_name_table + 245, prop_name_table + 247, }, // mc + { prop_name_table + 247, prop_name_table + 249, }, // me + { prop_name_table + 249, prop_name_table + 251, }, // mn + { prop_name_table + 251, prop_name_table + 265, }, // modifierletter + { prop_name_table + 265, prop_name_table + 279, }, // modifiersymbol + { prop_name_table + 279, prop_name_table + 281, }, // n* + { prop_name_table + 281, prop_name_table + 283, }, // nd + { prop_name_table + 283, prop_name_table + 285, }, // nl + { prop_name_table + 285, prop_name_table + 287, }, // no + { prop_name_table + 287, prop_name_table + 301, }, // nonspacingmark + { prop_name_table + 301, prop_name_table + 312, }, // notassigned + { prop_name_table + 312, prop_name_table + 318, }, // number + { prop_name_table + 318, prop_name_table + 333, }, // openpunctuation + { prop_name_table + 333, prop_name_table + 338, }, // other + { prop_name_table + 338, prop_name_table + 349, }, // otherletter + { prop_name_table + 349, prop_name_table + 360, }, // othernumber + { prop_name_table + 360, prop_name_table + 376, }, // otherpunctuation + { prop_name_table + 376, prop_name_table + 387, }, // othersymbol + { prop_name_table + 387, prop_name_table + 389, }, // p* + { prop_name_table + 389, prop_name_table + 407, }, // paragraphseparator + { prop_name_table + 407, prop_name_table + 409, }, // pc + { prop_name_table + 409, prop_name_table + 411, }, // pd + { prop_name_table + 411, prop_name_table + 413, }, // pe + { prop_name_table + 413, prop_name_table + 415, }, // pf + { prop_name_table + 415, prop_name_table + 417, }, // pi + { prop_name_table + 417, prop_name_table + 419, }, // po + { prop_name_table + 419, prop_name_table + 429, }, // privateuse + { prop_name_table + 429, prop_name_table + 431, }, // ps + { prop_name_table + 431, prop_name_table + 442, }, // punctuation + { prop_name_table + 442, prop_name_table + 444, }, // s* + { prop_name_table + 444, prop_name_table + 446, }, // sc + { prop_name_table + 446, prop_name_table + 455, }, // separator + { prop_name_table + 455, prop_name_table + 457, }, // sk + { prop_name_table + 457, prop_name_table + 459, }, // sm + { prop_name_table + 459, prop_name_table + 461, }, // so + { prop_name_table + 461, prop_name_table + 475, }, // spaceseparator + { prop_name_table + 475, prop_name_table + 495, }, // spacingcombiningmark + { prop_name_table + 495, prop_name_table + 504, }, // surrogate + { prop_name_table + 504, prop_name_table + 510, }, // symbol + { prop_name_table + 510, prop_name_table + 519, }, // titlecase + { prop_name_table + 519, prop_name_table + 534, }, // titlecaseletter + { prop_name_table + 534, prop_name_table + 549, }, // uppercaseletter + { prop_name_table + 549, prop_name_table + 551, }, // z* + { prop_name_table + 551, prop_name_table + 553, }, // zl + { prop_name_table + 553, prop_name_table + 555, }, // zp + { prop_name_table + 555, prop_name_table + 557, }, // zs + }; + + static const icu_regex_traits::char_class_type icu_class_map[] = { + mask_any, // any + mask_ascii, // ascii + (0x3FFFFFFFu) & ~(U_GC_CN_MASK), // assigned + U_GC_C_MASK, // c* + U_GC_CC_MASK, // cc + U_GC_CF_MASK, // cf + U_GC_PE_MASK, // closepunctuation + U_GC_CN_MASK, // cn + U_GC_CO_MASK, // co + U_GC_PC_MASK, // connectorpunctuation + U_GC_CC_MASK, // control + U_GC_CS_MASK, // cs + U_GC_SC_MASK, // currencysymbol + U_GC_PD_MASK, // dashpunctuation + U_GC_ND_MASK, // decimaldigitnumber + U_GC_ME_MASK, // enclosingmark + U_GC_PF_MASK, // finalpunctuation + U_GC_CF_MASK, // format + U_GC_PI_MASK, // initialpunctuation + U_GC_L_MASK, // l* + U_GC_L_MASK, // letter + U_GC_NL_MASK, // letternumber + U_GC_ZL_MASK, // lineseparator + U_GC_LL_MASK, // ll + U_GC_LM_MASK, // lm + U_GC_LO_MASK, // lo + U_GC_LL_MASK, // lowercaseletter + U_GC_LT_MASK, // lt + U_GC_LU_MASK, // lu + U_GC_M_MASK, // m* + U_GC_M_MASK, // mark + U_GC_SM_MASK, // mathsymbol + U_GC_MC_MASK, // mc + U_GC_ME_MASK, // me + U_GC_MN_MASK, // mn + U_GC_LM_MASK, // modifierletter + U_GC_SK_MASK, // modifiersymbol + U_GC_N_MASK, // n* + U_GC_ND_MASK, // nd + U_GC_NL_MASK, // nl + U_GC_NO_MASK, // no + U_GC_MN_MASK, // nonspacingmark + U_GC_CN_MASK, // notassigned + U_GC_N_MASK, // number + U_GC_PS_MASK, // openpunctuation + U_GC_C_MASK, // other + U_GC_LO_MASK, // otherletter + U_GC_NO_MASK, // othernumber + U_GC_PO_MASK, // otherpunctuation + U_GC_SO_MASK, // othersymbol + U_GC_P_MASK, // p* + U_GC_ZP_MASK, // paragraphseparator + U_GC_PC_MASK, // pc + U_GC_PD_MASK, // pd + U_GC_PE_MASK, // pe + U_GC_PF_MASK, // pf + U_GC_PI_MASK, // pi + U_GC_PO_MASK, // po + U_GC_CO_MASK, // privateuse + U_GC_PS_MASK, // ps + U_GC_P_MASK, // punctuation + U_GC_S_MASK, // s* + U_GC_SC_MASK, // sc + U_GC_Z_MASK, // separator + U_GC_SK_MASK, // sk + U_GC_SM_MASK, // sm + U_GC_SO_MASK, // so + U_GC_ZS_MASK, // spaceseparator + U_GC_MC_MASK, // spacingcombiningmark + U_GC_CS_MASK, // surrogate + U_GC_S_MASK, // symbol + U_GC_LT_MASK, // titlecase + U_GC_LT_MASK, // titlecaseletter + U_GC_LU_MASK, // uppercaseletter + U_GC_Z_MASK, // z* + U_GC_ZL_MASK, // zl + U_GC_ZP_MASK, // zp + U_GC_ZS_MASK, // zs + }; + + + const BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32>* ranges_begin = range_data; + const BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32>* ranges_end = range_data + (sizeof(range_data) / sizeof(range_data[0])); + + BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32> t = { p1, p2, }; + const BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32>* p = std::lower_bound(ranges_begin, ranges_end, t); + if ((p != ranges_end) && (t == *p)) + return icu_class_map[p - ranges_begin]; + return 0; + } + std::shared_ptr< ::boost::BOOST_REGEX_DETAIL_NS::icu_regex_traits_implementation> m_pimpl; +}; + +} // namespace boost + +namespace boost{ + +// types: +typedef basic_regex< ::UChar32, icu_regex_traits> u32regex; +typedef match_results u32match; +typedef match_results u16match; + +// +// Construction of 32-bit regex types from UTF-8 and UTF-16 primitives: +// +namespace BOOST_REGEX_DETAIL_NS{ + +template +inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const std::integral_constant*) +{ + typedef boost::u8_to_u32_iterator conv_type; + return u32regex(conv_type(i, i, j), conv_type(j, i, j), opt); +} + +template +inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const std::integral_constant*) +{ + typedef boost::u16_to_u32_iterator conv_type; + return u32regex(conv_type(i, i, j), conv_type(j, i, j), opt); +} + +template +inline u32regex do_make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt, + const std::integral_constant*) +{ + return u32regex(i, j, opt); +} +} + +// BOOST_REGEX_UCHAR_IS_WCHAR_T +// +// Source inspection of unicode/umachine.h in ICU version 59 indicates that: +// +// On version 59, UChar is always char16_t in C++ mode (and uint16_t in C mode) +// +// On earlier versions, the logic is +// +// #if U_SIZEOF_WCHAR_T==2 +// typedef wchar_t OldUChar; +// #elif defined(__CHAR16_TYPE__) +// typedef __CHAR16_TYPE__ OldUChar; +// #else +// typedef uint16_t OldUChar; +// #endif +// +// That is, UChar is wchar_t only on versions below 59, when U_SIZEOF_WCHAR_T==2 +// +// Hence, + +#define BOOST_REGEX_UCHAR_IS_WCHAR_T (U_ICU_VERSION_MAJOR_NUM < 59 && U_SIZEOF_WCHAR_T == 2) + +#if BOOST_REGEX_UCHAR_IS_WCHAR_T + static_assert((std::is_same::value), "Configuration logic has failed!"); +#else + static_assert(!(std::is_same::value), "Configuration logic has failed!"); +#endif + +// +// Construction from an iterator pair: +// +template +inline u32regex make_u32regex(InputIterator i, + InputIterator j, + boost::regex_constants::syntax_option_type opt) +{ + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(i, j, opt, static_cast const*>(0)); +} +// +// construction from UTF-8 nul-terminated strings: +// +inline u32regex make_u32regex(const char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) +{ + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(p, p + std::strlen(p), opt, static_cast const*>(0)); +} +inline u32regex make_u32regex(const unsigned char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) +{ + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(p, p + std::strlen(reinterpret_cast(p)), opt, static_cast const*>(0)); +} +// +// construction from UTF-16 nul-terminated strings: +// +#ifndef BOOST_NO_WREGEX +inline u32regex make_u32regex(const wchar_t* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) +{ + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(p, p + std::wcslen(p), opt, static_cast const*>(0)); +} +#endif +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T +inline u32regex make_u32regex(const UChar* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) +{ + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(p, p + u_strlen(p), opt, static_cast const*>(0)); +} +#endif +// +// construction from basic_string class-template: +// +template +inline u32regex make_u32regex(const std::basic_string& s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) +{ + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(s.begin(), s.end(), opt, static_cast const*>(0)); +} +// +// Construction from ICU string type: +// +inline u32regex make_u32regex(const U_NAMESPACE_QUALIFIER UnicodeString& s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl) +{ + return BOOST_REGEX_DETAIL_NS::do_make_u32regex(s.getBuffer(), s.getBuffer() + s.length(), opt, static_cast const*>(0)); +} + +// +// regex_match overloads that widen the character type as appropriate: +// +namespace BOOST_REGEX_DETAIL_NS{ +template +void copy_results(MR1& out, MR2 const& in, NSubs named_subs) +{ + // copy results from an adapted MR2 match_results: + out.set_size(in.size(), in.prefix().first.base(), in.suffix().second.base()); + out.set_base(in.base().base()); + out.set_named_subs(named_subs); + for(int i = 0; i < (int)in.size(); ++i) + { + if(in[i].matched || !i) + { + out.set_first(in[i].first.base(), i); + out.set_second(in[i].second.base(), i, in[i].matched); + } + } +#ifdef BOOST_REGEX_MATCH_EXTRA + // Copy full capture info as well: + for(int i = 0; i < (int)in.size(); ++i) + { + if(in[i].captures().size()) + { + out[i].get_captures().assign(in[i].captures().size(), typename MR1::value_type()); + for(int j = 0; j < (int)out[i].captures().size(); ++j) + { + out[i].get_captures()[j].first = in[i].captures()[j].first.base(); + out[i].get_captures()[j].second = in[i].captures()[j].second.base(); + out[i].get_captures()[j].matched = in[i].captures()[j].matched; + } + } + } +#endif +} + +template +inline bool do_regex_match(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + std::integral_constant const*) +{ + return ::boost::regex_match(first, last, m, e, flags); +} +template +bool do_regex_match(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + std::integral_constant const*) +{ + typedef u16_to_u32_iterator conv_type; + typedef match_results match_type; + //typedef typename match_type::allocator_type alloc_type; + match_type what; + bool result = ::boost::regex_match(conv_type(first, first, last), conv_type(last, first, last), what, e, flags); + // copy results across to m: + if(result) copy_results(m, what, e.get_named_subs()); + return result; +} +template +bool do_regex_match(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + std::integral_constant const*) +{ + typedef u8_to_u32_iterator conv_type; + typedef match_results match_type; + //typedef typename match_type::allocator_type alloc_type; + match_type what; + bool result = ::boost::regex_match(conv_type(first, first, last), conv_type(last, first, last), what, e, flags); + // copy results across to m: + if(result) copy_results(m, what, e.get_named_subs()); + return result; +} +} // namespace BOOST_REGEX_DETAIL_NS + +template +inline bool u32regex_match(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_match(first, last, m, e, flags, static_cast const*>(0)); +} +inline bool u32regex_match(const UChar* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p+u_strlen(p), m, e, flags, static_cast const*>(0)); +} +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T && !defined(BOOST_NO_WREGEX) +inline bool u32regex_match(const wchar_t* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p+std::wcslen(p), m, e, flags, static_cast const*>(0)); +} +#endif +inline bool u32regex_match(const char* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p+std::strlen(p), m, e, flags, static_cast const*>(0)); +} +inline bool u32regex_match(const unsigned char* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p+std::strlen((const char*)p), m, e, flags, static_cast const*>(0)); +} +inline bool u32regex_match(const std::string& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.begin(), s.end(), m, e, flags, static_cast const*>(0)); +} +#ifndef BOOST_NO_STD_WSTRING +inline bool u32regex_match(const std::wstring& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.begin(), s.end(), m, e, flags, static_cast const*>(0)); +} +#endif +inline bool u32regex_match(const U_NAMESPACE_QUALIFIER UnicodeString& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.getBuffer(), s.getBuffer() + s.length(), m, e, flags, static_cast const*>(0)); +} +// +// regex_match overloads that do not return what matched: +// +template +inline bool u32regex_match(BidiIterator first, BidiIterator last, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(first, last, m, e, flags, static_cast const*>(0)); +} +inline bool u32regex_match(const UChar* p, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p+u_strlen(p), m, e, flags, static_cast const*>(0)); +} +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T && !defined(BOOST_NO_WREGEX) +inline bool u32regex_match(const wchar_t* p, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p+std::wcslen(p), m, e, flags, static_cast const*>(0)); +} +#endif +inline bool u32regex_match(const char* p, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p+std::strlen(p), m, e, flags, static_cast const*>(0)); +} +inline bool u32regex_match(const unsigned char* p, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(p, p+std::strlen((const char*)p), m, e, flags, static_cast const*>(0)); +} +inline bool u32regex_match(const std::string& s, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.begin(), s.end(), m, e, flags, static_cast const*>(0)); +} +#ifndef BOOST_NO_STD_WSTRING +inline bool u32regex_match(const std::wstring& s, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.begin(), s.end(), m, e, flags, static_cast const*>(0)); +} +#endif +inline bool u32regex_match(const U_NAMESPACE_QUALIFIER UnicodeString& s, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_match(s.getBuffer(), s.getBuffer() + s.length(), m, e, flags, static_cast const*>(0)); +} + +// +// regex_search overloads that widen the character type as appropriate: +// +namespace BOOST_REGEX_DETAIL_NS{ +template +inline bool do_regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + BidiIterator base, + std::integral_constant const*) +{ + return ::boost::regex_search(first, last, m, e, flags, base); +} +template +bool do_regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + BidiIterator base, + std::integral_constant const*) +{ + typedef u16_to_u32_iterator conv_type; + typedef match_results match_type; + //typedef typename match_type::allocator_type alloc_type; + match_type what; + bool result = ::boost::regex_search(conv_type(first, first, last), conv_type(last, first, last), what, e, flags, conv_type(base)); + // copy results across to m: + if(result) copy_results(m, what, e.get_named_subs()); + return result; +} +template +bool do_regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + BidiIterator base, + std::integral_constant const*) +{ + typedef u8_to_u32_iterator conv_type; + typedef match_results match_type; + //typedef typename match_type::allocator_type alloc_type; + match_type what; + bool result = ::boost::regex_search(conv_type(first, first, last), conv_type(last, first, last), what, e, flags, conv_type(base)); + // copy results across to m: + if(result) copy_results(m, what, e.get_named_subs()); + return result; +} +} + +template +inline bool u32regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(first, last, m, e, flags, first, static_cast const*>(0)); +} +template +inline bool u32regex_search(BidiIterator first, BidiIterator last, + match_results& m, + const u32regex& e, + match_flag_type flags, + BidiIterator base) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(first, last, m, e, flags, base, static_cast const*>(0)); +} +inline bool u32regex_search(const UChar* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p+u_strlen(p), m, e, flags, p, static_cast const*>(0)); +} +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T && !defined(BOOST_NO_WREGEX) +inline bool u32regex_search(const wchar_t* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p+std::wcslen(p), m, e, flags, p, static_cast const*>(0)); +} +#endif +inline bool u32regex_search(const char* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p+std::strlen(p), m, e, flags, p, static_cast const*>(0)); +} +inline bool u32regex_search(const unsigned char* p, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p+std::strlen((const char*)p), m, e, flags, p, static_cast const*>(0)); +} +inline bool u32regex_search(const std::string& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.begin(), s.end(), m, e, flags, s.begin(), static_cast const*>(0)); +} +#ifndef BOOST_NO_STD_WSTRING +inline bool u32regex_search(const std::wstring& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.begin(), s.end(), m, e, flags, s.begin(), static_cast const*>(0)); +} +#endif +inline bool u32regex_search(const U_NAMESPACE_QUALIFIER UnicodeString& s, + match_results& m, + const u32regex& e, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.getBuffer(), s.getBuffer() + s.length(), m, e, flags, s.getBuffer(), static_cast const*>(0)); +} +template +inline bool u32regex_search(BidiIterator first, BidiIterator last, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(first, last, m, e, flags, first, static_cast const*>(0)); +} +inline bool u32regex_search(const UChar* p, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p+u_strlen(p), m, e, flags, p, static_cast const*>(0)); +} +#if !BOOST_REGEX_UCHAR_IS_WCHAR_T && !defined(BOOST_NO_WREGEX) +inline bool u32regex_search(const wchar_t* p, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p+std::wcslen(p), m, e, flags, p, static_cast const*>(0)); +} +#endif +inline bool u32regex_search(const char* p, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p+std::strlen(p), m, e, flags, p, static_cast const*>(0)); +} +inline bool u32regex_search(const unsigned char* p, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(p, p+std::strlen((const char*)p), m, e, flags, p, static_cast const*>(0)); +} +inline bool u32regex_search(const std::string& s, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.begin(), s.end(), m, e, flags, s.begin(), static_cast const*>(0)); +} +#ifndef BOOST_NO_STD_WSTRING +inline bool u32regex_search(const std::wstring& s, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.begin(), s.end(), m, e, flags, s.begin(), static_cast const*>(0)); +} +#endif +inline bool u32regex_search(const U_NAMESPACE_QUALIFIER UnicodeString& s, + const u32regex& e, + match_flag_type flags = match_default) +{ + match_results m; + return BOOST_REGEX_DETAIL_NS::do_regex_search(s.getBuffer(), s.getBuffer() + s.length(), m, e, flags, s.getBuffer(), static_cast const*>(0)); +} + +// +// overloads for regex_replace with utf-8 and utf-16 data types: +// +namespace BOOST_REGEX_DETAIL_NS{ +template +inline std::pair< boost::u8_to_u32_iterator, boost::u8_to_u32_iterator > + make_utf32_seq(I i, I j, std::integral_constant const*) +{ + return std::pair< boost::u8_to_u32_iterator, boost::u8_to_u32_iterator >(boost::u8_to_u32_iterator(i, i, j), boost::u8_to_u32_iterator(j, i, j)); +} +template +inline std::pair< boost::u16_to_u32_iterator, boost::u16_to_u32_iterator > + make_utf32_seq(I i, I j, std::integral_constant const*) +{ + return std::pair< boost::u16_to_u32_iterator, boost::u16_to_u32_iterator >(boost::u16_to_u32_iterator(i, i, j), boost::u16_to_u32_iterator(j, i, j)); +} +template +inline std::pair< I, I > + make_utf32_seq(I i, I j, std::integral_constant const*) +{ + return std::pair< I, I >(i, j); +} +template +inline std::pair< boost::u8_to_u32_iterator, boost::u8_to_u32_iterator > + make_utf32_seq(const charT* p, std::integral_constant const*) +{ + std::size_t len = std::strlen((const char*)p); + return std::pair< boost::u8_to_u32_iterator, boost::u8_to_u32_iterator >(boost::u8_to_u32_iterator(p, p, p+len), boost::u8_to_u32_iterator(p+len, p, p+len)); +} +template +inline std::pair< boost::u16_to_u32_iterator, boost::u16_to_u32_iterator > + make_utf32_seq(const charT* p, std::integral_constant const*) +{ + std::size_t len = u_strlen((const UChar*)p); + return std::pair< boost::u16_to_u32_iterator, boost::u16_to_u32_iterator >(boost::u16_to_u32_iterator(p, p, p + len), boost::u16_to_u32_iterator(p+len, p, p + len)); +} +template +inline std::pair< const charT*, const charT* > + make_utf32_seq(const charT* p, std::integral_constant const*) +{ + return std::pair< const charT*, const charT* >(p, p+icu_regex_traits::length((UChar32 const*)p)); +} +template +inline OutputIterator make_utf32_out(OutputIterator o, std::integral_constant const*) +{ + return o; +} +template +inline utf16_output_iterator make_utf32_out(OutputIterator o, std::integral_constant const*) +{ + return o; +} +template +inline utf8_output_iterator make_utf32_out(OutputIterator o, std::integral_constant const*) +{ + return o; +} + +template +OutputIterator do_regex_replace(OutputIterator out, + std::pair const& in, + const u32regex& e, + const std::pair& fmt, + match_flag_type flags + ) +{ + // unfortunately we have to copy the format string in order to pass in onward: + std::vector f; + f.assign(fmt.first, fmt.second); + + regex_iterator i(in.first, in.second, e, flags); + regex_iterator j; + if(i == j) + { + if(!(flags & regex_constants::format_no_copy)) + out = std::copy(in.first, in.second, out); + } + else + { + I1 last_m = in.first; + while(i != j) + { + if(!(flags & regex_constants::format_no_copy)) + out = std::copy(i->prefix().first, i->prefix().second, out); + if(!f.empty()) + out = ::boost::BOOST_REGEX_DETAIL_NS::regex_format_imp(out, *i, &*f.begin(), &*f.begin() + f.size(), flags, e.get_traits()); + else + out = ::boost::BOOST_REGEX_DETAIL_NS::regex_format_imp(out, *i, static_cast(0), static_cast(0), flags, e.get_traits()); + last_m = (*i)[0].second; + if(flags & regex_constants::format_first_only) + break; + ++i; + } + if(!(flags & regex_constants::format_no_copy)) + out = std::copy(last_m, in.second, out); + } + return out; +} +template +inline const BaseIterator& extract_output_base(const BaseIterator& b) +{ + return b; +} +template +inline BaseIterator extract_output_base(const utf8_output_iterator& b) +{ + return b.base(); +} +template +inline BaseIterator extract_output_base(const utf16_output_iterator& b) +{ + return b.base(); +} +} // BOOST_REGEX_DETAIL_NS + +template +inline OutputIterator u32regex_replace(OutputIterator out, + BidirectionalIterator first, + BidirectionalIterator last, + const u32regex& e, + const charT* fmt, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::extract_output_base + ( + BOOST_REGEX_DETAIL_NS::do_regex_replace( + BOOST_REGEX_DETAIL_NS::make_utf32_out(out, static_cast const*>(0)), + BOOST_REGEX_DETAIL_NS::make_utf32_seq(first, last, static_cast const*>(0)), + e, + BOOST_REGEX_DETAIL_NS::make_utf32_seq(fmt, static_cast const*>(0)), + flags) + ); +} + +template +inline OutputIterator u32regex_replace(OutputIterator out, + Iterator first, + Iterator last, + const u32regex& e, + const std::basic_string& fmt, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::extract_output_base + ( + BOOST_REGEX_DETAIL_NS::do_regex_replace( + BOOST_REGEX_DETAIL_NS::make_utf32_out(out, static_cast const*>(0)), + BOOST_REGEX_DETAIL_NS::make_utf32_seq(first, last, static_cast const*>(0)), + e, + BOOST_REGEX_DETAIL_NS::make_utf32_seq(fmt.begin(), fmt.end(), static_cast const*>(0)), + flags) + ); +} + +template +inline OutputIterator u32regex_replace(OutputIterator out, + Iterator first, + Iterator last, + const u32regex& e, + const U_NAMESPACE_QUALIFIER UnicodeString& fmt, + match_flag_type flags = match_default) +{ + return BOOST_REGEX_DETAIL_NS::extract_output_base + ( + BOOST_REGEX_DETAIL_NS::do_regex_replace( + BOOST_REGEX_DETAIL_NS::make_utf32_out(out, static_cast const*>(0)), + BOOST_REGEX_DETAIL_NS::make_utf32_seq(first, last, static_cast const*>(0)), + e, + BOOST_REGEX_DETAIL_NS::make_utf32_seq(fmt.getBuffer(), fmt.getBuffer() + fmt.length(), static_cast const*>(0)), + flags) + ); +} + +template +std::basic_string u32regex_replace(const std::basic_string& s, + const u32regex& e, + const charT* fmt, + match_flag_type flags = match_default) +{ + std::basic_string result; + BOOST_REGEX_DETAIL_NS::string_out_iterator > i(result); + u32regex_replace(i, s.begin(), s.end(), e, fmt, flags); + return result; +} + +template +std::basic_string u32regex_replace(const std::basic_string& s, + const u32regex& e, + const std::basic_string& fmt, + match_flag_type flags = match_default) +{ + std::basic_string result; + BOOST_REGEX_DETAIL_NS::string_out_iterator > i(result); + u32regex_replace(i, s.begin(), s.end(), e, fmt.c_str(), flags); + return result; +} + +namespace BOOST_REGEX_DETAIL_NS{ + +class unicode_string_out_iterator +{ + U_NAMESPACE_QUALIFIER UnicodeString* out; +public: + unicode_string_out_iterator(U_NAMESPACE_QUALIFIER UnicodeString& s) : out(&s) {} + unicode_string_out_iterator& operator++() { return *this; } + unicode_string_out_iterator& operator++(int) { return *this; } + unicode_string_out_iterator& operator*() { return *this; } + unicode_string_out_iterator& operator=(UChar v) + { + *out += v; + return *this; + } + typedef std::ptrdiff_t difference_type; + typedef UChar value_type; + typedef value_type* pointer; + typedef value_type& reference; + typedef std::output_iterator_tag iterator_category; +}; + +} + +inline U_NAMESPACE_QUALIFIER UnicodeString u32regex_replace(const U_NAMESPACE_QUALIFIER UnicodeString& s, + const u32regex& e, + const UChar* fmt, + match_flag_type flags = match_default) +{ + U_NAMESPACE_QUALIFIER UnicodeString result; + BOOST_REGEX_DETAIL_NS::unicode_string_out_iterator i(result); + u32regex_replace(i, s.getBuffer(), s.getBuffer()+s.length(), e, fmt, flags); + return result; +} + +inline U_NAMESPACE_QUALIFIER UnicodeString u32regex_replace(const U_NAMESPACE_QUALIFIER UnicodeString& s, + const u32regex& e, + const U_NAMESPACE_QUALIFIER UnicodeString& fmt, + match_flag_type flags = match_default) +{ + U_NAMESPACE_QUALIFIER UnicodeString result; + BOOST_REGEX_DETAIL_NS::unicode_string_out_iterator i(result); + BOOST_REGEX_DETAIL_NS::do_regex_replace( + BOOST_REGEX_DETAIL_NS::make_utf32_out(i, static_cast const*>(0)), + BOOST_REGEX_DETAIL_NS::make_utf32_seq(s.getBuffer(), s.getBuffer()+s.length(), static_cast const*>(0)), + e, + BOOST_REGEX_DETAIL_NS::make_utf32_seq(fmt.getBuffer(), fmt.getBuffer() + fmt.length(), static_cast const*>(0)), + flags); + return result; +} + +} // namespace boost. + +#ifdef BOOST_MSVC +#pragma warning (pop) +#endif + +#include +#include + +#endif diff --git a/include/boost/regex/v5/object_cache.hpp b/include/boost/regex/v5/object_cache.hpp new file mode 100644 index 00000000..17ff616b --- /dev/null +++ b/include/boost/regex/v5/object_cache.hpp @@ -0,0 +1,160 @@ +/* + * + * Copyright (c) 2004 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE object_cache.hpp + * VERSION see + * DESCRIPTION: Implements a generic object cache. + */ + +#ifndef BOOST_REGEX_OBJECT_CACHE_HPP +#define BOOST_REGEX_OBJECT_CACHE_HPP + +#include +#include +#include +#include +#include +#include +#ifdef BOOST_HAS_THREADS +#include +#endif + +namespace boost{ + +template +class object_cache +{ +public: + typedef std::pair< ::std::shared_ptr, Key const*> value_type; + typedef std::list list_type; + typedef typename list_type::iterator list_iterator; + typedef std::map map_type; + typedef typename map_type::iterator map_iterator; + typedef typename list_type::size_type size_type; + static std::shared_ptr get(const Key& k, size_type l_max_cache_size); + +private: + static std::shared_ptr do_get(const Key& k, size_type l_max_cache_size); + + struct data + { + list_type cont; + map_type index; + }; + + // Needed by compilers not implementing the resolution to DR45. For reference, + // see http://www.open-std.org/JTC1/SC22/WG21/docs/cwg_defects.html#45. + friend struct data; +}; + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable: 4702) +#endif +template +std::shared_ptr object_cache::get(const Key& k, size_type l_max_cache_size) +{ +#ifdef BOOST_HAS_THREADS + static std::mutex mut; + std::lock_guard l(mut); + return do_get(k, l_max_cache_size); +#else + return do_get(k, l_max_cache_size); +#endif +} +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +template +std::shared_ptr object_cache::do_get(const Key& k, size_type l_max_cache_size) +{ + typedef typename object_cache::data object_data; + typedef typename map_type::size_type map_size_type; + static object_data s_data; + + // + // see if the object is already in the cache: + // + map_iterator mpos = s_data.index.find(k); + if(mpos != s_data.index.end()) + { + // + // Eureka! + // We have a cached item, bump it up the list and return it: + // + if(--(s_data.cont.end()) != mpos->second) + { + // splice out the item we want to move: + list_type temp; + temp.splice(temp.end(), s_data.cont, mpos->second); + // and now place it at the end of the list: + s_data.cont.splice(s_data.cont.end(), temp, temp.begin()); + BOOST_REGEX_ASSERT(*(s_data.cont.back().second) == k); + // update index with new position: + mpos->second = --(s_data.cont.end()); + BOOST_REGEX_ASSERT(&(mpos->first) == mpos->second->second); + BOOST_REGEX_ASSERT(&(mpos->first) == s_data.cont.back().second); + } + return s_data.cont.back().first; + } + // + // if we get here then the item is not in the cache, + // so create it: + // + std::shared_ptr result(new Object(k)); + // + // Add it to the list, and index it: + // + s_data.cont.push_back(value_type(result, static_cast(0))); + s_data.index.insert(std::make_pair(k, --(s_data.cont.end()))); + s_data.cont.back().second = &(s_data.index.find(k)->first); + map_size_type s = s_data.index.size(); + BOOST_REGEX_ASSERT(s_data.index[k]->first.get() == result.get()); + BOOST_REGEX_ASSERT(&(s_data.index.find(k)->first) == s_data.cont.back().second); + BOOST_REGEX_ASSERT(s_data.index.find(k)->first == k); + if(s > l_max_cache_size) + { + // + // We have too many items in the list, so we need to start + // popping them off the back of the list, but only if they're + // being held uniquely by us: + // + list_iterator pos = s_data.cont.begin(); + list_iterator last = s_data.cont.end(); + while((pos != last) && (s > l_max_cache_size)) + { + if(pos->first.unique()) + { + list_iterator condemmed(pos); + ++pos; + // now remove the items from our containers, + // then order has to be as follows: + BOOST_REGEX_ASSERT(s_data.index.find(*(condemmed->second)) != s_data.index.end()); + s_data.index.erase(*(condemmed->second)); + s_data.cont.erase(condemmed); + --s; + } + else + ++pos; + } + BOOST_REGEX_ASSERT(s_data.index[k]->first.get() == result.get()); + BOOST_REGEX_ASSERT(&(s_data.index.find(k)->first) == s_data.cont.back().second); + BOOST_REGEX_ASSERT(s_data.index.find(k)->first == k); + } + return result; +} + +} + +#endif diff --git a/include/boost/regex/v5/pattern_except.hpp b/include/boost/regex/v5/pattern_except.hpp new file mode 100644 index 00000000..93c9db19 --- /dev/null +++ b/include/boost/regex/v5/pattern_except.hpp @@ -0,0 +1,127 @@ +/* + * + * Copyright (c) 1998-2002 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE pattern_except.hpp + * VERSION see + * DESCRIPTION: Declares pattern-matching exception classes. + */ + +#ifndef BOOST_RE_V5_PAT_EXCEPT_HPP +#define BOOST_RE_V5_PAT_EXCEPT_HPP + +#ifndef BOOST_REGEX_CONFIG_HPP +#include +#endif + +#include +#include +#include +#include + +namespace boost{ + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable: 4103) +#endif +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_PREFIX +#endif +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable : 4275) +#if BOOST_MSVC >= 1800 +#pragma warning(disable : 26812) +#endif +#endif +class regex_error : public std::runtime_error +{ +public: + explicit regex_error(const std::string& s, regex_constants::error_type err = regex_constants::error_unknown, std::ptrdiff_t pos = 0) + : std::runtime_error(s) + , m_error_code(err) + , m_position(pos) + { + } + explicit regex_error(regex_constants::error_type err) + : std::runtime_error(::boost::BOOST_REGEX_DETAIL_NS::get_default_error_string(err)) + , m_error_code(err) + , m_position(0) + { + } + ~regex_error() noexcept override {} + regex_constants::error_type code()const + { return m_error_code; } + std::ptrdiff_t position()const + { return m_position; } + void raise()const + { +#ifndef BOOST_NO_EXCEPTIONS +#ifndef BOOST_REGEX_STANDALONE + ::boost::throw_exception(*this); +#else + throw* this; +#endif +#endif + } +private: + regex_constants::error_type m_error_code; + std::ptrdiff_t m_position; +}; + +typedef regex_error bad_pattern; +typedef regex_error bad_expression; + +namespace BOOST_REGEX_DETAIL_NS{ + +inline void BOOST_REGEX_CALL raise_runtime_error(const std::runtime_error& ex) +{ +#ifndef BOOST_REGEX_STANDALONE + ::boost::throw_exception(ex); +#else + throw ex; +#endif +} + +template +void raise_error(const traits& t, regex_constants::error_type code) +{ + (void)t; // warning suppression + std::runtime_error e(t.error_string(code)); + ::boost::BOOST_REGEX_DETAIL_NS::raise_runtime_error(e); +} + +} + +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable: 4103) +#endif +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_SUFFIX +#endif +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + +} // namespace boost + +#endif diff --git a/include/boost/regex/v5/unicode_iterator.hpp b/include/boost/regex/v5/unicode_iterator.hpp new file mode 100644 index 00000000..1a6b629c --- /dev/null +++ b/include/boost/regex/v5/unicode_iterator.hpp @@ -0,0 +1,864 @@ +/* + * + * Copyright (c) 2004 + * John Maddock + * + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + */ + + /* + * LOCATION: see http://www.boost.org for most recent version. + * FILE unicode_iterator.hpp + * VERSION see + * DESCRIPTION: Iterator adapters for converting between different Unicode encodings. + */ + +/**************************************************************************** + +Contents: +~~~~~~~~~ + +1) Read Only, Input Adapters: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +template +class u32_to_u8_iterator; + +Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8. + +template +class u8_to_u32_iterator; + +Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32. + +template +class u32_to_u16_iterator; + +Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16. + +template +class u16_to_u32_iterator; + +Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32. + +2) Single pass output iterator adapters: + +template +class utf8_output_iterator; + +Accepts UTF-32 code points and forwards them on as UTF-8 code points. + +template +class utf16_output_iterator; + +Accepts UTF-32 code points and forwards them on as UTF-16 code points. + +****************************************************************************/ + +#ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP +#define BOOST_REGEX_UNICODE_ITERATOR_HPP +#include +#include +#include +#include +#include +#include // CHAR_BIT + +#include + +#ifndef BOOST_REGEX_STANDALONE +#include +#endif + +namespace boost{ + +namespace detail{ + +static const std::uint16_t high_surrogate_base = 0xD7C0u; +static const std::uint16_t low_surrogate_base = 0xDC00u; +static const std::uint32_t ten_bit_mask = 0x3FFu; + +inline bool is_high_surrogate(std::uint16_t v) +{ + return (v & 0xFFFFFC00u) == 0xd800u; +} +inline bool is_low_surrogate(std::uint16_t v) +{ + return (v & 0xFFFFFC00u) == 0xdc00u; +} +template +inline bool is_surrogate(T v) +{ + return (v & 0xFFFFF800u) == 0xd800; +} + +inline unsigned utf8_byte_count(std::uint8_t c) +{ + // if the most significant bit with a zero in it is in position + // 8-N then there are N bytes in this UTF-8 sequence: + std::uint8_t mask = 0x80u; + unsigned result = 0; + while(c & mask) + { + ++result; + mask >>= 1; + } + return (result == 0) ? 1 : ((result > 4) ? 4 : result); +} + +inline unsigned utf8_trailing_byte_count(std::uint8_t c) +{ + return utf8_byte_count(c) - 1; +} + +#ifdef BOOST_MSVC +#pragma warning(push) +#pragma warning(disable:4100) +#endif +#ifndef BOOST_NO_EXCEPTIONS +BOOST_NORETURN +#endif +inline void invalid_utf32_code_point(std::uint32_t val) +{ + std::stringstream ss; + ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence"; + std::out_of_range e(ss.str()); +#ifndef BOOST_REGEX_STANDALONE + boost::throw_exception(e); +#else + throw e; +#endif +} +#ifdef BOOST_MSVC +#pragma warning(pop) +#endif + + +} // namespace detail + +template +class u32_to_u16_iterator +{ + typedef typename std::iterator_traits::value_type base_value_type; + + static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument"); + static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument"); + +public: + typedef std::ptrdiff_t difference_type; + typedef U16Type value_type; + typedef value_type const* pointer; + typedef value_type const reference; + typedef std::bidirectional_iterator_tag iterator_category; + + reference operator*()const + { + if(m_current == 2) + extract_current(); + return m_values[m_current]; + } + bool operator==(const u32_to_u16_iterator& that)const + { + if(m_position == that.m_position) + { + // Both m_currents must be equal, or both even + // this is the same as saying their sum must be even: + return (m_current + that.m_current) & 1u ? false : true; + } + return false; + } + bool operator!=(const u32_to_u16_iterator& that)const + { + return !(*this == that); + } + u32_to_u16_iterator& operator++() + { + // if we have a pending read then read now, so that we know whether + // to skip a position, or move to a low-surrogate: + if(m_current == 2) + { + // pending read: + extract_current(); + } + // move to the next surrogate position: + ++m_current; + // if we've reached the end skip a position: + if(m_values[m_current] == 0) + { + m_current = 2; + ++m_position; + } + return *this; + } + u32_to_u16_iterator operator++(int) + { + u32_to_u16_iterator r(*this); + ++(*this); + return r; + } + u32_to_u16_iterator& operator--() + { + if(m_current != 1) + { + // decrementing an iterator always leads to a valid position: + --m_position; + extract_current(); + m_current = m_values[1] ? 1 : 0; + } + else + { + m_current = 0; + } + return *this; + } + u32_to_u16_iterator operator--(int) + { + u32_to_u16_iterator r(*this); + --(*this); + return r; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u32_to_u16_iterator() : m_position(), m_current(0) + { + m_values[0] = 0; + m_values[1] = 0; + m_values[2] = 0; + } + u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2) + { + m_values[0] = 0; + m_values[1] = 0; + m_values[2] = 0; + } +private: + + void extract_current()const + { + // begin by checking for a code point out of range: + std::uint32_t v = *m_position; + if(v >= 0x10000u) + { + if(v > 0x10FFFFu) + detail::invalid_utf32_code_point(*m_position); + // split into two surrogates: + m_values[0] = static_cast(v >> 10) + detail::high_surrogate_base; + m_values[1] = static_cast(v & detail::ten_bit_mask) + detail::low_surrogate_base; + m_current = 0; + BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0])); + BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1])); + } + else + { + // 16-bit code point: + m_values[0] = static_cast(*m_position); + m_values[1] = 0; + m_current = 0; + // value must not be a surrogate: + if(detail::is_surrogate(m_values[0])) + detail::invalid_utf32_code_point(*m_position); + } + } + BaseIterator m_position; + mutable U16Type m_values[3]; + mutable unsigned m_current; +}; + +template +class u16_to_u32_iterator +{ + // special values for pending iterator reads: + static const U32Type pending_read = 0xffffffffu; + + typedef typename std::iterator_traits::value_type base_value_type; + + static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument"); + static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument"); + +public: + typedef std::ptrdiff_t difference_type; + typedef U32Type value_type; + typedef value_type const* pointer; + typedef value_type const reference; + typedef std::bidirectional_iterator_tag iterator_category; + + reference operator*()const + { + if(m_value == pending_read) + extract_current(); + return m_value; + } + bool operator==(const u16_to_u32_iterator& that)const + { + return m_position == that.m_position; + } + bool operator!=(const u16_to_u32_iterator& that)const + { + return !(*this == that); + } + u16_to_u32_iterator& operator++() + { + // skip high surrogate first if there is one: + if(detail::is_high_surrogate(*m_position)) ++m_position; + ++m_position; + m_value = pending_read; + return *this; + } + u16_to_u32_iterator operator++(int) + { + u16_to_u32_iterator r(*this); + ++(*this); + return r; + } + u16_to_u32_iterator& operator--() + { + --m_position; + // if we have a low surrogate then go back one more: + if(detail::is_low_surrogate(*m_position)) + --m_position; + m_value = pending_read; + return *this; + } + u16_to_u32_iterator operator--(int) + { + u16_to_u32_iterator r(*this); + --(*this); + return r; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u16_to_u32_iterator() : m_position() + { + m_value = pending_read; + } + u16_to_u32_iterator(BaseIterator b) : m_position(b) + { + m_value = pending_read; + } + // + // Range checked version: + // + u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b) + { + m_value = pending_read; + // + // The range must not start with a low surrogate, or end in a high surrogate, + // otherwise we run the risk of running outside the underlying input range. + // Likewise b must not be located at a low surrogate. + // + std::uint16_t val; + if(start != end) + { + if((b != start) && (b != end)) + { + val = *b; + if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u)) + invalid_code_point(val); + } + val = *start; + if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u)) + invalid_code_point(val); + val = *--end; + if(detail::is_high_surrogate(val)) + invalid_code_point(val); + } + } +private: + static void invalid_code_point(std::uint16_t val) + { + std::stringstream ss; + ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence"; + std::out_of_range e(ss.str()); +#ifndef BOOST_REGEX_STANDALONE + boost::throw_exception(e); +#else + throw e; +#endif + } + void extract_current()const + { + m_value = static_cast(static_cast< std::uint16_t>(*m_position)); + // if the last value is a high surrogate then adjust m_position and m_value as needed: + if(detail::is_high_surrogate(*m_position)) + { + // precondition; next value must have be a low-surrogate: + BaseIterator next(m_position); + std::uint16_t t = *++next; + if((t & 0xFC00u) != 0xDC00u) + invalid_code_point(t); + m_value = (m_value - detail::high_surrogate_base) << 10; + m_value |= (static_cast(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask); + } + // postcondition; result must not be a surrogate: + if(detail::is_surrogate(m_value)) + invalid_code_point(static_cast< std::uint16_t>(m_value)); + } + BaseIterator m_position; + mutable U32Type m_value; +}; + +template +class u32_to_u8_iterator +{ + typedef typename std::iterator_traits::value_type base_value_type; + + static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument"); + static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument"); + +public: + typedef std::ptrdiff_t difference_type; + typedef U8Type value_type; + typedef value_type const* pointer; + typedef value_type const reference; + typedef std::bidirectional_iterator_tag iterator_category; + + reference operator*()const + { + if(m_current == 4) + extract_current(); + return m_values[m_current]; + } + bool operator==(const u32_to_u8_iterator& that)const + { + if(m_position == that.m_position) + { + // either the m_current's must be equal, or one must be 0 and + // the other 4: which means neither must have bits 1 or 2 set: + return (m_current == that.m_current) + || (((m_current | that.m_current) & 3) == 0); + } + return false; + } + bool operator!=(const u32_to_u8_iterator& that)const + { + return !(*this == that); + } + u32_to_u8_iterator& operator++() + { + // if we have a pending read then read now, so that we know whether + // to skip a position, or move to a low-surrogate: + if(m_current == 4) + { + // pending read: + extract_current(); + } + // move to the next surrogate position: + ++m_current; + // if we've reached the end skip a position: + if(m_values[m_current] == 0) + { + m_current = 4; + ++m_position; + } + return *this; + } + u32_to_u8_iterator operator++(int) + { + u32_to_u8_iterator r(*this); + ++(*this); + return r; + } + u32_to_u8_iterator& operator--() + { + if((m_current & 3) == 0) + { + --m_position; + extract_current(); + m_current = 3; + while(m_current && (m_values[m_current] == 0)) + --m_current; + } + else + --m_current; + return *this; + } + u32_to_u8_iterator operator--(int) + { + u32_to_u8_iterator r(*this); + --(*this); + return r; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u32_to_u8_iterator() : m_position(), m_current(0) + { + m_values[0] = 0; + m_values[1] = 0; + m_values[2] = 0; + m_values[3] = 0; + m_values[4] = 0; + } + u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4) + { + m_values[0] = 0; + m_values[1] = 0; + m_values[2] = 0; + m_values[3] = 0; + m_values[4] = 0; + } +private: + + void extract_current()const + { + std::uint32_t c = *m_position; + if(c > 0x10FFFFu) + detail::invalid_utf32_code_point(c); + if(c < 0x80u) + { + m_values[0] = static_cast(c); + m_values[1] = static_cast(0u); + m_values[2] = static_cast(0u); + m_values[3] = static_cast(0u); + } + else if(c < 0x800u) + { + m_values[0] = static_cast(0xC0u + (c >> 6)); + m_values[1] = static_cast(0x80u + (c & 0x3Fu)); + m_values[2] = static_cast(0u); + m_values[3] = static_cast(0u); + } + else if(c < 0x10000u) + { + m_values[0] = static_cast(0xE0u + (c >> 12)); + m_values[1] = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + m_values[2] = static_cast(0x80u + (c & 0x3Fu)); + m_values[3] = static_cast(0u); + } + else + { + m_values[0] = static_cast(0xF0u + (c >> 18)); + m_values[1] = static_cast(0x80u + ((c >> 12) & 0x3Fu)); + m_values[2] = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + m_values[3] = static_cast(0x80u + (c & 0x3Fu)); + } + m_current= 0; + } + BaseIterator m_position; + mutable U8Type m_values[5]; + mutable unsigned m_current; +}; + +template +class u8_to_u32_iterator +{ + // special values for pending iterator reads: + static const U32Type pending_read = 0xffffffffu; + + typedef typename std::iterator_traits::value_type base_value_type; + + static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument"); + static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument"); + +public: + typedef std::ptrdiff_t difference_type; + typedef U32Type value_type; + typedef value_type const* pointer; + typedef value_type const reference; + typedef std::bidirectional_iterator_tag iterator_category; + + reference operator*()const + { + if(m_value == pending_read) + extract_current(); + return m_value; + } + bool operator==(const u8_to_u32_iterator& that)const + { + return m_position == that.m_position; + } + bool operator!=(const u8_to_u32_iterator& that)const + { + return !(*this == that); + } + u8_to_u32_iterator& operator++() + { + // We must not start with a continuation character: + if((static_cast(*m_position) & 0xC0) == 0x80) + invalid_sequence(); + // skip high surrogate first if there is one: + unsigned c = detail::utf8_byte_count(*m_position); + if(m_value == pending_read) + { + // Since we haven't read in a value, we need to validate the code points: + for(unsigned i = 0; i < c; ++i) + { + ++m_position; + // We must have a continuation byte: + if((i != c - 1) && ((static_cast(*m_position) & 0xC0) != 0x80)) + invalid_sequence(); + } + } + else + { + std::advance(m_position, c); + } + m_value = pending_read; + return *this; + } + u8_to_u32_iterator operator++(int) + { + u8_to_u32_iterator r(*this); + ++(*this); + return r; + } + u8_to_u32_iterator& operator--() + { + // Keep backtracking until we don't have a trailing character: + unsigned count = 0; + while((*--m_position & 0xC0u) == 0x80u) ++count; + // now check that the sequence was valid: + if(count != detail::utf8_trailing_byte_count(*m_position)) + invalid_sequence(); + m_value = pending_read; + return *this; + } + u8_to_u32_iterator operator--(int) + { + u8_to_u32_iterator r(*this); + --(*this); + return r; + } + BaseIterator base()const + { + return m_position; + } + // construct: + u8_to_u32_iterator() : m_position() + { + m_value = pending_read; + } + u8_to_u32_iterator(BaseIterator b) : m_position(b) + { + m_value = pending_read; + } + // + // Checked constructor: + // + u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b) + { + m_value = pending_read; + // + // We must not start with a continuation character, or end with a + // truncated UTF-8 sequence otherwise we run the risk of going past + // the start/end of the underlying sequence: + // + if(start != end) + { + unsigned char v = *start; + if((v & 0xC0u) == 0x80u) + invalid_sequence(); + if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u)) + invalid_sequence(); + BaseIterator pos = end; + do + { + v = *--pos; + } + while((start != pos) && ((v & 0xC0u) == 0x80u)); + std::ptrdiff_t extra = detail::utf8_byte_count(v); + if(std::distance(pos, end) < extra) + invalid_sequence(); + } + } +private: + static void invalid_sequence() + { + std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character"); +#ifndef BOOST_REGEX_STANDALONE + boost::throw_exception(e); +#else + throw e; +#endif + } + void extract_current()const + { + m_value = static_cast(static_cast< std::uint8_t>(*m_position)); + // we must not have a continuation character: + if((m_value & 0xC0u) == 0x80u) + invalid_sequence(); + // see how many extra bytes we have: + unsigned extra = detail::utf8_trailing_byte_count(*m_position); + // extract the extra bits, 6 from each extra byte: + BaseIterator next(m_position); + for(unsigned c = 0; c < extra; ++c) + { + ++next; + m_value <<= 6; + // We must have a continuation byte: + if((static_cast(*next) & 0xC0) != 0x80) + invalid_sequence(); + m_value += static_cast(*next) & 0x3Fu; + } + // we now need to remove a few of the leftmost bits, but how many depends + // upon how many extra bytes we've extracted: + static const std::uint32_t masks[4] = + { + 0x7Fu, + 0x7FFu, + 0xFFFFu, + 0x1FFFFFu, + }; + m_value &= masks[extra]; + // check the result is in range: + if(m_value > static_cast(0x10FFFFu)) + invalid_sequence(); + // The result must not be a surrogate: + if((m_value >= static_cast(0xD800)) && (m_value <= static_cast(0xDFFF))) + invalid_sequence(); + // We should not have had an invalidly encoded UTF8 sequence: + if((extra > 0) && (m_value <= static_cast(masks[extra - 1]))) + invalid_sequence(); + } + BaseIterator m_position; + mutable U32Type m_value; +}; + +template +class utf16_output_iterator +{ +public: + typedef void difference_type; + typedef void value_type; + typedef std::uint32_t* pointer; + typedef std::uint32_t& reference; + typedef std::output_iterator_tag iterator_category; + + utf16_output_iterator(const BaseIterator& b) + : m_position(b){} + utf16_output_iterator(const utf16_output_iterator& that) + : m_position(that.m_position){} + utf16_output_iterator& operator=(const utf16_output_iterator& that) + { + m_position = that.m_position; + return *this; + } + const utf16_output_iterator& operator*()const + { + return *this; + } + void operator=(std::uint32_t val)const + { + push(val); + } + utf16_output_iterator& operator++() + { + return *this; + } + utf16_output_iterator& operator++(int) + { + return *this; + } + BaseIterator base()const + { + return m_position; + } +private: + void push(std::uint32_t v)const + { + if(v >= 0x10000u) + { + // begin by checking for a code point out of range: + if(v > 0x10FFFFu) + detail::invalid_utf32_code_point(v); + // split into two surrogates: + *m_position++ = static_cast(v >> 10) + detail::high_surrogate_base; + *m_position++ = static_cast(v & detail::ten_bit_mask) + detail::low_surrogate_base; + } + else + { + // 16-bit code point: + // value must not be a surrogate: + if(detail::is_surrogate(v)) + detail::invalid_utf32_code_point(v); + *m_position++ = static_cast(v); + } + } + mutable BaseIterator m_position; +}; + +template +class utf8_output_iterator +{ +public: + typedef void difference_type; + typedef void value_type; + typedef std::uint32_t* pointer; + typedef std::uint32_t& reference; + typedef std::output_iterator_tag iterator_category; + + utf8_output_iterator(const BaseIterator& b) + : m_position(b){} + utf8_output_iterator(const utf8_output_iterator& that) + : m_position(that.m_position){} + utf8_output_iterator& operator=(const utf8_output_iterator& that) + { + m_position = that.m_position; + return *this; + } + const utf8_output_iterator& operator*()const + { + return *this; + } + void operator=(std::uint32_t val)const + { + push(val); + } + utf8_output_iterator& operator++() + { + return *this; + } + utf8_output_iterator& operator++(int) + { + return *this; + } + BaseIterator base()const + { + return m_position; + } +private: + void push(std::uint32_t c)const + { + if(c > 0x10FFFFu) + detail::invalid_utf32_code_point(c); + if(c < 0x80u) + { + *m_position++ = static_cast(c); + } + else if(c < 0x800u) + { + *m_position++ = static_cast(0xC0u + (c >> 6)); + *m_position++ = static_cast(0x80u + (c & 0x3Fu)); + } + else if(c < 0x10000u) + { + *m_position++ = static_cast(0xE0u + (c >> 12)); + *m_position++ = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + *m_position++ = static_cast(0x80u + (c & 0x3Fu)); + } + else + { + *m_position++ = static_cast(0xF0u + (c >> 18)); + *m_position++ = static_cast(0x80u + ((c >> 12) & 0x3Fu)); + *m_position++ = static_cast(0x80u + ((c >> 6) & 0x3Fu)); + *m_position++ = static_cast(0x80u + (c & 0x3Fu)); + } + } + mutable BaseIterator m_position; +}; + +} // namespace boost + +#endif // BOOST_REGEX_UNICODE_ITERATOR_HPP +