diff --git a/include/boost/detail/utf8_codecvt_facet.hpp b/include/boost/detail/utf8_codecvt_facet.hpp index 11b0866..ce5a3d6 100644 --- a/include/boost/detail/utf8_codecvt_facet.hpp +++ b/include/boost/detail/utf8_codecvt_facet.hpp @@ -14,7 +14,7 @@ /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 // utf8_codecvt_facet.hpp -// This header defines class utf8_codecvt_facet, derived from +// This header defines class utf8_codecvt_facet, derived from // std::codecvt, which can be used to convert utf8 data in // files into wchar_t strings in the application. // @@ -23,10 +23,10 @@ // we want to avoid code duplication. It would be possible to create utf8 // library, but: // - this requires review process first -// - in the case, when linking the a library which uses utf8 +// - in the case, when linking the a library which uses utf8 // (say 'program_options'), user should also link to the utf8 library. -// This seems inconvenient, and asking a user to link to an unrevieved -// library is strange. +// This seems inconvenient, and asking a user to link to an unrevieved +// library is strange. // Until the above points are fixed, a library which wants to use utf8 must: // - include this header in one of it's headers or sources // - include the corresponding boost/detail/utf8_codecvt_facet.ipp file in one @@ -39,14 +39,14 @@ // symbols. // // For example, program_options library might contain: -// #define BOOST_UTF8_BEGIN_NAMESPACE +// #define BOOST_UTF8_BEGIN_NAMESPACE // namespace boost { namespace program_options { // #define BOOST_UTF8_END_NAMESPACE }} // #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL // #include // // Essentially, each library will have its own copy of utf8 code, in -// different namespaces. +// different namespaces. // Note:(Robert Ramey). I have made the following alterations in the original // code. @@ -67,7 +67,7 @@ // use two template parameters // // utf8_codecvt_facet -// This is an implementation of a std::codecvt facet for translating +// This is an implementation of a std::codecvt facet for translating // from UTF-8 externally to UCS-4. Note that this is not tied to // any specific types in order to allow customization on platforms // where wchar_t is not big enough. @@ -110,27 +110,28 @@ BOOST_UTF8_BEGIN_NAMESPACE #endif struct BOOST_SYMBOL_VISIBLE utf8_codecvt_facet : - public std::codecvt + public std::codecvt { public: - explicit utf8_codecvt_facet(std::size_t no_locale_manage=0); - virtual ~utf8_codecvt_facet(); + BOOST_UTF8_DECL explicit utf8_codecvt_facet(std::size_t no_locale_manage = 0); + BOOST_UTF8_DECL virtual ~utf8_codecvt_facet(); + protected: - virtual std::codecvt_base::result do_in( - std::mbstate_t& state, + BOOST_UTF8_DECL virtual std::codecvt_base::result do_in( + std::mbstate_t& state, const char * from, - const char * from_end, + const char * from_end, const char * & from_next, - wchar_t * to, - wchar_t * to_end, - wchar_t*& to_next + wchar_t * to, + wchar_t * to_end, + wchar_t * & to_next ) const; - virtual std::codecvt_base::result do_out( + BOOST_UTF8_DECL virtual std::codecvt_base::result do_out( std::mbstate_t & state, const wchar_t * from, const wchar_t * from_end, - const wchar_t* & from_next, + const wchar_t * & from_next, char * to, char * to_end, char * & to_next @@ -140,7 +141,7 @@ protected: return (octet_1 < 0x80|| 0xbf< octet_1); } - bool invalid_leading_octet(unsigned char octet_1) const { + bool invalid_leading_octet(unsigned char octet_1) const { return (0x7f < octet_1 && octet_1 < 0xc0) || (octet_1 > 0xfd); } @@ -150,11 +151,11 @@ protected: return get_octet_count(lead_octet) - 1; } - static unsigned int get_octet_count(unsigned char lead_octet); + BOOST_UTF8_DECL static unsigned int get_octet_count(unsigned char lead_octet); // How many "continuing octets" will be needed for this word // == total octets - 1. - int get_cont_octet_out_count(wchar_t word) const ; + BOOST_UTF8_DECL static int get_cont_octet_out_count(wchar_t word); virtual bool do_always_noconv() const BOOST_NOEXCEPT_OR_NOTHROW { return false; @@ -162,7 +163,7 @@ protected: // UTF-8 isn't really stateful since we rewind on partial conversions virtual std::codecvt_base::result do_unshift( - std::mbstate_t&, + std::mbstate_t &, char * from, char * /*to*/, char * & next @@ -178,10 +179,10 @@ protected: // How many char objects can I process to get <= max_limit // wchar_t objects? - virtual int do_length( + BOOST_UTF8_DECL virtual int do_length( std::mbstate_t &, const char * from, - const char * from_end, + const char * from_end, std::size_t max_limit ) const #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) @@ -193,7 +194,7 @@ protected: virtual int do_length( const std::mbstate_t & s, const char * from, - const char * from_end, + const char * from_end, std::size_t max_limit ) const #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) diff --git a/include/boost/detail/utf8_codecvt_facet.ipp b/include/boost/detail/utf8_codecvt_facet.ipp index f9e9deb..65215cb 100644 --- a/include/boost/detail/utf8_codecvt_facet.ipp +++ b/include/boost/detail/utf8_codecvt_facet.ipp @@ -2,7 +2,7 @@ // utf8_codecvt_facet.ipp // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) -// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). +// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). // Use, modification and distribution is subject to the Boost Software // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) @@ -18,7 +18,7 @@ #include #include -// If we don't have wstring, then Unicode support +// If we don't have wstring, then Unicode support // is not available anyway, so we don't need to even // compiler this file. This also fixes the problem // with mingw, which can compile this file, but will @@ -30,6 +30,19 @@ BOOST_UTF8_BEGIN_NAMESPACE /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 // implementation for wchar_t +namespace detail { + +inline const wchar_t * get_octet1_modifier_table() BOOST_NOEXCEPT +{ + static const wchar_t octet1_modifier_table[] = { + 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc + }; + return octet1_modifier_table; +} + +} // namespace detail + + BOOST_UTF8_DECL utf8_codecvt_facet::utf8_codecvt_facet( std::size_t no_locale_manage ) : @@ -41,97 +54,93 @@ BOOST_UTF8_DECL utf8_codecvt_facet::~utf8_codecvt_facet() // Translate incoming UTF-8 into UCS-4 BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_in( - std::mbstate_t& /*state*/, + std::mbstate_t& /*state*/, const char * from, - const char * from_end, + const char * from_end, const char * & from_next, - wchar_t * to, - wchar_t * to_end, + wchar_t * to, + wchar_t * to_end, wchar_t * & to_next ) const { - // Basic algorithm: The first octet determines how many - // octets total make up the UCS-4 character. The remaining + // Basic algorithm: The first octet determines how many + // octets total make up the UCS-4 character. The remaining // "continuing octets" all begin with "10". To convert, subtract // the amount that specifies the number of octets from the first - // octet. Subtract 0x80 (1000 0000) from each continuing octet, - // then mash the whole lot together. Note that each continuing + // octet. Subtract 0x80 (1000 0000) from each continuing octet, + // then mash the whole lot together. Note that each continuing // octet only uses 6 bits as unique values, so only shift by // multiples of 6 to combine. + const wchar_t * const octet1_modifier_table = detail::get_octet1_modifier_table(); while (from != from_end && to != to_end) { - // Error checking on the first octet - if (invalid_leading_octet(*from)){ + // Error checking on the first octet + if (invalid_leading_octet(*from)) { from_next = from; to_next = to; return std::codecvt_base::error; } - // The first octet is adjusted by a value dependent upon - // the number of "continuing octets" encoding the character - const int cont_octet_count = get_cont_octet_count(*from); - const wchar_t octet1_modifier_table[] = { - 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc - }; + // The first octet is adjusted by a value dependent upon + // the number of "continuing octets" encoding the character + const int cont_octet_count = get_cont_octet_count(*from); // The unsigned char conversion is necessary in case char is - // signed (I learned this the hard way) - wchar_t ucs_result = + // signed (I learned this the hard way) + wchar_t ucs_result = (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; - // Invariants : - // 1) At the start of the loop, 'i' continuing characters have been - // processed - // 2) *from points to the next continuing character to be processed. - int i = 0; - while(i != cont_octet_count && from != from_end) { + // Invariants: + // 1) At the start of the loop, 'i' continuing characters have been + // processed + // 2) *from points to the next continuing character to be processed. + int i = 0; + while (i != cont_octet_count && from != from_end) { // Error checking on continuing characters if (invalid_continuing_octet(*from)) { - from_next = from; - to_next = to; + from_next = from; + to_next = to; return std::codecvt_base::error; } - ucs_result *= (1 << 6); + ucs_result *= (1 << 6); - // each continuing character has an extra (10xxxxxx)b attached to + // each continuing character has an extra (10xxxxxx)b attached to // it that must be removed. ucs_result += (unsigned char)(*from++) - 0x80; ++i; } - // If the buffer ends with an incomplete unicode character... - if (from == from_end && i != cont_octet_count) { + // If the buffer ends with an incomplete unicode character... + if (from == from_end && i != cont_octet_count) { // rewind "from" to before the current character translation - from_next = from - (i+1); + from_next = from - (i + 1); to_next = to; return std::codecvt_base::partial; } - *to++ = ucs_result; + *to++ = ucs_result; } from_next = from; to_next = to; // Were we done converting or did we run out of destination space? - if(from == from_end) return std::codecvt_base::ok; - else return std::codecvt_base::partial; + if (from == from_end) + return std::codecvt_base::ok; + else + return std::codecvt_base::partial; } BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_out( - std::mbstate_t& /*state*/, - const wchar_t * from, - const wchar_t * from_end, + std::mbstate_t& /*state*/, + const wchar_t * from, + const wchar_t * from_end, const wchar_t * & from_next, - char * to, - char * to_end, + char * to, + char * to_end, char * & to_next ) const { - // RG - consider merging this table with the other one - const wchar_t octet1_modifier_table[] = { - 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc - }; - + const wchar_t * const octet1_modifier_table = detail::get_octet1_modifier_table(); wchar_t max_wchar = (std::numeric_limits::max)(); while (from != from_end && to != to_end) { @@ -144,37 +153,40 @@ BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_out( int cont_octet_count = get_cont_octet_out_count(*from); - // RG - comment this formula better - int shift_exponent = (cont_octet_count) * 6; + // RG - comment this formula better + int shift_exponent = cont_octet_count * 6; // Process the first character *to++ = static_cast(octet1_modifier_table[cont_octet_count] + (unsigned char)(*from / (1 << shift_exponent))); - // Process the continuation characters - // Invariants: At the start of the loop: - // 1) 'i' continuing octets have been generated - // 2) '*to' points to the next location to place an octet - // 3) shift_exponent is 6 more than needed for the next octet - int i = 0; - while (i != cont_octet_count && to != to_end) { + // Process the continuation characters + // Invariants: At the start of the loop: + // 1) 'i' continuing octets have been generated + // 2) '*to' points to the next location to place an octet + // 3) shift_exponent is 6 more than needed for the next octet + int i = 0; + while (i != cont_octet_count && to != to_end) { shift_exponent -= 6; *to++ = static_cast(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); ++i; } - // If we filled up the out buffer before encoding the character - if(to == to_end && i != cont_octet_count) { + // If we filled up the out buffer before encoding the character + if (to == to_end && i != cont_octet_count) { from_next = from; - to_next = to - (i+1); + to_next = to - (i + 1); return std::codecvt_base::partial; } ++from; } from_next = from; to_next = to; + // Were we done or did we run out of destination space - if(from == from_end) return std::codecvt_base::ok; - else return std::codecvt_base::partial; + if (from == from_end) + return std::codecvt_base::ok; + else + return std::codecvt_base::partial; } // How many char objects can I process to get <= max_limit @@ -182,7 +194,7 @@ BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_out( BOOST_UTF8_DECL int utf8_codecvt_facet::do_length( std::mbstate_t &, const char * from, - const char * from_end, + const char * from_end, std::size_t max_limit ) const #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) @@ -203,7 +215,7 @@ BOOST_UTF8_DECL int utf8_codecvt_facet::do_length( BOOST_UTF8_DECL unsigned int utf8_codecvt_facet::get_octet_count( unsigned char lead_octet -){ +) { // if the 0-bit (MSB) is 0, then 1 character if (lead_octet <= 0x7f) return 1; @@ -220,7 +232,7 @@ BOOST_UTF8_DECL unsigned int utf8_codecvt_facet::get_octet_count( namespace detail { template -BOOST_UTF8_DECL int get_cont_octet_out_count_impl(wchar_t word){ +inline int get_cont_octet_out_count_impl(wchar_t word) { if (word < 0x80) { return 0; } @@ -231,7 +243,7 @@ BOOST_UTF8_DECL int get_cont_octet_out_count_impl(wchar_t word){ } template<> -BOOST_UTF8_DECL int get_cont_octet_out_count_impl<4>(wchar_t word){ +inline int get_cont_octet_out_count_impl<4>(wchar_t word) { if (word < 0x80) { return 0; } @@ -243,7 +255,7 @@ BOOST_UTF8_DECL int get_cont_octet_out_count_impl<4>(wchar_t word){ // where wchar_t is defined as UCS2. The warnings are superfluous as the // specialization is never instantitiated with such compilers, but this // can cause problems if warnings are being treated as errors, so we guard - // against that. Including as we do + // against that. Including as we do // should be enough to get WCHAR_MAX defined. #if !defined(WCHAR_MAX) # error WCHAR_MAX not defined! @@ -252,8 +264,8 @@ BOOST_UTF8_DECL int get_cont_octet_out_count_impl<4>(wchar_t word){ #if defined(_MSC_VER) && _MSC_VER <= 1310 // 7.1 or earlier return 2; #elif WCHAR_MAX > 0x10000 - - if (word < 0x10000) { + + if (word < 0x10000) { return 2; } if (word < 0x200000) { @@ -263,7 +275,7 @@ BOOST_UTF8_DECL int get_cont_octet_out_count_impl<4>(wchar_t word){ return 4; } return 5; - + #else return 2; #endif @@ -275,9 +287,10 @@ BOOST_UTF8_DECL int get_cont_octet_out_count_impl<4>(wchar_t word){ // == total octets - 1. BOOST_UTF8_DECL int utf8_codecvt_facet::get_cont_octet_out_count( wchar_t word -) const { +) { return detail::get_cont_octet_out_count_impl(word); } + BOOST_UTF8_END_NAMESPACE #endif