// ---------------------------------------------------------------------------
// include/async_mqtt5/detail/utf8_mqtt.hpp
// ---------------------------------------------------------------------------
namespace async_mqtt5::detail {

/**
 * Decodes the UTF-8 sequence at the front of `s` and removes it from the view.
 *
 * @param s  View to decode from. On success the consumed bytes (1 to 4) are
 *           removed from its front; on failure the view is left untouched.
 * @return   The decoded code point, or -1 when `s` is empty or does not start
 *           with a well-formed UTF-8 sequence (stray continuation byte,
 *           invalid lead byte, truncated sequence, malformed continuation
 *           byte, overlong encoding, or a value above U+10FFFF).
 *
 * Surrogate code points (U+D800...U+DFFF) are returned as decoded; rejecting
 * them is left to the caller.
 */
inline int pop_front_unichar(std::string_view& s) {
	if (s.empty())
		return -1;

	// Go through unsigned char so that bytes >= 0x80 are never sign-extended.
	const auto byte_at = [&s](size_t i) -> int {
		return static_cast<unsigned char>(s[i]);
	};

	const int lead = byte_at(0);

	// Single byte (ASCII) fast path.
	if (lead < 0x80) {
		s.remove_prefix(1);
		return lead;
	}

	size_t len = 0;  // total length of the sequence in bytes
	int ch = 0;      // payload bits accumulated so far
	int min_cp = 0;  // smallest code point that needs `len` bytes

	if ((lead & 0xE0) == 0xC0) {        // 110xxxxx
		len = 2; ch = lead & 0x1F; min_cp = 0x80;
	}
	else if ((lead & 0xF0) == 0xE0) {   // 1110xxxx
		len = 3; ch = lead & 0x0F; min_cp = 0x800;
	}
	else if ((lead & 0xF8) == 0xF0) {   // 11110xxx
		len = 4; ch = lead & 0x07; min_cp = 0x10000;
	}
	else {
		// 10xxxxxx (continuation byte) or 11111xxx cannot start a sequence.
		return -1;
	}

	if (s.size() < len)
		return -1;

	for (size_t i = 1; i < len; ++i) {
		const int b = byte_at(i);
		if ((b & 0xC0) != 0x80)  // every trailing byte must be 10xxxxxx
			return -1;
		ch = (ch << 6) | (b & 0x3F);
	}

	// Reject overlong encodings and values outside the Unicode range.
	if (ch < min_cp || ch > 0x10FFFF)
		return -1;

	s.remove_prefix(len);
	return ch;
}

/**
 * Checks whether `str` is acceptable as an MQTT UTF-8 encoded string.
 *
 * @param str  Bytes to validate.
 * @return     true when `str` is at most 65535 bytes long, is well-formed
 *             UTF-8, and contains none of: control characters
 *             (U+0000...U+001F, U+007F...U+009F), surrogates
 *             (U+D800...U+DFFF) or non-characters (U+FDD0...U+FDEF and every
 *             U+nFFFE / U+nFFFF). An empty string is valid.
 */
inline bool is_valid_mqtt_utf8(std::string_view str) {
	constexpr size_t max_sz = 65535;

	if (str.size() > max_sz)
		return false;

	while (!str.empty()) {
		const int c = pop_front_unichar(str);

		const bool is_valid =
			c > 0x001F &&                 // U+0000...U+001F control characters (and -1: decode error)
			(c < 0x007F || c > 0x009F) && // U+007F...U+009F control characters
			(c < 0xD800 || c > 0xDFFF) && // U+D800...U+DFFF surrogates
			(c < 0xFDD0 || c > 0xFDEF) && // U+FDD0...U+FDEF non-characters
			(c & 0xFFFE) != 0xFFFE;       // U+nFFFE and U+nFFFF non-characters

		if (!is_valid)
			return false;
	}

	return true;
}

} // end namespace async_mqtt5::detail

// ---------------------------------------------------------------------------
// test/unit/test/utf8_mqtt.cpp
// ---------------------------------------------------------------------------

/**
 * Test helper: encodes the code point `utf8ch` as a UTF-8 byte string.
 *
 * No validation is performed, so surrogates and non-characters are encoded
 * as-is; the tests rely on this to build deliberately invalid input.
 */
std::string to_str(int utf8ch) {
	if (utf8ch < 0x80)
		return { static_cast<char>(utf8ch) };
	if (utf8ch < 0x800)
		return {
			static_cast<char>((utf8ch >> 6) | 0xC0),
			static_cast<char>((utf8ch & 0x3F) | 0x80)
		};
	if (utf8ch < 0x10000)  // U+FFFF is the last code point that fits in 3 bytes
		return {
			static_cast<char>((utf8ch >> 12) | 0xE0),
			static_cast<char>(((utf8ch >> 6) & 0x3F) | 0x80),
			static_cast<char>((utf8ch & 0x3F) | 0x80)
		};
	return {
		static_cast<char>((utf8ch >> 18) | 0xF0),
		static_cast<char>(((utf8ch >> 12) & 0x3F) | 0x80),
		static_cast<char>(((utf8ch >> 6) & 0x3F) | 0x80),
		static_cast<char>((utf8ch & 0x3F) | 0x80)
	};
}

// The test case needs the Boost.Test macros; it is compiled only when the
// including translation unit has already pulled in Boost.Test.
#ifdef BOOST_AUTO_TEST_CASE
BOOST_AUTO_TEST_CASE(utf8_string_validation) {
	using namespace async_mqtt5::detail;
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8("stringy"), true);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(""), true);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(1)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(31)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(32)), true);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(126)), true);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(127)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(159)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(160)), true);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(55296)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(57343)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(64976)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(65007)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(65008)), true);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(131070)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(131071)), false);

	// Low byte 0xFE / 0xFF alone does not make a non-character.
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(0xFE)), true);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(0xFF)), true);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(0xFFFE)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(0xFFFF)), false);
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(0x1F600)), true);

	// Ill-formed byte sequences.
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8("\xC0\xA0"), false); // overlong
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8("\xC3\x28"), false); // bad continuation
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8("\xE2\x82"), false); // truncated
	BOOST_CHECK_EQUAL(is_valid_mqtt_utf8("\x80"), false);     // stray continuation
}
#endif // BOOST_AUTO_TEST_CASE