Align utf8 manipulation code style.

This commit is contained in:
Ivica Siladic
2023-11-30 08:36:05 +01:00
parent a5d020155f
commit 8c1c4fab9f
2 changed files with 68 additions and 81 deletions

View File

@@ -6,44 +6,32 @@
namespace async_mqtt5::detail { namespace async_mqtt5::detail {
struct code_point { inline int pop_front_unichar(std::string_view& s) {
int32_t val; // assuming that s.length() is > 0
uint32_t size;
auto operator<=>(const code_point&) const = default; int n = s[0] & 0xF0;
int ch = -1;
static code_point from(std::string_view s) { if ((n & 0x80) == 0) {
auto hnibble = s[0] & 0xF0; ch = s[0];
return s.remove_prefix(1);
(hnibble & 0x80) == 0 ?
code_point { s[0], 1 }
:
(hnibble == 0xC0 || hnibble == 0xD0) && s.size() > 1 ?
code_point {
(int32_t(s[0] & 0x1F) << 6) | int32_t(s[1] & 0x3F),
2
} }
: else if ((n == 0xC0 || n == 0xD0) && s.size() > 1) {
(hnibble == 0xE0) && s.size() > 2 ? ch = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
code_point { s.remove_prefix(2);
(int32_t(s[0] & 0x1F) << 12) |
(int32_t(s[1] & 0x3F) << 6) |
int32_t(s[2] & 0x3F),
3
} }
: else if ((n == 0xE0) && s.size() > 2) {
(hnibble == 0xF0) && s.size() > 3 ? ch = ((s[0] & 0x1F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
code_point { s.remove_prefix(3);
(int32_t(s[0] & 0x1F) << 18) |
(int32_t(s[1] & 0x3F) << 12) |
(int32_t(s[2] & 0x3F) << 6) |
int32_t(s[3] & 0x3F),
4
} }
: else if ((n == 0xF0) && s.size() > 3) {
code_point { -1, 0 }; ch = ((s[0] & 0x1F) << 18) | ((s[1] & 0x3F) << 12) |
((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
s.remove_prefix(4);
}
return ch;
} }
};
inline bool is_valid_mqtt_utf8(std::string_view str) { inline bool is_valid_mqtt_utf8(std::string_view str) {
constexpr size_t max_sz = 65535; constexpr size_t max_sz = 65535;
@@ -51,23 +39,21 @@ inline bool is_valid_mqtt_utf8(std::string_view str) {
if (str.size() > max_sz) if (str.size() > max_sz)
return false; return false;
auto is_valid_cp = [](int32_t c) -> bool { constexpr int fe_flag = 0xFE;
constexpr int32_t fe_flag = 0xFE; constexpr int ff_flag = 0xFF;
constexpr int32_t ff_flag = 0xFF;
return c >= 32 && // U+0000...U+001F control characters
(c < 127 || c > 159) && // U+007F...U+009F control characters
(c < 55296 || c > 57343) && // U+D800...U+DFFF surrogates
(c < 64976 || c > 65007) &&// U+FDD0...U+FDEF non-characters
(c & fe_flag) != fe_flag && // non-characters
(c & ff_flag) != ff_flag;
};
while (!str.empty()) { while (!str.empty()) {
auto cp = code_point::from(str.data()); int c = pop_front_unichar(str);
if (!is_valid_cp(cp.val))
auto is_valid = c > 0x001F && // U+0000...U+001F control characters
(c < 0x007F || c > 0x009F) && // U+007F...U+009F control characters
(c < 0xD800 || c > 0xDFFF) && // U+D800...U+DFFF surrogates
(c < 0xFDD0 || c > 0xFDEF) && // U+FDD0...U+FDEF non-characters
(c & fe_flag) != fe_flag && // non-characters
(c & ff_flag) != ff_flag;
if (!is_valid)
return false; return false;
str.remove_prefix(cp.size);
} }
return true; return true;

View File

@@ -4,45 +4,46 @@
BOOST_AUTO_TEST_SUITE(utf8_mqtt/*, *boost::unit_test::disabled()*/) BOOST_AUTO_TEST_SUITE(utf8_mqtt/*, *boost::unit_test::disabled()*/)
std::string to_str(async_mqtt5::detail::code_point cp) { std::string to_str(int utf8ch) {
return cp.size == 1 ? std::string { char(cp.val) } if (utf8ch < 0x80)
: cp.size == 2 ? std::string { char((cp.val >> 6) | 0xC0), char((cp.val & 0x3F) | 0x80) } return { char(utf8ch) };
: cp.size == 3 ? std::string { if (utf8ch < 0x800)
char((cp.val >> 12) | 0xE0), return {
char(((cp.val >> 6) & 0x3F) | 0x80), char((utf8ch >> 6) | 0xC0),
char((cp.val & 0x3F) | 0x80) char((utf8ch & 0x3F) | 0x80)
} };
: std::string { // cp.size == 4 if (utf8ch < 0xFFFF)
char((cp.val >> 18) | 0xF0), return {
char(((cp.val >> 12) & 0x3F) | 0x80), char((utf8ch >> 12) | 0xE0),
char(((cp.val >> 6) & 0x3F) | 0x80), char(((utf8ch >> 6) & 0x3F) | 0x80),
char((cp.val & 0x3F) | 0x80) char((utf8ch & 0x3F) | 0x80)
};
return {
char((utf8ch >> 18) | 0xF0),
char(((utf8ch >> 12) & 0x3F) | 0x80),
char(((utf8ch >> 6) & 0x3F) | 0x80),
char((utf8ch & 0x3F) | 0x80)
}; };
}
async_mqtt5::detail::code_point cp(int32_t val) {
return { val, uint32_t(val < 0x80 ? 1 : val < 0x800 ? 2 : val < 0xFFFF ? 3 : /* val < 0x10FFFF */ 4) };
} }
BOOST_AUTO_TEST_CASE(utf8_string_validation) { BOOST_AUTO_TEST_CASE(utf8_string_validation) {
using namespace async_mqtt5::detail; using namespace async_mqtt5::detail;
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8("stringy"), true); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8("stringy"), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(""), true); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(""), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(1))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(1)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(31))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(31)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(32))), true); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(32)), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(126))), true); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(126)), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(127))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(127)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(159))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(159)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(160))), true); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(160)), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(55296))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(55296)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(57343))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(57343)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(64976))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(64976)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(65007))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(65007)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(65008))), true); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(65008)), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(131070))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(131070)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(131071))), false); BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(131071)), false);
} }
BOOST_AUTO_TEST_CASE(utf8_topic_validation) { BOOST_AUTO_TEST_CASE(utf8_topic_validation) {