Align utf8 manipulation code style.

This commit is contained in:
Ivica Siladic
2023-11-30 08:36:05 +01:00
parent a5d020155f
commit 8c1c4fab9f
2 changed files with 68 additions and 81 deletions

View File

@@ -6,44 +6,32 @@
namespace async_mqtt5::detail {
struct code_point {
int32_t val;
uint32_t size;
inline int pop_front_unichar(std::string_view& s) {
// assuming that s.length() is > 0
auto operator<=>(const code_point&) const = default;
int n = s[0] & 0xF0;
int ch = -1;
// Decodes the code point that starts at the front of `s`.
// Returns the decoded value together with the number of bytes it occupies,
// or { -1, 0 } when the lead byte is not recognised or `s` is too short
// for the sequence length that the lead byte announces.
// `s[0]` is read unconditionally, so `s` must not be empty.
// NOTE(review): continuation bytes are only masked with 0x3F; their 10xxxxxx
// prefix is never checked here.
static code_point from(std::string_view s) {
// High nibble of the lead byte selects the sequence length.
auto hnibble = s[0] & 0xF0;
return
// 0xxxxxxx: single byte
(hnibble & 0x80) == 0 ?
code_point { s[0], 1 }
:
// 110xxxxx: two bytes (high nibble is 0xC0 or 0xD0)
(hnibble == 0xC0 || hnibble == 0xD0) && s.size() > 1 ?
code_point {
(int32_t(s[0] & 0x1F) << 6) | int32_t(s[1] & 0x3F),
2
}
:
// 1110xxxx: three bytes
(hnibble == 0xE0) && s.size() > 2 ?
code_point {
(int32_t(s[0] & 0x1F) << 12) |
(int32_t(s[1] & 0x3F) << 6) |
int32_t(s[2] & 0x3F),
3
}
:
// 1111xxxx: four bytes
// NOTE(review): when hnibble == 0xF0 bit 0x10 of s[0] is always set, so
// `s[0] & 0x1F` keeps it and the result carries an extra 0x10 << 18;
// 0x07 looks like the intended mask -- confirm.
(hnibble == 0xF0) && s.size() > 3 ?
code_point {
(int32_t(s[0] & 0x1F) << 18) |
(int32_t(s[1] & 0x3F) << 12) |
(int32_t(s[2] & 0x3F) << 6) |
int32_t(s[3] & 0x3F),
4
}
:
// Anything else (e.g. a lead byte with high nibble 0x80...0xB0, or a truncated sequence)
code_point { -1, 0 };
if ((n & 0x80) == 0) {
ch = s[0];
s.remove_prefix(1);
}
};
else if ((n == 0xC0 || n == 0xD0) && s.size() > 1) {
ch = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
s.remove_prefix(2);
}
else if ((n == 0xE0) && s.size() > 2) {
ch = ((s[0] & 0x1F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
s.remove_prefix(3);
}
else if ((n == 0xF0) && s.size() > 3) {
ch = ((s[0] & 0x1F) << 18) | ((s[1] & 0x3F) << 12) |
((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
s.remove_prefix(4);
}
return ch;
}
inline bool is_valid_mqtt_utf8(std::string_view str) {
constexpr size_t max_sz = 65535;
@@ -51,23 +39,21 @@ inline bool is_valid_mqtt_utf8(std::string_view str) {
if (str.size() > max_sz)
return false;
// Returns true when code point `c` may appear in an MQTT UTF-8 string:
// it is not a control character, not a surrogate and not a Unicode
// non-character. Negative values are rejected by the first comparison.
auto is_valid_cp = [](int32_t c) -> bool {
	// The last two code points of every plane (U+nFFFE and U+nFFFF) are
	// non-characters. They are exactly the values whose bits 1..15 are all
	// set, so a single 16-bit mask detects both. An 8-bit mask (0xFE / 0xFF)
	// would also reject ordinary characters such as U+00FE and U+00FF.
	constexpr int32_t plane_end_mask = 0xFFFE;
	return c >= 32 && // U+0000...U+001F control characters
		(c < 127 || c > 159) && // U+007F...U+009F control characters
		(c < 55296 || c > 57343) && // U+D800...U+DFFF surrogates
		(c < 64976 || c > 65007) && // U+FDD0...U+FDEF non-characters
		(c & plane_end_mask) != plane_end_mask; // U+nFFFE, U+nFFFF non-characters
};
constexpr int fe_flag = 0xFE;
constexpr int ff_flag = 0xFF;
while (!str.empty()) {
auto cp = code_point::from(str.data());
if (!is_valid_cp(cp.val))
int c = pop_front_unichar(str);
// A code point is accepted only if it lies outside every excluded range below.
// NOTE(review): `(c & fe_flag) != fe_flag` with fe_flag == 0xFE fails for every
// value whose low byte is 0xFE or 0xFF (e.g. U+00FE), which is wider than the
// U+nFFFE / U+nFFFF non-characters -- confirm this is intended.
auto is_valid = c > 0x001F && // U+0000...U+001F control characters
(c < 0x007F || c > 0x009F) && // U+007F...U+009F control characters
(c < 0xD800 || c > 0xDFFF) && // U+D800...U+DFFF surrogates
(c < 0xFDD0 || c > 0xFDEF) && // U+FDD0...U+FDEF non-characters
(c & fe_flag) != fe_flag && // non-characters
(c & ff_flag) != ff_flag;
if (!is_valid)
return false;
str.remove_prefix(cp.size);
}
return true;

View File

@@ -4,45 +4,46 @@
BOOST_AUTO_TEST_SUITE(utf8_mqtt/*, *boost::unit_test::disabled()*/)
std::string to_str(async_mqtt5::detail::code_point cp) {
return cp.size == 1 ? std::string { char(cp.val) }
: cp.size == 2 ? std::string { char((cp.val >> 6) | 0xC0), char((cp.val & 0x3F) | 0x80) }
: cp.size == 3 ? std::string {
char((cp.val >> 12) | 0xE0),
char(((cp.val >> 6) & 0x3F) | 0x80),
char((cp.val & 0x3F) | 0x80)
}
: std::string { // cp.size == 4
char((cp.val >> 18) | 0xF0),
char(((cp.val >> 12) & 0x3F) | 0x80),
char(((cp.val >> 6) & 0x3F) | 0x80),
char((cp.val & 0x3F) | 0x80)
std::string to_str(int utf8ch) {
if (utf8ch < 0x80)
return { char(utf8ch) };
if (utf8ch < 0x800)
return {
char((utf8ch >> 6) | 0xC0),
char((utf8ch & 0x3F) | 0x80)
};
}
// Test helper: pairs `val` with a UTF-8 byte count picked by the thresholds below.
// NOTE(review): U+FFFF still fits in three bytes, but `val < 0xFFFF` sends it
// to the 4-byte case; `val < 0x10000` may be the intended bound -- confirm.
async_mqtt5::detail::code_point cp(int32_t val) {
return { val, uint32_t(val < 0x80 ? 1 : val < 0x800 ? 2 : val < 0xFFFF ? 3 : /* val < 0x10FFFF */ 4) };
if (utf8ch < 0xFFFF)
return {
char((utf8ch >> 12) | 0xE0),
char(((utf8ch >> 6) & 0x3F) | 0x80),
char((utf8ch & 0x3F) | 0x80)
};
return {
char((utf8ch >> 18) | 0xF0),
char(((utf8ch >> 12) & 0x3F) | 0x80),
char(((utf8ch >> 6) & 0x3F) | 0x80),
char((utf8ch & 0x3F) | 0x80)
};
}
// Checks is_valid_mqtt_utf8 on plain strings and on single code points that
// sit on the edges of the ranges the validator excludes.
// NOTE(review): the `to_str(cp(N))` group and the `to_str(N)` group below
// probe the same code points through two different helper signatures;
// presumably only one group belongs to any given revision -- confirm.
BOOST_AUTO_TEST_CASE(utf8_string_validation) {
using namespace async_mqtt5::detail;
// Plain ASCII text and the empty string are accepted.
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8("stringy"), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(""), true);
// U+0001 and U+001F are control characters; U+0020 is the first accepted value.
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(1))), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(31))), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(32))), true);
// U+007E is accepted; U+007F...U+009F are control characters; U+00A0 is accepted.
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(126))), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(127))), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(159))), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(160))), true);
// U+D800 and U+DFFF bound the surrogate range.
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(55296))), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(57343))), false);
// U+FDD0 and U+FDEF bound a non-character range; U+FDF0 is accepted.
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(64976))), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(65007))), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(65008))), true);
// U+1FFFE and U+1FFFF are non-characters.
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(131070))), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(cp(131071))), false);
// Same code points as above, passed to the to_str(int) overload.
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(1)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(31)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(32)), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(126)), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(127)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(159)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(160)), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(55296)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(57343)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(64976)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(65007)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(65008)), true);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(131070)), false);
BOOST_CHECK_EQUAL(is_valid_mqtt_utf8(to_str(131071)), false);
}
BOOST_AUTO_TEST_CASE(utf8_topic_validation) {