Reduced Unicode conversion code size (-122 bytes on AVR)

This commit is contained in:
Benoit Blanchon
2020-01-09 15:39:45 +01:00
parent 91b808381e
commit 5ec062cc71
5 changed files with 115 additions and 37 deletions

View File

@ -7,10 +7,9 @@
#include <catch.hpp> #include <catch.hpp>
TEST_CASE("Invalid JSON input") { TEST_CASE("Invalid JSON input") {
const char* testCases[] = { const char* testCases[] = {"'\\u'", "'\\u000g'", "'\\u000'", "'\\u000G'",
"'\\u'", "'\\u000g'", "'\\u000'", "'\\u000G'", "'\\ud83d\\ud83d'", "'\\u000/'", "\\x1234", "6a9", "1,",
"'\\udda4'", "'\\ud83d_'", "'\\u000/'", "\\x1234", "6a9", "2]", "3}"};
"1,", "2]", "3}"};
const size_t testCount = sizeof(testCases) / sizeof(testCases[0]); const size_t testCount = sizeof(testCases) / sizeof(testCases[0]);
DynamicJsonDocument doc(4096); DynamicJsonDocument doc(4096);
@ -23,7 +22,14 @@ TEST_CASE("Invalid JSON input") {
} }
TEST_CASE("Invalid JSON input that should pass") { TEST_CASE("Invalid JSON input that should pass") {
const char* testCases[] = {"nulL", "tru3", "fals3"}; const char* testCases[] = {
"nulL",
"tru3",
"fals3",
"'\\ud83d'", // leading surrogate without a trailing surrogate
"'\\udda4'", // trailing surrogate without a leading surrogate
"'\\ud83d\\ud83d'", // two leading surrogates
};
const size_t testCount = sizeof(testCases) / sizeof(testCases[0]); const size_t testCount = sizeof(testCases) / sizeof(testCases[0]);
DynamicJsonDocument doc(4096); DynamicJsonDocument doc(4096);

View File

@ -10,6 +10,7 @@ add_executable(MiscTests
StringWriter.cpp StringWriter.cpp
TypeTraits.cpp TypeTraits.cpp
unsigned_char.cpp unsigned_char.cpp
Utf8.cpp
version.cpp version.cpp
) )

View File

@ -0,0 +1,59 @@
// ArduinoJson - arduinojson.org
// Copyright Benoit Blanchon 2014-2019
// MIT License
#include <ArduinoJson.h>
#include <catch.hpp>
#include <string>
using namespace ARDUINOJSON_NAMESPACE;
// Test helper: encodes `codepoint` with Utf8::encodeCodepoint() into a
// StringBuilder backed by a local 4096-byte MemoryPool, then requires the
// completed string to be equal to `expected`.
static void testCodepoint(uint32_t codepoint, std::string expected) {
  // Attach the codepoint to the test output before doing any work.
  CAPTURE(codepoint);

  char storage[4096];
  MemoryPool pool(storage, sizeof(storage));
  StringBuilder str(&pool);

  Utf8::encodeCodepoint(codepoint, str);
  REQUIRE(str.complete() == expected);
}
// Checks the bytes produced by Utf8::encodeCodepoint() at the boundaries of
// each UTF-8 sequence length (1, 2, 3 and 4 bytes). Every section delegates
// to testCodepoint(), which compares the encoded string with the expectation.
TEST_CASE("Utf8::encodeCodepoint()") {
// U+0000 is expected to produce an empty string.
SECTION("U+0000") {
testCodepoint(0x0000, "");
}
// Smallest codepoint expected to produce one byte.
SECTION("U+0001") {
testCodepoint(0x0001, "\x01");
}
// Largest codepoint encoded on one byte.
SECTION("U+007F") {
testCodepoint(0x007F, "\x7f");
}
// Smallest codepoint encoded on two bytes.
SECTION("U+0080") {
testCodepoint(0x0080, "\xc2\x80");
}
// Largest codepoint encoded on two bytes.
SECTION("U+07FF") {
testCodepoint(0x07FF, "\xdf\xbf");
}
// Smallest codepoint encoded on three bytes.
SECTION("U+0800") {
testCodepoint(0x0800, "\xe0\xa0\x80");
}
// Largest codepoint encoded on three bytes.
SECTION("U+FFFF") {
testCodepoint(0xFFFF, "\xef\xbf\xbf");
}
// Smallest codepoint encoded on four bytes.
SECTION("U+10000") {
testCodepoint(0x10000, "\xf0\x90\x80\x80");
}
// Largest codepoint exercised by this test (four bytes).
SECTION("U+10FFFF") {
testCodepoint(0x10FFFF, "\xf4\x8f\xbf\xbf");
}
}

View File

@ -189,7 +189,9 @@ class JsonDeserializer {
DeserializationError parseQuotedString(const char *&result) { DeserializationError parseQuotedString(const char *&result) {
StringBuilder builder = _stringStorage.startString(); StringBuilder builder = _stringStorage.startString();
#if ARDUINOJSON_DECODE_UNICODE
uint16_t surrogate1 = 0; uint16_t surrogate1 = 0;
#endif
const char stopChar = current(); const char stopChar = current();
move(); move();
@ -205,23 +207,21 @@ class JsonDeserializer {
if (c == '\0') return DeserializationError::IncompleteInput; if (c == '\0') return DeserializationError::IncompleteInput;
if (c == 'u') { if (c == 'u') {
#if ARDUINOJSON_DECODE_UNICODE #if ARDUINOJSON_DECODE_UNICODE
uint16_t codepoint;
move(); move();
DeserializationError err = parseCodepoint(codepoint); uint32_t codepoint;
uint16_t codeunit;
DeserializationError err = parseHex4(codeunit);
if (err) return err; if (err) return err;
if (codepoint >= 0xd800 && codepoint <= 0xdbff) { if (codeunit >= 0xDC00) {
if (surrogate1 > 0) return DeserializationError::InvalidInput; codepoint =
surrogate1 = codepoint; uint32_t(0x10000 | ((surrogate1 << 10) | (codeunit & 0x3FF)));
} else if (codepoint >= 0xdc00 && codepoint <= 0xdfff) { } else if (codeunit < 0xd800) {
if (surrogate1 == 0) return DeserializationError::InvalidInput; codepoint = codeunit;
uint32_t codepoint32 = 0x10000;
codepoint32 += static_cast<uint32_t>(surrogate1 - 0xd800) << 10;
codepoint32 += codepoint - 0xdc00;
Utf8::encodeCodepoint(codepoint32, builder);
surrogate1 = 0;
} else { } else {
Utf8::encodeCodepoint(codepoint, builder); surrogate1 = codeunit & 0x3FF;
continue;
} }
Utf8::encodeCodepoint(codepoint, builder);
continue; continue;
#else #else
return DeserializationError::NotSupported; return DeserializationError::NotSupported;
@ -233,8 +233,6 @@ class JsonDeserializer {
move(); move();
} }
if (surrogate1 > 0) return DeserializationError::InvalidInput;
builder.append(c); builder.append(c);
} }
@ -312,14 +310,14 @@ class JsonDeserializer {
return DeserializationError::InvalidInput; return DeserializationError::InvalidInput;
} }
DeserializationError parseCodepoint(uint16_t &codepoint) { DeserializationError parseHex4(uint16_t &result) {
codepoint = 0; result = 0;
for (uint8_t i = 0; i < 4; ++i) { for (uint8_t i = 0; i < 4; ++i) {
char digit = current(); char digit = current();
if (!digit) return DeserializationError::IncompleteInput; if (!digit) return DeserializationError::IncompleteInput;
uint8_t value = decodeHex(digit); uint8_t value = decodeHex(digit);
if (value > 0x0F) return DeserializationError::InvalidInput; if (value > 0x0F) return DeserializationError::InvalidInput;
codepoint = uint16_t((codepoint << 4) | value); result = uint16_t((result << 4) | value);
move(); move();
} }
return DeserializationError::Ok; return DeserializationError::Ok;

View File

@ -10,23 +10,37 @@ namespace ARDUINOJSON_NAMESPACE {
namespace Utf8 { namespace Utf8 {
// Utf8::encodeCodepoint(): appends the UTF-8 encoding of a codepoint to `str`
// through str.append(char).
// NOTE(review): this span is a side-by-side diff rendering, so two
// implementations are interleaved on the same lines: one reads `codepoint`
// and appends bytes directly, the other reads `codepoint32` and goes through
// the reversed buffer `buf`. Presumably the former is the version being
// replaced -- confirm against the repository.
template <typename TStringBuilder> template <typename TStringBuilder>
inline void encodeCodepoint(uint32_t codepoint, TStringBuilder &str) { inline void encodeCodepoint(uint32_t codepoint32, TStringBuilder& str) {
if (codepoint < 0x80) { // this function was optimized for code size on AVR
str.append(char(codepoint));
return; // a buffer to store the string in reverse
char buf[5];
char* p = buf;
// buf[0] is a zero sentinel: the backward loop at the end stops when it
// reads a zero byte.
*(p++) = 0;
if (codepoint32 < 0x80) {
*(p++) = char((codepoint32));
} else {
// (x | 0x80) & 0xBF sets bit 7 and clears bit 6; char() keeps the low byte,
// so the stored value is 10xxxxxx with the low 6 bits of x.
*(p++) = char((codepoint32 | 0x80) & 0xBF);
uint16_t codepoint16 = uint16_t(codepoint32 >> 6);
if (codepoint16 < 0x20) { // 0x800
*(p++) = char(codepoint16 | 0xC0);
} else {
*(p++) = char((codepoint16 | 0x80) & 0xBF);
codepoint16 = uint16_t(codepoint16 >> 6);
if (codepoint16 < 0x10) { // 0x10000
*(p++) = char(codepoint16 | 0xE0);
} else {
*(p++) = char((codepoint16 | 0x80) & 0xBF);
codepoint16 = uint16_t(codepoint16 >> 6);
*(p++) = char(codepoint16 | 0xF0);
}
}
} }
// The buffer-based version emits the stored bytes last-written-first until
// the zero sentinel; a zero byte stored for codepoint 0 also stops the loop,
// so nothing is appended in that case.
if (codepoint < 0x00000800) { while (*(--p)) {
str.append(char(0xc0 /*0b11000000*/ | (codepoint >> 6))); str.append(*p);
} else if (codepoint < 0x00010000) {
str.append(char(0xe0 /*0b11100000*/ | (codepoint >> 12)));
str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80));
} else if (codepoint < 0x00110000) {
str.append(char(0xf0 /*0b11110000*/ | (codepoint >> 18)));
str.append(char(((codepoint >> 12) & 0x3f /*0b00111111*/) | 0x80));
str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80));
} }
str.append(char((codepoint & 0x3f /*0b00111111*/) | 0x80));
} }
} // namespace Utf8 } // namespace Utf8
} // namespace ARDUINOJSON_NAMESPACE } // namespace ARDUINOJSON_NAMESPACE