diff --git a/extras/tests/JsonDeserializer/invalid_input.cpp b/extras/tests/JsonDeserializer/invalid_input.cpp index 0ecbe83c..bc2f5c90 100644 --- a/extras/tests/JsonDeserializer/invalid_input.cpp +++ b/extras/tests/JsonDeserializer/invalid_input.cpp @@ -7,10 +7,9 @@ #include TEST_CASE("Invalid JSON input") { - const char* testCases[] = { - "'\\u'", "'\\u000g'", "'\\u000'", "'\\u000G'", "'\\ud83d\\ud83d'", - "'\\udda4'", "'\\ud83d_'", "'\\u000/'", "\\x1234", "6a9", - "1,", "2]", "3}"}; + const char* testCases[] = {"'\\u'", "'\\u000g'", "'\\u000'", "'\\u000G'", + "'\\u000/'", "\\x1234", "6a9", "1,", + "2]", "3}"}; const size_t testCount = sizeof(testCases) / sizeof(testCases[0]); DynamicJsonDocument doc(4096); @@ -23,7 +22,14 @@ TEST_CASE("Invalid JSON input") { } TEST_CASE("Invalid JSON input that should pass") { - const char* testCases[] = {"nulL", "tru3", "fals3"}; + const char* testCases[] = { + "nulL", + "tru3", + "fals3", + "'\\ud83d'", // leading surrogate without a trailing surrogate + "'\\udda4'", // trailing surrogate without a leading surrogate + "'\\ud83d\\ud83d'", // two leading surrogates + }; const size_t testCount = sizeof(testCases) / sizeof(testCases[0]); DynamicJsonDocument doc(4096); diff --git a/extras/tests/Misc/CMakeLists.txt b/extras/tests/Misc/CMakeLists.txt index ccd2f1b8..f2494fc1 100644 --- a/extras/tests/Misc/CMakeLists.txt +++ b/extras/tests/Misc/CMakeLists.txt @@ -10,6 +10,7 @@ add_executable(MiscTests StringWriter.cpp TypeTraits.cpp unsigned_char.cpp + Utf8.cpp version.cpp ) diff --git a/extras/tests/Misc/Utf8.cpp b/extras/tests/Misc/Utf8.cpp new file mode 100644 index 00000000..9763ced0 --- /dev/null +++ b/extras/tests/Misc/Utf8.cpp @@ -0,0 +1,59 @@ +// ArduinoJson - arduinojson.org +// Copyright Benoit Blanchon 2014-2019 +// MIT License + +#include +#include + +#include + +using namespace ARDUINOJSON_NAMESPACE; + +static void testCodepoint(uint32_t codepoint, std::string expected) { + char buffer[4096]; + MemoryPool pool(buffer, 
4096); + StringBuilder str(&pool); + + CAPTURE(codepoint); + Utf8::encodeCodepoint(codepoint, str); + + REQUIRE(str.complete() == expected); +} + +TEST_CASE("Utf8::encodeCodepoint()") { + SECTION("U+0000") { + testCodepoint(0x0000, ""); + } + + SECTION("U+0001") { + testCodepoint(0x0001, "\x01"); + } + + SECTION("U+007F") { + testCodepoint(0x007F, "\x7f"); + } + + SECTION("U+0080") { + testCodepoint(0x0080, "\xc2\x80"); + } + + SECTION("U+07FF") { + testCodepoint(0x07FF, "\xdf\xbf"); + } + + SECTION("U+0800") { + testCodepoint(0x0800, "\xe0\xa0\x80"); + } + + SECTION("U+FFFF") { + testCodepoint(0xFFFF, "\xef\xbf\xbf"); + } + + SECTION("U+10000") { + testCodepoint(0x10000, "\xf0\x90\x80\x80"); + } + + SECTION("U+10FFFF") { + testCodepoint(0x10FFFF, "\xf4\x8f\xbf\xbf"); + } +} diff --git a/src/ArduinoJson/Json/JsonDeserializer.hpp b/src/ArduinoJson/Json/JsonDeserializer.hpp index c52a74d5..5e5b04fe 100644 --- a/src/ArduinoJson/Json/JsonDeserializer.hpp +++ b/src/ArduinoJson/Json/JsonDeserializer.hpp @@ -189,7 +189,9 @@ class JsonDeserializer { DeserializationError parseQuotedString(const char *&result) { StringBuilder builder = _stringStorage.startString(); +#if ARDUINOJSON_DECODE_UNICODE uint16_t surrogate1 = 0; +#endif const char stopChar = current(); move(); @@ -205,23 +207,21 @@ class JsonDeserializer { if (c == '\0') return DeserializationError::IncompleteInput; if (c == 'u') { #if ARDUINOJSON_DECODE_UNICODE - uint16_t codepoint; move(); - DeserializationError err = parseCodepoint(codepoint); + uint32_t codepoint; + uint16_t codeunit; + DeserializationError err = parseHex4(codeunit); if (err) return err; - if (codepoint >= 0xd800 && codepoint <= 0xdbff) { - if (surrogate1 > 0) return DeserializationError::InvalidInput; - surrogate1 = codepoint; - } else if (codepoint >= 0xdc00 && codepoint <= 0xdfff) { - if (surrogate1 == 0) return DeserializationError::InvalidInput; - uint32_t codepoint32 = 0x10000; - codepoint32 += static_cast(surrogate1 - 0xd800) << 10; - 
codepoint32 += codepoint - 0xdc00; - Utf8::encodeCodepoint(codepoint32, builder); - surrogate1 = 0; + if (codeunit >= 0xDC00) { + codepoint = + uint32_t(0x10000 | ((surrogate1 << 10) | (codeunit & 0x3FF))); + } else if (codeunit < 0xd800) { + codepoint = codeunit; } else { - Utf8::encodeCodepoint(codepoint, builder); + surrogate1 = codeunit & 0x3FF; + continue; } + Utf8::encodeCodepoint(codepoint, builder); continue; #else return DeserializationError::NotSupported; @@ -233,8 +233,6 @@ class JsonDeserializer { move(); } - if (surrogate1 > 0) return DeserializationError::InvalidInput; - builder.append(c); } @@ -312,14 +310,14 @@ class JsonDeserializer { return DeserializationError::InvalidInput; } - DeserializationError parseCodepoint(uint16_t &codepoint) { - codepoint = 0; + DeserializationError parseHex4(uint16_t &result) { + result = 0; for (uint8_t i = 0; i < 4; ++i) { char digit = current(); if (!digit) return DeserializationError::IncompleteInput; uint8_t value = decodeHex(digit); if (value > 0x0F) return DeserializationError::InvalidInput; - codepoint = uint16_t((codepoint << 4) | value); + result = uint16_t((result << 4) | value); move(); } return DeserializationError::Ok; diff --git a/src/ArduinoJson/Json/Utf8.hpp b/src/ArduinoJson/Json/Utf8.hpp index 3357e43e..d7a38526 100644 --- a/src/ArduinoJson/Json/Utf8.hpp +++ b/src/ArduinoJson/Json/Utf8.hpp @@ -10,23 +10,37 @@ namespace ARDUINOJSON_NAMESPACE { namespace Utf8 { template -inline void encodeCodepoint(uint32_t codepoint, TStringBuilder &str) { - if (codepoint < 0x80) { - str.append(char(codepoint)); - return; +inline void encodeCodepoint(uint32_t codepoint32, TStringBuilder& str) { + // this function was optimized for code size on AVR + + // a buffer to store the string in reverse + char buf[5]; + char* p = buf; + + *(p++) = 0; + if (codepoint32 < 0x80) { + *(p++) = char((codepoint32)); + } else { + *(p++) = char((codepoint32 | 0x80) & 0xBF); + uint16_t codepoint16 = uint16_t(codepoint32 >> 6); + if 
(codepoint16 < 0x20) { // 0x800 + *(p++) = char(codepoint16 | 0xC0); + } else { + *(p++) = char((codepoint16 | 0x80) & 0xBF); + codepoint16 = uint16_t(codepoint16 >> 6); + if (codepoint16 < 0x10) { // 0x10000 + *(p++) = char(codepoint16 | 0xE0); + } else { + *(p++) = char((codepoint16 | 0x80) & 0xBF); + codepoint16 = uint16_t(codepoint16 >> 6); + *(p++) = char(codepoint16 | 0xF0); + } + } } - if (codepoint < 0x00000800) { - str.append(char(0xc0 /*0b11000000*/ | (codepoint >> 6))); - } else if (codepoint < 0x00010000) { - str.append(char(0xe0 /*0b11100000*/ | (codepoint >> 12))); - str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80)); - } else if (codepoint < 0x00110000) { - str.append(char(0xf0 /*0b11110000*/ | (codepoint >> 18))); - str.append(char(((codepoint >> 12) & 0x3f /*0b00111111*/) | 0x80)); - str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80)); + while (*(--p)) { + str.append(*p); } - str.append(char((codepoint & 0x3f /*0b00111111*/) | 0x80)); } } // namespace Utf8 } // namespace ARDUINOJSON_NAMESPACE