diff --git a/CHANGELOG.md b/CHANGELOG.md index 101b8eb2..e2056f8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,10 @@ HEAD * Added `BasicJsonDocument::shrinkToFit()` * Added support of `uint8_t` for `serializeJson()`, `serializeJsonPretty()`, and `serializeMsgPack()` (issue #1142) * Auto enable support for `std::string` and `std::stream` on modern compilers (issue #1156) - No need to define `ARDUINOJSON_ENABLE_STD_STRING` and `ARDUINOJSON_ENABLE_STD_STREAM`. + (No need to define `ARDUINOJSON_ENABLE_STD_STRING` and `ARDUINOJSON_ENABLE_STD_STREAM` anymore) +* Improved decoding of UTF-16 surrogate pairs (PR #1157 by @kaysievers) + (ArduinoJson now produces standard UTF-8 instead of CESU-8) + v6.13.0 (2019-11-01) ------- diff --git a/extras/tests/JsonDeserializer/invalid_input.cpp b/extras/tests/JsonDeserializer/invalid_input.cpp index 8e5dec9b..0ecbe83c 100644 --- a/extras/tests/JsonDeserializer/invalid_input.cpp +++ b/extras/tests/JsonDeserializer/invalid_input.cpp @@ -7,9 +7,10 @@ #include TEST_CASE("Invalid JSON input") { - const char* testCases[] = {"'\\u'", "'\\u000g'", "'\\u000'", "'\\u000G'", - "'\\u000/'", "\\x1234", "6a9", "1,", - "2]", "3}"}; + const char* testCases[] = { + "'\\u'", "'\\u000g'", "'\\u000'", "'\\u000G'", "'\\ud83d\\ud83d'", + "'\\udda4'", "'\\ud83d_'", "'\\u000/'", "\\x1234", "6a9", + "1,", "2]", "3}"}; const size_t testCount = sizeof(testCases) / sizeof(testCases[0]); DynamicJsonDocument doc(4096); diff --git a/extras/tests/JsonDeserializer/string.cpp b/extras/tests/JsonDeserializer/string.cpp index 26bb4394..cd056822 100644 --- a/extras/tests/JsonDeserializer/string.cpp +++ b/extras/tests/JsonDeserializer/string.cpp @@ -17,10 +17,10 @@ TEST_CASE("Valid JSON strings value") { {"\'hello world\'", "hello world"}, {"\"1\\\"2\\\\3\\/4\\b5\\f6\\n7\\r8\\t9\"", "1\"2\\3/4\b5\f6\n7\r8\t9"}, {"'\\u0041'", "A"}, - {"'\\u00e4'", "\xc3\xa4"}, // ä - {"'\\u00E4'", "\xc3\xa4"}, // ä - {"'\\u3042'", "\xe3\x81\x82"}, // あ - + {"'\\u00e4'", "\xc3\xa4"}, // ä + {"'\\u00E4'", "\xc3\xa4"}, // ä + {"'\\u3042'", "\xe3\x81\x82"}, // あ + {"'\\ud83d\\udda4'", "\xf0\x9f\x96\xa4"}, // 🖤 }; const size_t testCount = sizeof(testCases) / sizeof(testCases[0]); diff --git a/src/ArduinoJson/Json/JsonDeserializer.hpp b/src/ArduinoJson/Json/JsonDeserializer.hpp index 3f08d0bf..c52a74d5 100644 --- a/src/ArduinoJson/Json/JsonDeserializer.hpp +++ b/src/ArduinoJson/Json/JsonDeserializer.hpp @@ -189,6 +189,7 @@ class JsonDeserializer { DeserializationError parseQuotedString(const char *&result) { StringBuilder builder = _stringStorage.startString(); + uint16_t surrogate1 = 0; const char stopChar = current(); move(); @@ -208,7 +209,19 @@ class JsonDeserializer { move(); DeserializationError err = parseCodepoint(codepoint); if (err) return err; - Utf8::encodeCodepoint(codepoint, builder); + if (codepoint >= 0xd800 && codepoint <= 0xdbff) { + if (surrogate1 > 0) return DeserializationError::InvalidInput; + surrogate1 = codepoint; + } else if (codepoint >= 0xdc00 && codepoint <= 0xdfff) { + if (surrogate1 == 0) return DeserializationError::InvalidInput; + uint32_t codepoint32 = 0x10000; + codepoint32 += static_cast(surrogate1 - 0xd800) << 10; + codepoint32 += codepoint - 0xdc00; + Utf8::encodeCodepoint(codepoint32, builder); + surrogate1 = 0; + } else { + Utf8::encodeCodepoint(codepoint, builder); + } continue; #else return DeserializationError::NotSupported; @@ -220,6 +233,8 @@ class JsonDeserializer { move(); } + if (surrogate1 > 0) return DeserializationError::InvalidInput; + builder.append(c); } diff --git a/src/ArduinoJson/Json/Utf8.hpp b/src/ArduinoJson/Json/Utf8.hpp index d79f65d8..3357e43e 100644 --- a/src/ArduinoJson/Json/Utf8.hpp +++ b/src/ArduinoJson/Json/Utf8.hpp @@ -10,17 +10,21 @@ namespace ARDUINOJSON_NAMESPACE { namespace Utf8 { template -inline void encodeCodepoint(uint16_t codepoint, TStringBuilder &str) { +inline void encodeCodepoint(uint32_t codepoint, TStringBuilder &str) { if (codepoint < 0x80) { str.append(char(codepoint)); return; } - if (codepoint >= 0x00000800) { + if (codepoint < 0x00000800) { + str.append(char(0xc0 /*0b11000000*/ | (codepoint >> 6))); + } else if (codepoint < 0x00010000) { str.append(char(0xe0 /*0b11100000*/ | (codepoint >> 12))); str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80)); - } else { - str.append(char(0xc0 /*0b11000000*/ | (codepoint >> 6))); + } else if (codepoint < 0x00110000) { + str.append(char(0xf0 /*0b11110000*/ | (codepoint >> 18))); + str.append(char(((codepoint >> 12) & 0x3f /*0b00111111*/) | 0x80)); + str.append(char(((codepoint >> 6) & 0x3f /*0b00111111*/) | 0x80)); } str.append(char((codepoint & 0x3f /*0b00111111*/) | 0x80)); }