C++: Support for UTF-8 in the lexer

This will save us toLatin1() conversions in CppTools (which already holds UTF-8 encoded QByteArrays) and thus loss of information (see QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers. API-wise the following functions are added to Token. In follow-up patches these will come in handy in combination with QStrings. utf16chars() - equivalent of bytes() utf16charsBegin() - equivalent of bytesBegin() utf16charsEnd() - equivalent of bytesEnd() Next steps: * Adapt functions from TranslationUnit. They should work with utf16 chars in order to calculate lines and columns correctly also for UTF-8 multi-byte code points. * Adapt the higher level clients: * Cpp{Tools,Editor} should expect UTF-8 encoded Literals. * Cpp{Tools,Editor}: When dealing with identifiers on the QString/QTextDocument layer, code points represented by two QChars need to be respected, too. * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report offsets usable in CppEditor/CppTools. Addresses QTCREATORBUG-7356. Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0 Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
2014-02-25 13:44:11 -03:00
parent 4fefb1ca2a
commit 70122b3061
12 changed files with 503 additions and 28 deletions
--- a/tests/auto/cplusplus/lexer/tst_lexer.cpp
+++ b/tests/auto/cplusplus/lexer/tst_lexer.cpp
@@ -52,28 +52,49 @@ class tst_SimpleLexer: public QObject
 public:
    tst_SimpleLexer() : _state(0) {}

+    enum TokenCompareFlag {
+        CompareKind            = 1 << 1,
+        CompareBytes           = 1 << 2,
+        CompareBytesBegin      = 1 << 3,
+        CompareBytesEnd        = 1 << 4,
+        CompareUtf16Chars      = 1 << 5,
+        CompareUtf16CharsBegin = 1 << 6,
+        CompareUtf16CharsEnd   = 1 << 7
+    };
+    Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
+
 private slots:
    void basic();
    void basic_data();
    void incremental();
    void incremental_data();

+    //
+    // The following "non-latin1" code points are used in the tests following this comment:
+    //
+    //   U+00FC  - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
+    //   U+4E8C  - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
+    //   U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
+    //
+
+    void bytes_and_utf16chars();
+    void bytes_and_utf16chars_data();
+    void offsets();
+    void offsets_data();
+
 private:
    static TokenList toTokenList(const TokenKindList &tokenKinds);

-    enum TokenCompareFlag {
-        CompareKind = 1 << 1
-    };
-    Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
-
    void run(const QByteArray &source,
             const TokenList &expectedTokenList,
             bool preserveState,
-             TokenCompareFlag compareFlags);
+             TokenCompareFlags compareFlags);

    int _state;
 };

+Q_DECLARE_OPERATORS_FOR_FLAGS(tst_SimpleLexer::TokenCompareFlags)
+
 TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
 {
    TokenList tokens;
@@ -88,10 +109,13 @@ TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
 void tst_SimpleLexer::run(const QByteArray &source,
                          const TokenList &expectedTokenList,
                          bool preserveState,
-                          TokenCompareFlag compareFlags)
+                          TokenCompareFlags compareFlags)
 {
+    QVERIFY(compareFlags);
+
    SimpleLexer lexer;
-    const QList<Token> tokenList = lexer(source, preserveState ? _state : 0);
+    const QList<Token> tokenList = lexer(source, preserveState ? _state : 0,
+                                         /*convertToUtf8=*/ true);
    if (preserveState)
        _state = lexer.state();

@@ -108,6 +132,20 @@ void tst_SimpleLexer::run(const QByteArray &source,
 #endif
        if (compareFlags & CompareKind)
            QCOMPARE(token.kind(), expectedToken.kind());
+
+        if (compareFlags & CompareBytes)
+            QCOMPARE(token.bytes(), expectedToken.bytes());
+        if (compareFlags & CompareBytesBegin)
+            QCOMPARE(token.bytesBegin(), expectedToken.bytesBegin());
+        if (compareFlags & CompareBytesEnd)
+            QCOMPARE(token.bytesEnd(), expectedToken.bytesEnd());
+
+        if (compareFlags & CompareUtf16Chars)
+            QCOMPARE(token.utf16chars(), expectedToken.utf16chars());
+        if (compareFlags & CompareUtf16CharsBegin)
+            QCOMPARE(token.utf16charsBegin(), expectedToken.utf16charsBegin());
+        if (compareFlags & CompareUtf16CharsEnd)
+            QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd());
    }
    QVERIFY2(i == expectedTokenList.size(), "Less tokens than expected.");
 }
@@ -221,7 +259,168 @@ void tst_SimpleLexer::basic_data()
        << T_LBRACKET << T_RBRACKET << T_LBRACE << T_RBRACE
        << T_IDENTIFIER << T_QUESTION << T_IDENTIFIER << T_COLON << T_IDENTIFIER;
    QTest::newRow(source) << source << expectedTokenKindList;
+}

+void tst_SimpleLexer::bytes_and_utf16chars()
+{
+    QFETCH(QByteArray, source);
+    QFETCH(QList<Token>, expectedTokenList);
+
+    const TokenCompareFlags compareFlags = CompareKind | CompareBytes | CompareUtf16Chars;
+    run(source, expectedTokenList, false, compareFlags);
+}
+
+static QList<Token> createToken(unsigned kind, unsigned bytes, unsigned utf16chars)
+{
+    Token t;
+    t.f.kind = kind;
+    t.f.bytes = bytes;
+    t.f.utf16chars = utf16chars;
+    return QList<Token>() << t;
+}
+
+void tst_SimpleLexer::bytes_and_utf16chars_data()
+{
+    QTest::addColumn<QByteArray>("source");
+    QTest::addColumn<QList<Token> >("expectedTokenList");
+
+    typedef QByteArray _;
+
+    // LATIN1 Identifier
+    QTest::newRow("latin1 identifier")
+        << _("var") << createToken(T_IDENTIFIER, 3, 3);
+
+    // NON-LATIN1 identifier (code point with 2 UTF8 code units)
+    QTest::newRow("non-latin1 identifier (2-byte code unit at start)")
+        << _("\u00FC_var") << createToken(T_IDENTIFIER, 6, 5);
+    QTest::newRow("non-latin1 identifier (2-byte code unit in center)")
+        << _("_v\u00FCr_") << createToken(T_IDENTIFIER, 6, 5);
+    QTest::newRow("non-latin1 identifier (2-byte code unit at end)")
+        << _("var_\u00FC") << createToken(T_IDENTIFIER, 6, 5);
+    QTest::newRow("non-latin1 identifier (2-byte code unit only)")
+        << _("\u00FC") << createToken(T_IDENTIFIER, 2, 1);
+
+    // NON-LATIN1 identifier (code point with 3 UTF8 code units)
+    QTest::newRow("non-latin1 identifier (3-byte code unit at start)")
+        << _("\u4E8C_var") << createToken(T_IDENTIFIER, 7, 5);
+    QTest::newRow("non-latin1 identifier (3-byte code unit in center)")
+        << _("_v\u4E8Cr_") << createToken(T_IDENTIFIER, 7, 5);
+    QTest::newRow("non-latin1 identifier (3-byte code unit at end)")
+        << _("var_\u4E8C") << createToken(T_IDENTIFIER, 7, 5);
+    QTest::newRow("non-latin1 identifier (3-byte code unit only)")
+        << _("\u4E8C") << createToken(T_IDENTIFIER, 3, 1);
+
+    // NON-LATIN1 identifier (code point with 4 UTF8 code units)
+    QTest::newRow("non-latin1 identifier (4-byte code unit at start)")
+        << _("\U00010302_var") << createToken(T_IDENTIFIER, 8, 6);
+    QTest::newRow("non-latin1 identifier (4-byte code unit in center)")
+        << _("_v\U00010302r_") << createToken(T_IDENTIFIER, 8, 6);
+    QTest::newRow("non-latin1 identifier (4-byte code unit at end)")
+        << _("var_\U00010302") << createToken(T_IDENTIFIER, 8, 6);
+    QTest::newRow("non-latin1 identifier (4-byte code unit only)")
+        << _("\U00010302") << createToken(T_IDENTIFIER, 4, 2);
+
+    // NON-LATIN1 identifier (code points with several multi-byte UTF8 code units)
+    QTest::newRow("non-latin1 identifier (mixed multi-byte code units at start)")
+        << _("\u00FC\u4E8C\U00010302_var") << createToken(T_IDENTIFIER, 13, 8);
+    QTest::newRow("non-latin1 identifier (mixed multi-byte code units in center)")
+        << _("_v\u00FC\u4E8C\U00010302r_") << createToken(T_IDENTIFIER, 13, 8);
+    QTest::newRow("non-latin1 identifier (mixed multi-byte code units at end)")
+        << _("var_\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 13, 8);
+    QTest::newRow("non-latin1 identifier (mixed multi-byte code units only)")
+        << _("\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 9, 4);
+
+    // Comments
+    QTest::newRow("ascii comment /* ... */")
+        << _("/* hello world */") << createToken(T_COMMENT, 17, 17);
+    QTest::newRow("latin1 comment //")
+        << _("// hello world") << createToken(T_CPP_COMMENT, 14, 14);
+    QTest::newRow("non-latin1 comment /* ... */ (1)")
+        << _("/* \u00FC\u4E8C\U00010302 */") << createToken(T_COMMENT, 15, 10);
+    QTest::newRow("non-latin1 comment /* ... */ (2)")
+        << _("/*\u00FC\u4E8C\U00010302*/") << createToken(T_COMMENT, 13, 8);
+    QTest::newRow("non-latin1 comment // (1)")
+        << _("// \u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 12, 7);
+    QTest::newRow("non-latin1 comment // (2)")
+        << _("//\u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 11, 6);
+
+    // String Literals
+    QTest::newRow("latin1 string literal")
+        << _("\"hello\"") << createToken(T_STRING_LITERAL, 7, 7);
+    QTest::newRow("non-latin1 string literal")
+        << _("\"\u00FC\u4E8C\U00010302\"") << createToken(T_STRING_LITERAL, 11, 6);
+}
+
+static Token createToken(unsigned kind, unsigned byteOffset, unsigned bytes,
+                         unsigned utf16charsOffset, unsigned utf16chars)
+{
+    Token t;
+    t.f.kind = kind;
+    t.byteOffset = byteOffset;
+    t.f.bytes = bytes;
+    t.utf16charOffset = utf16charsOffset;
+    t.f.utf16chars = utf16chars;
+    return t;
+}
+
+void tst_SimpleLexer::offsets()
+{
+    QFETCH(QByteArray, source);
+    QFETCH(QList<Token>, expectedTokenList);
+
+    const TokenCompareFlags compareFlags = CompareKind
+            | CompareBytesBegin
+            | CompareBytesEnd
+            | CompareUtf16CharsBegin
+            | CompareUtf16CharsEnd
+            ;
+    run(source, expectedTokenList, false, compareFlags);
+}
+
+void tst_SimpleLexer::offsets_data()
+{
+    QTest::addColumn<QByteArray>("source");
+    QTest::addColumn<QList<Token> >("expectedTokenList");
+
+    typedef QByteArray _;
+
+    // LATIN1 Identifier
+    QTest::newRow("latin1 identifiers")
+        << _("var var") << (QList<Token>()
+            << createToken(T_IDENTIFIER, 0, 3, 0, 3)
+            << createToken(T_IDENTIFIER, 4, 3, 4, 3)
+        );
+
+    // NON-LATIN1 identifier
+    QTest::newRow("non-latin1 identifiers 1")
+        << _("var_\u00FC var_\u00FC") << (QList<Token>()
+            << createToken(T_IDENTIFIER, 0, 6, 0, 5)
+            << createToken(T_IDENTIFIER, 7, 6, 6, 5)
+        );
+    QTest::newRow("non-latin1 identifiers 2")
+        << _("\u00FC\u4E8C\U00010302 \u00FC\u4E8C\U00010302") << (QList<Token>()
+            << createToken(T_IDENTIFIER, 0, 9, 0, 4)
+            << createToken(T_IDENTIFIER, 10, 9, 5, 4)
+        );
+
+    QTest::newRow("non-latin1 identifiers 3")   // first code unit on line: <bytes> / <utf16char>
+        << _("class v\u00FC\u4E8C\U00010302\n"  //  0 / 0
+             "{\n"                              // 17 / 12
+             "public:\n"                        // 19 / 14
+             "    v\u00FC\u4E8C\U00010302();\n" // 27 / 22
+             "};\n") << (QList<Token>()         // 45 / 35
+            << createToken(T_CLASS, 0, 5, 0, 5)         // class
+            << createToken(T_IDENTIFIER, 6, 10, 6, 5)   // non-latin1 id
+            << createToken(T_LBRACE, 17, 1, 12, 1)      // {
+            << createToken(T_PUBLIC, 19, 6, 14, 6)      // public
+            << createToken(T_COLON, 25, 1, 20, 1)       // :
+            << createToken(T_IDENTIFIER, 31, 10, 26, 5) // id
+            << createToken(T_LPAREN, 41, 1, 31, 1)      // (
+            << createToken(T_RPAREN, 42, 1, 32, 1)      // )
+            << createToken(T_SEMICOLON, 43, 1, 33, 1)   // ;
+            << createToken(T_RBRACE, 45, 1, 35, 1)      // }
+            << createToken(T_SEMICOLON, 46, 1, 36, 1)   // ;
+        );
 }

 void tst_SimpleLexer::incremental()