C++: Support for UTF-8 in the lexer

This will save us toLatin1() conversions in CppTools (which already holds UTF-8 encoded QByteArrays) and thus loss of information (see QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers. API-wise the following functions are added to Token. In follow-up patches these will come in handy in combination with QStrings. utf16chars() - equivalent of bytes() utf16charsBegin() - equivalent of bytesBegin() utf16charsEnd() - equivalent of bytesEnd() Next steps: * Adapt functions from TranslationUnit. They should work with utf16 chars in order to calculate lines and columns correctly also for UTF-8 multi-byte code points. * Adapt the higher level clients: * Cpp{Tools,Editor} should expect UTF-8 encoded Literals. * Cpp{Tools,Editor}: When dealing with identifiers on the QString/QTextDocument layer, code points represented by two QChars need to be respected, too. * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report offsets usable in CppEditor/CppTools. Addresses QTCREATORBUG-7356. Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0 Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
2014-02-25 13:44:11 -03:00
parent 4fefb1ca2a
commit 70122b3061
12 changed files with 503 additions and 28 deletions
--- a/src/libs/3rdparty/cplusplus/Lexer.cpp
+++ b/src/libs/3rdparty/cplusplus/Lexer.cpp
@@ -29,6 +29,13 @@
 using namespace CPlusPlus;
 /*!
    \class Lexer
    \brief The Lexer generates tokens from a UTF-8 encoded source text.
    \sa Token
 */
 Lexer::Lexer(TranslationUnit *unit)
    : _translationUnit(unit),
      _control(unit->control()),
@@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar)
    _firstChar = firstChar;
    _lastChar = lastChar;
    _currentChar = _firstChar - 1;
    _currentCharUtf16 = -1;
    _tokenStart = _currentChar;
    _yychar = '\n';
 }
@@ -109,6 +117,7 @@ void Lexer::scan(Token *tok)
    tok->reset();
    scan_helper(tok);
    tok->f.bytes = _currentChar - _tokenStart;
    tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16;
 }
 void Lexer::scan_helper(Token *tok)
@@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok)
    _tokenStart = _currentChar;
    tok->byteOffset = _currentChar - _firstChar;
    _tokenStartUtf16 = _currentCharUtf16;
    tok->utf16charOffset = _currentCharUtf16;
    if (_yychar) {
        s._newlineExpected = false;
    } else if (s._tokenKind) {
@@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok)
            } else {
                scanIdentifier(tok);
            }
-        } else if (std::isalpha(ch) || ch == '_' || ch == '$') {
+        } else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
-            scanIdentifier(tok);
+            scanIdentifier(tok, _currentChar - _tokenStart - 1);
        } else if (std::isdigit(ch)) {
            scanNumericLiteral(tok);
        } else {
@@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok)
 void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
 {
    const char *yytext = _currentChar - 1 - extraProcessedChars;
-    while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
+    while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'
            || isByteOfMultiByteCodePoint(_yychar)) {
        yyinp();
    }
    int yylen = _currentChar - yytext;
    if (f._scanKeywords)
        tok->f.kind = classify(yytext, yylen, _languageFeatures);
--- a/src/libs/3rdparty/cplusplus/Lexer.h
+++ b/src/libs/3rdparty/cplusplus/Lexer.h
@@ -62,6 +62,7 @@ public:
    void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
 private:
    void pushLineStartOffset();
    void scan_helper(Token *tok);
    void setSource(const char *firstChar, const char *lastChar);
    static int classify(const char *string, int length, LanguageFeatures features);
@@ -77,15 +78,32 @@ private:
    void scanBackslash(Kind type);
    void scanCppComment(Kind type);
-    inline void yyinp()
+    static bool isByteOfMultiByteCodePoint(unsigned char byte)
    { return byte & 0x80; } // True when the most significant bit is set, i.e. for every non-ASCII byte
                            // (in UTF-8 text: both lead and continuation bytes of a multi-byte code point)
    // Steps past the code point that starts at _yychar: moves _currentChar forward
    // by its byte length, advances the UTF-16 position _currentCharUtf16 accordingly
    // and loads the byte found at the new position into _yychar.
    void yyinp()
    {
-        _yychar = *++_currentChar;
+        ++_currentCharUtf16;
        // Process multi-byte UTF-8 code point (non-latin1)
        if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
            // _yychar still holds the byte being left. Shifting it left by two drops
            // the two topmost bits; every further set top bit adds one trailing byte.
            // NOTE(review): this presumes _yychar is a UTF-8 lead byte (11xxxxxx);
            // for a stray continuation byte the count depends on payload bits - confirm.
            unsigned trailingBytesCurrentCodePoint = 1;
            for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
                ++trailingBytesCurrentCodePoint;
            // Code points >= 0x00010000 are represented by two UTF16 code units
            if (trailingBytesCurrentCodePoint >= 3)
                ++_currentCharUtf16;
            // Skip the lead byte plus all trailing bytes in one step.
            // NOTE(review): the jump is not checked against _lastChar in this function;
            // presumably the input is always well-formed UTF-8 - confirm for truncated input.
            _yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1);
        // Process single-byte UTF-8 code point (latin1)
        } else {
            _yychar = *++_currentChar;
        }
        // Record a line start whenever the newly loaded byte is a newline.
        if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
            pushLineStartOffset();
    }
    void pushLineStartOffset();
 private:
    struct Flags {
        unsigned _scanCommentTokens: 1;
@@ -105,6 +123,10 @@ private:
    const char *_lastChar;
    const char *_tokenStart;
    unsigned char _yychar;
    unsigned _currentCharUtf16;
    unsigned _tokenStartUtf16;
    union {
        unsigned char _state;
        State s;
@@ -113,6 +135,7 @@ private:
        unsigned _flags;
        Flags f;
    };
    unsigned _currentLine;
    LanguageFeatures _languageFeatures;
 };
--- a/src/libs/3rdparty/cplusplus/Token.cpp
+++ b/src/libs/3rdparty/cplusplus/Token.cpp
@@ -85,6 +85,7 @@ void Token::reset()
 {
    flags = 0;
    byteOffset = 0;
    utf16charOffset = 0;
    ptr = 0;
 }
--- a/src/libs/3rdparty/cplusplus/Token.h
+++ b/src/libs/3rdparty/cplusplus/Token.h
@@ -285,7 +285,7 @@ enum Kind {
 class CPLUSPLUS_EXPORT Token
 {
 public:
-    Token() : flags(0), byteOffset(0), ptr(0) {}
+    Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {}
    inline bool is(unsigned k) const    { return f.kind == k; }
    inline bool isNot(unsigned k) const { return f.kind != k; }
@@ -298,13 +298,14 @@ public:
    inline bool joined() const { return f.joined; }
    inline bool expanded() const { return f.expanded; }
    inline bool generated() const { return f.generated; }
    inline unsigned bytes() const { return f.bytes; }
    inline unsigned bytesBegin() const { return byteOffset; }
    inline unsigned bytesEnd() const { return byteOffset + f.bytes; }
-    inline unsigned bytesBegin() const
+    inline unsigned utf16chars() const { return f.utf16chars; }
-    { return byteOffset; }
+    inline unsigned utf16charsBegin() const { return utf16charOffset; }
-
+    inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; }
    inline unsigned bytesEnd() const
    { return byteOffset + f.bytes; }
    inline bool isLiteral() const
    { return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
@@ -354,15 +355,17 @@ public:
        unsigned generated     : 1;
        // Unused...
        unsigned pad           : 3;
-        // The token length in bytes.
+        // The token length in bytes and UTF16 chars.
        unsigned bytes         : 16;
        unsigned utf16chars    : 16;
    };
    union {
-        unsigned flags;
+        unsigned long flags;
        Flags f;
    };
    unsigned byteOffset;
    unsigned utf16charOffset;
    union {
        void *ptr;
@@ -393,5 +396,4 @@ struct LanguageFeatures
 } // namespace CPlusPlus
 #endif // CPLUSPLUS_TOKEN_H
--- a/src/libs/cplusplus/SimpleLexer.cpp
+++ b/src/libs/cplusplus/SimpleLexer.cpp
@@ -61,11 +61,11 @@ bool SimpleLexer::endedJoined() const
    return _endedJoined;
 }
-QList<Token> SimpleLexer::operator()(const QString &text, int state)
+QList<Token> SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8)
 {
    QList<Token> tokens;
-    const QByteArray bytes = text.toLatin1();
+    const QByteArray bytes = convertToUtf8 ? text.toUtf8() : text.toLatin1();
    const char *firstChar = bytes.constData();
    const char *lastChar = firstChar + bytes.size();
--- a/src/libs/cplusplus/SimpleLexer.h
+++ b/src/libs/cplusplus/SimpleLexer.h
@@ -54,7 +54,7 @@ public:
    bool endedJoined() const;
-    QList<Token> operator()(const QString &text, int state = 0);
+    QList<Token> operator()(const QString &text, int state = 0, bool convertToUtf8 = false);
    int state() const
    { return _lastState; }
--- a/tests/auto/cplusplus/cplusplus.pro
+++ b/tests/auto/cplusplus/cplusplus.pro
@@ -12,4 +12,5 @@ SUBDIRS = \
    misc \
    cxx11 \
    checksymbols \
-    lexer
+    lexer \
    translationunit
--- a/tests/auto/cplusplus/cplusplus.qbs
+++ b/tests/auto/cplusplus/cplusplus.qbs
@@ -13,6 +13,7 @@ Project {
        "misc/misc.qbs",
        "preprocessor/preprocessor.qbs",
        "semantic/semantic.qbs",
        "translationunit/translationunit.qbs",
        "typeprettyprinter/typeprettyprinter.qbs"
    ]
 }
--- a/tests/auto/cplusplus/lexer/tst_lexer.cpp
+++ b/tests/auto/cplusplus/lexer/tst_lexer.cpp
@@ -52,28 +52,49 @@ class tst_SimpleLexer: public QObject
 public:
    tst_SimpleLexer() : _state(0) {}
    enum TokenCompareFlag {
        CompareKind            = 1 << 1,
        CompareBytes           = 1 << 2,
        CompareBytesBegin      = 1 << 3,
        CompareBytesEnd        = 1 << 4,
        CompareUtf16Chars      = 1 << 5,
        CompareUtf16CharsBegin = 1 << 6,
        CompareUtf16CharsEnd   = 1 << 7
    };
    Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
 private slots:
    void basic();
    void basic_data();
    void incremental();
    void incremental_data();
    //
    // The following "non-latin1" code points are used in the tests following this comment:
    //
    //   U+00FC  - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
    //   U+4E8C  - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
    //   U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
    //
    void bytes_and_utf16chars();
    void bytes_and_utf16chars_data();
    void offsets();
    void offsets_data();
 private:
    static TokenList toTokenList(const TokenKindList &tokenKinds);
    enum TokenCompareFlag {
        CompareKind = 1 << 1
    };
    Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
    void run(const QByteArray &source,
             const TokenList &expectedTokenList,
             bool preserveState,
-             TokenCompareFlag compareFlags);
+             TokenCompareFlags compareFlags);
    int _state;
 };
 Q_DECLARE_OPERATORS_FOR_FLAGS(tst_SimpleLexer::TokenCompareFlags)
 TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
 {
    TokenList tokens;
@@ -88,10 +109,13 @@ TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
 void tst_SimpleLexer::run(const QByteArray &source,
                          const TokenList &expectedTokenList,
                          bool preserveState,
-                          TokenCompareFlag compareFlags)
+                          TokenCompareFlags compareFlags)
 {
    QVERIFY(compareFlags);
    SimpleLexer lexer;
-    const QList<Token> tokenList = lexer(source, preserveState ? _state : 0);
+    const QList<Token> tokenList = lexer(source, preserveState ? _state : 0,
                                         /*convertToUtf8=*/ true);
    if (preserveState)
        _state = lexer.state();
@@ -108,6 +132,20 @@ void tst_SimpleLexer::run(const QByteArray &source,
 #endif
        if (compareFlags & CompareKind)
            QCOMPARE(token.kind(), expectedToken.kind());
        if (compareFlags & CompareBytes)
            QCOMPARE(token.bytes(), expectedToken.bytes());
        if (compareFlags & CompareBytesBegin)
            QCOMPARE(token.bytesBegin(), expectedToken.bytesBegin());
        if (compareFlags & CompareBytesEnd)
            QCOMPARE(token.bytesEnd(), expectedToken.bytesEnd());
        if (compareFlags & CompareUtf16Chars)
            QCOMPARE(token.utf16chars(), expectedToken.utf16chars());
        if (compareFlags & CompareUtf16CharsBegin)
            QCOMPARE(token.utf16charsBegin(), expectedToken.utf16charsBegin());
        if (compareFlags & CompareUtf16CharsEnd)
            QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd());
    }
    QVERIFY2(i == expectedTokenList.size(), "Less tokens than expected.");
 }
@@ -221,7 +259,168 @@ void tst_SimpleLexer::basic_data()
        << T_LBRACKET << T_RBRACKET << T_LBRACE << T_RBRACE
        << T_IDENTIFIER << T_QUESTION << T_IDENTIFIER << T_COLON << T_IDENTIFIER;
    QTest::newRow(source) << source << expectedTokenKindList;
 }
 void tst_SimpleLexer::bytes_and_utf16chars()
 {
    QFETCH(QByteArray, source);
    QFETCH(QList<Token>, expectedTokenList);
    const TokenCompareFlags compareFlags = CompareKind | CompareBytes | CompareUtf16Chars;
    run(source, expectedTokenList, false, compareFlags);
 }
 // Builds a single-element expected-token list. Only the token kind and its
 // lengths (in bytes and in UTF-16 code units) are filled in; the offsets keep
 // the values set by Token's default constructor.
 static QList<Token> createToken(unsigned kind, unsigned bytes, unsigned utf16chars)
 {
    Token expected;
    expected.f.utf16chars = utf16chars;
    expected.f.bytes = bytes;
    expected.f.kind = kind;

    QList<Token> singleToken;
    singleToken.append(expected);
    return singleToken;
 }
 // Rows for bytes_and_utf16chars(). Every row is a source text that lexes to
 // exactly one token, together with that token's kind, its length in bytes and
 // its length in UTF-16 code units.
 void tst_SimpleLexer::bytes_and_utf16chars_data()
 {
    QTest::addColumn<QByteArray>("source");
    QTest::addColumn<QList<Token> >("expectedTokenList");

    struct Row {
        const char *name;
        const char *source;
        unsigned kind;
        unsigned bytes;
        unsigned utf16chars;
    };

    static const Row rows[] = {
        // LATIN1 Identifier
        { "latin1 identifier", "var", T_IDENTIFIER, 3, 3 },

        // NON-LATIN1 identifier (code point with 2 UTF8 code units)
        { "non-latin1 identifier (2-byte code unit at start)",
          "\u00FC_var", T_IDENTIFIER, 6, 5 },
        { "non-latin1 identifier (2-byte code unit in center)",
          "_v\u00FCr_", T_IDENTIFIER, 6, 5 },
        { "non-latin1 identifier (2-byte code unit at end)",
          "var_\u00FC", T_IDENTIFIER, 6, 5 },
        { "non-latin1 identifier (2-byte code unit only)",
          "\u00FC", T_IDENTIFIER, 2, 1 },

        // NON-LATIN1 identifier (code point with 3 UTF8 code units)
        { "non-latin1 identifier (3-byte code unit at start)",
          "\u4E8C_var", T_IDENTIFIER, 7, 5 },
        { "non-latin1 identifier (3-byte code unit in center)",
          "_v\u4E8Cr_", T_IDENTIFIER, 7, 5 },
        { "non-latin1 identifier (3-byte code unit at end)",
          "var_\u4E8C", T_IDENTIFIER, 7, 5 },
        { "non-latin1 identifier (3-byte code unit only)",
          "\u4E8C", T_IDENTIFIER, 3, 1 },

        // NON-LATIN1 identifier (code point with 4 UTF8 code units)
        { "non-latin1 identifier (4-byte code unit at start)",
          "\U00010302_var", T_IDENTIFIER, 8, 6 },
        { "non-latin1 identifier (4-byte code unit in center)",
          "_v\U00010302r_", T_IDENTIFIER, 8, 6 },
        { "non-latin1 identifier (4-byte code unit at end)",
          "var_\U00010302", T_IDENTIFIER, 8, 6 },
        { "non-latin1 identifier (4-byte code unit only)",
          "\U00010302", T_IDENTIFIER, 4, 2 },

        // NON-LATIN1 identifier (code points with several multi-byte UTF8 code units)
        { "non-latin1 identifier (mixed multi-byte code units at start)",
          "\u00FC\u4E8C\U00010302_var", T_IDENTIFIER, 13, 8 },
        { "non-latin1 identifier (mixed multi-byte code units in center)",
          "_v\u00FC\u4E8C\U00010302r_", T_IDENTIFIER, 13, 8 },
        { "non-latin1 identifier (mixed multi-byte code units at end)",
          "var_\u00FC\u4E8C\U00010302", T_IDENTIFIER, 13, 8 },
        { "non-latin1 identifier (mixed multi-byte code units only)",
          "\u00FC\u4E8C\U00010302", T_IDENTIFIER, 9, 4 },

        // Comments
        { "ascii comment /* ... */",
          "/* hello world */", T_COMMENT, 17, 17 },
        { "latin1 comment //",
          "// hello world", T_CPP_COMMENT, 14, 14 },
        { "non-latin1 comment /* ... */ (1)",
          "/* \u00FC\u4E8C\U00010302 */", T_COMMENT, 15, 10 },
        { "non-latin1 comment /* ... */ (2)",
          "/*\u00FC\u4E8C\U00010302*/", T_COMMENT, 13, 8 },
        { "non-latin1 comment // (1)",
          "// \u00FC\u4E8C\U00010302", T_CPP_COMMENT, 12, 7 },
        { "non-latin1 comment // (2)",
          "//\u00FC\u4E8C\U00010302", T_CPP_COMMENT, 11, 6 },

        // String Literals
        { "latin1 string literal",
          "\"hello\"", T_STRING_LITERAL, 7, 7 },
        { "non-latin1 string literal",
          "\"\u00FC\u4E8C\U00010302\"", T_STRING_LITERAL, 11, 6 }
    };

    const int rowCount = int(sizeof rows / sizeof rows[0]);
    for (int i = 0; i < rowCount; ++i) {
        const Row &row = rows[i];
        QTest::newRow(row.name)
            << QByteArray(row.source)
            << createToken(row.kind, row.bytes, row.utf16chars);
    }
 }
 // Builds one expected token carrying its kind, its start offset and length in
 // bytes, and its start offset and length in UTF-16 code units.
 static Token createToken(unsigned kind, unsigned byteOffset, unsigned bytes,
                         unsigned utf16charsOffset, unsigned utf16chars)
 {
    Token expected;

    // Position and size measured in bytes.
    expected.byteOffset = byteOffset;
    expected.f.bytes = bytes;

    // Position and size measured in UTF-16 code units.
    expected.utf16charOffset = utf16charsOffset;
    expected.f.utf16chars = utf16chars;

    expected.f.kind = kind;
    return expected;
 }
 void tst_SimpleLexer::offsets()
 {
    QFETCH(QByteArray, source);
    QFETCH(QList<Token>, expectedTokenList);
    const TokenCompareFlags compareFlags = CompareKind
            | CompareBytesBegin
            | CompareBytesEnd
            | CompareUtf16CharsBegin
            | CompareUtf16CharsEnd
            ;
    run(source, expectedTokenList, false, compareFlags);
 }
 void tst_SimpleLexer::offsets_data()
 {
    QTest::addColumn<QByteArray>("source");
    QTest::addColumn<QList<Token> >("expectedTokenList");
    typedef QByteArray _;
    // LATIN1 Identifier
    QTest::newRow("latin1 identifiers")
        << _("var var") << (QList<Token>()
            << createToken(T_IDENTIFIER, 0, 3, 0, 3)
            << createToken(T_IDENTIFIER, 4, 3, 4, 3)
        );
    // NON-LATIN1 identifier
    QTest::newRow("non-latin1 identifiers 1")
        << _("var_\u00FC var_\u00FC") << (QList<Token>()
            << createToken(T_IDENTIFIER, 0, 6, 0, 5)
            << createToken(T_IDENTIFIER, 7, 6, 6, 5)
        );
    QTest::newRow("non-latin1 identifiers 2")
        << _("\u00FC\u4E8C\U00010302 \u00FC\u4E8C\U00010302") << (QList<Token>()
            << createToken(T_IDENTIFIER, 0, 9, 0, 4)
            << createToken(T_IDENTIFIER, 10, 9, 5, 4)
        );
    QTest::newRow("non-latin1 identifiers 3")   // first code unit on line: <bytes> / <utf16char>
        << _("class v\u00FC\u4E8C\U00010302\n"  //  0 / 0
             "{\n"                              // 17 / 12
             "public:\n"                        // 19 / 14
             "    v\u00FC\u4E8C\U00010302();\n" // 27 / 22
             "};\n") << (QList<Token>()         // 45 / 35
            << createToken(T_CLASS, 0, 5, 0, 5)         // class
            << createToken(T_IDENTIFIER, 6, 10, 6, 5)   // non-latin1 id
            << createToken(T_LBRACE, 17, 1, 12, 1)      // {
            << createToken(T_PUBLIC, 19, 6, 14, 6)      // public
            << createToken(T_COLON, 25, 1, 20, 1)       // :
            << createToken(T_IDENTIFIER, 31, 10, 26, 5) // id
            << createToken(T_LPAREN, 41, 1, 31, 1)      // (
            << createToken(T_RPAREN, 42, 1, 32, 1)      // )
            << createToken(T_SEMICOLON, 43, 1, 33, 1)   // ;
            << createToken(T_RBRACE, 45, 1, 35, 1)      // }
            << createToken(T_SEMICOLON, 46, 1, 36, 1)   // ;
        );
 }
 void tst_SimpleLexer::incremental()
--- a/tests/auto/cplusplus/translationunit/translationunit.pro
+++ b/tests/auto/cplusplus/translationunit/translationunit.pro
@@ -0,0 +1,2 @@
 include(../shared/shared.pri)
 SOURCES += tst_translationunit.cpp
--- a/tests/auto/cplusplus/translationunit/translationunit.qbs
+++ b/tests/auto/cplusplus/translationunit/translationunit.qbs
@@ -0,0 +1,7 @@
 import qbs
 import "../cplusplusautotest.qbs" as CPlusPlusAutotest
 CPlusPlusAutotest {
    name: "CPlusPlus translation unit autotest"
    files: "tst_translationunit.cpp"
 }
--- a/tests/auto/cplusplus/translationunit/tst_translationunit.cpp
+++ b/tests/auto/cplusplus/translationunit/tst_translationunit.cpp
@@ -0,0 +1,225 @@
 /****************************************************************************
 **
 ** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies).
 ** Contact: http://www.qt-project.org/legal
 **
 ** This file is part of Qt Creator.
 **
 ** Commercial License Usage
 ** Licensees holding valid commercial Qt licenses may use this file in
 ** accordance with the commercial license agreement provided with the
 ** Software or, alternatively, in accordance with the terms contained in
 ** a written agreement between you and Digia.  For licensing terms and
 ** conditions see http://qt.digia.com/licensing.  For further information
 ** use the contact form at http://qt.digia.com/contact-us.
 **
 ** GNU Lesser General Public License Usage
 ** Alternatively, this file may be used under the terms of the GNU Lesser
 ** General Public License version 2.1 as published by the Free Software
 ** Foundation and appearing in the file LICENSE.LGPL included in the
 ** packaging of this file.  Please review the following information to
 ** ensure the GNU Lesser General Public License version 2.1 requirements
 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
 **
 ** In addition, as a special exception, Digia gives you certain additional
 ** rights.  These rights are described in the Digia Qt LGPL Exception
 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
 **
 ****************************************************************************/
 #include <cplusplus/PreprocessorClient.h>
 #include <cplusplus/PreprocessorEnvironment.h>
 #include <cplusplus/Token.h>
 #include <cplusplus/TranslationUnit.h>
 #include <cplusplus/pp-engine.h>
 #include <QtTest>
 #include <QDebug>
 //TESTED_COMPONENT=src/libs/cplusplus
 using namespace CPlusPlus;
 // Test fixture that preprocesses and parses small sources and then inspects the
 // identifier / string literal last handed out by the Control instance.
 class tst_TranslationUnit: public QObject
 {
    Q_OBJECT
 private slots:
    //
    // The following "non-latin1" code points are used in the tests following this comment:
    //
    //   U+00FC  - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
    //   U+4E8C  - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
    //   U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
    //
    void unicodeIdentifier();
    void unicodeIdentifier_data();
    void unicodeStringLiteral();
    void unicodeStringLiteral_data();
 private:
    // Bundles a Control, a TranslationUnit named "testFile" and a diagnostic
    // client that counts reported diagnostics.
    class Document
    {
    public:
        typedef QSharedPointer<Document> Ptr;
        // Preprocesses and parses \a source. Returns a null Ptr when at least one
        // diagnostic was reported while doing so, the parsed document otherwise.
        static Document::Ptr create(const QByteArray &source)
        {
            LanguageFeatures features;
            features.objCEnabled = true;
            features.qtEnabled = false;
            features.qtKeywordsEnabled = false;
            features.qtMocRunEnabled = false;
            Document::Ptr document = Document::Ptr(new Document);
            document->translationUnit()->setLanguageFeatures(features);
            // NOTE(review): preprocessedSource is a local; if the translation unit
            // keeps the pointer passed to setSource(), it must not be read after
            // create() returns - confirm against TranslationUnit.
            const QByteArray preprocessedSource = preprocess(source);
            document->translationUnit()->setSource(preprocessedSource.constData(),
                                                preprocessedSource.length());
            document->translationUnit()->parse();
            if (document->hasParsingErrors())
                return Document::Ptr();
            return document;
        }
    public:
        // m_control is declared before m_translationUnit, so it is already
        // constructed when its address is handed to the translation unit here.
        Document()
            : m_translationUnit(&m_control, m_control.stringLiteral("testFile"))
        {
            m_control.setDiagnosticClient(&m_diagnosticClient);
        }
        TranslationUnit *translationUnit()
        { return &m_translationUnit; }
        // True once the diagnostic client has received any report.
        bool hasParsingErrors() const
        { return m_diagnosticClient.errorCount != 0; }
        // Reads the element just before Control::lastIdentifier().
        // NOTE(review): presumably lastIdentifier() is a past-the-end iterator and
        // at least one identifier exists - verify against Control.
        const Identifier *lastIdentifier() const
        { return *(m_control.lastIdentifier() - 1); }
        // Same access pattern as lastIdentifier(), for string literals.
        const StringLiteral *lastStringLiteral() const
        { return *(m_control.lastStringLiteral() - 1); }
    private:
        // Runs the preprocessor over \a source with an empty environment and
        // comments kept, using "<stdin>" as file name; returns its output.
        static QByteArray preprocess(const QByteArray &source)
        {
            Client *client = 0; // no client.
            Environment env;
            Preprocessor preprocess(client, &env);
            preprocess.setKeepComments(true);
            return preprocess.run(QLatin1String("<stdin>"), source);
        }
    private:
        Control m_control;
        TranslationUnit m_translationUnit;
        // Counts every reported diagnostic and echoes it via qDebug().
        class Diagnostic: public DiagnosticClient {
        public:
            int errorCount;
            Diagnostic() : errorCount(0) {}
            // The level argument is ignored: each call increments errorCount,
            // whatever severity the caller passed.
            void report(int /*level*/, const StringLiteral *fileName, unsigned line,
                        unsigned column, const char *format, va_list ap)
            {
                ++errorCount;
                qDebug() << fileName->chars() << ':' << line << ':' << column
                         << ' ' << QString().vsprintf(format, ap);
            }
        } m_diagnosticClient;
    };
 };
 void tst_TranslationUnit::unicodeIdentifier()
 {
    QFETCH(QByteArray, identifierText);
    Document::Ptr document = Document::create("void " + identifierText + ";");
    QVERIFY(document);
    const Identifier *actual = document->lastIdentifier();
    QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()),
             QString::fromUtf8(identifierText));
 }
 // Rows for unicodeIdentifier(): one identifier spelling per row.
 void tst_TranslationUnit::unicodeIdentifier_data()
 {
    QTest::addColumn<QByteArray>("identifierText");

    struct Row {
        const char *name;
        const char *identifier;
    };

    static const Row rows[] = {
        { "latin1 identifier", "var" },
        { "non-latin1 identifier 1", "prefix\u00FC\u4E8C\U00010302" },
        { "non-latin1 identifier 2", "prefix\U00010302\u00FC\u4E8C" },
        { "non-latin1 identifier 3", "\U00010302\u00FC\u4E8C" },
        { "non-latin1 identifier 4", "\u4E8C\U00010302\u00FC" },
        { "non-latin1 identifier 5", "\u4E8C\U00010302\u00FCsuffix" },
        { "non-latin1 identifier 6", "\U00010302\u00FC\u4E8Csuffix" },
        // Some special cases (different code path inside lexer)
        { "non-latin1 identifier 7", "LR\U00010302\u00FC\u4E8C" },
        { "non-latin1 identifier 8", "u8R\U00010302\u00FC\u4E8C" },
        { "non-latin1 identifier 9", "u8\U00010302\u00FC\u4E8C" },
        { "non-latin1 identifier 10", "u\U00010302\u00FC\u4E8C" }
    };

    const int rowCount = int(sizeof rows / sizeof rows[0]);
    for (int i = 0; i < rowCount; ++i)
        QTest::newRow(rows[i].name) << QByteArray(rows[i].identifier);
 }
 // Returns the text that follows the first '"' in \a literal, with all trailing
 // whitespace and '"' characters removed. Anything in front of the first quote
 // (for example an encoding prefix such as L, u8, u or U) is dropped as well.
 // Returns an empty array when \a literal contains no '"' at all, and also for
 // the empty literal "".
 static QByteArray stripQuotesFromLiteral(const QByteArray &literal)
 {
    // Strip front: everything up to and including the opening quote.
    const int openingQuote = literal.indexOf('"');
    if (openingQuote < 0)
        return QByteArray();
    QByteArray result = literal.mid(openingQuote + 1);

    // Strip end: closing quote(s) and whitespace around them.
    while (!result.isEmpty()) {
        // std::isspace() is only defined for values representable as unsigned
        // char (or EOF). The bytes of multi-byte UTF-8 sequences are negative
        // where plain char is signed, so convert before classifying.
        const unsigned char last = static_cast<unsigned char>(result.at(result.size() - 1));
        if (!std::isspace(last) && last != '"')
            break;
        result.chop(1);
    }
    return result;
 }
 void tst_TranslationUnit::unicodeStringLiteral()
 {
    QFETCH(QByteArray, literalText);
    Document::Ptr document = Document::create("char t[] = " + literalText + ";");
    QVERIFY(document);
    const StringLiteral *actual = document->lastStringLiteral();
    QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()),
             QString::fromUtf8(stripQuotesFromLiteral(literalText)));
 }
 // Rows for unicodeStringLiteral(): one string literal, quotes and optional
 // encoding prefix included, per row.
 void tst_TranslationUnit::unicodeStringLiteral_data()
 {
    QTest::addColumn<QByteArray>("literalText");

    struct Row {
        const char *name;
        const char *literal;
    };

    static const Row rows[] = {
        { "latin1 literal", "\"var\"" },
        { "non-latin1 literal 1", "\"prefix\u00FC\u4E8C\U00010302\"" },
        { "non-latin1 literal 2", "\"prefix\U00010302\u00FC\u4E8C\"" },
        { "non-latin1 literal 3", "\"\U00010302\u00FC\u4E8C\"" },
        { "non-latin1 literal 4", "\"\u4E8C\U00010302\u00FC\"" },
        { "non-latin1 literal 5", "\"\u4E8C\U00010302\u00FCsuffix\"" },
        { "non-latin1 literal 6", "\"\U00010302\u00FC\u4E8Csuffix\"" },
        { "non-latin1 literal 7", "L\"\U00010302\u00FC\u4E8C\"" },
        { "non-latin1 literal 8", "u8\"\U00010302\u00FC\u4E8C\"" },
        { "non-latin1 literal 9", "u\"\U00010302\u00FC\u4E8C\"" },
        { "non-latin1 literal 10", "U\"\U00010302\u00FC\u4E8C\"" }
    };

    const int rowCount = int(sizeof rows / sizeof rows[0]);
    for (int i = 0; i < rowCount; ++i)
        QTest::newRow(rows[i].name) << QByteArray(rows[i].literal);
 }
 QTEST_APPLESS_MAIN(tst_TranslationUnit)
 #include "tst_translationunit.moc"
		`@@ -0,0 +1,2 @@`
							`include(../shared/shared.pri)`
							`SOURCES += tst_translationunit.cpp`