C++: Support for UTF-8 in the lexer

This saves us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus avoids loss of information
(see QTCREATORBUG-7356). It also gives us support for non-latin1
identifiers.

API-wise, the following functions are added to Token; follow-up
patches will put them to use in combination with QStrings:
    utf16chars()      - equivalent of bytes()
    utf16charsBegin() - equivalent of bytesBegin()
    utf16charsEnd()   - equivalent of bytesEnd()
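
For illustration only (not part of this patch), a minimal sketch of how a
client could use the new accessors once the editors operate on UTF-16
offsets. The helper name is hypothetical; it assumes the source text is
available as a QString:

    // Hypothetical helper: utf16charsBegin()/utf16chars() index directly
    // into the QString-based document layer, so no re-encoding is needed.
    QString tokenText(const QString &source, const CPlusPlus::Token &tk)
    {
        return source.mid(tk.utf16charsBegin(), tk.utf16chars());
    }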

Next steps:
 * Adapt functions from TranslationUnit. They should work with utf16
   chars in order to calculate lines and columns correctly for UTF-8
   multi-byte code points as well.
 * Adapt the higher level clients:
    * Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
    * Cpp{Tools,Editor}: When dealing with identifiers on the
      QString/QTextDocument layer, code points represented by two
      QChars (surrogate pairs) need to be respected, too (see the
      sketch after this list).
 * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
   offsets usable in CppEditor/CppTools.
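
For the TranslationUnit and surrogate-pair items above, a minimal sketch
(hypothetical helper, not from this patch) of how UTF-16 code-unit counts
could be derived from UTF-8 lead bytes: only 4-byte UTF-8 sequences encode
code points above U+FFFF, and only those need two QChars.

    // Hypothetical: number of UTF-16 code units (QChars) occupied by the
    // code point whose UTF-8 encoding starts with leadByte.
    static inline int utf16CharsForUtf8LeadByte(unsigned char leadByte)
    {
        if ((leadByte & 0xF8) == 0xF0)
            return 2;   // 4-byte sequence -> code point > U+FFFF -> surrogate pair
        return 1;       // ASCII or 2-/3-byte sequence -> single QChar
    }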

Addresses QTCREATORBUG-7356.

Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
Author: Nikolai Kosjar
Date:   2014-02-25 13:44:11 -03:00
Commit: 70122b3061 (parent 4fefb1ca2a)
12 changed files with 503 additions and 28 deletions


@@ -29,6 +29,13 @@
using namespace CPlusPlus;
/*!
\class Lexer
\brief The Lexer generates tokens from a UTF-8 encoded source text.
\sa Token
*/
Lexer::Lexer(TranslationUnit *unit)
: _translationUnit(unit),
_control(unit->control()),
@@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar)
_firstChar = firstChar;
_lastChar = lastChar;
_currentChar = _firstChar - 1;
_currentCharUtf16 = -1;
_tokenStart = _currentChar;
_yychar = '\n';
}
@@ -109,6 +117,7 @@ void Lexer::scan(Token *tok)
tok->reset();
scan_helper(tok);
tok->f.bytes = _currentChar - _tokenStart;
tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16;
}
void Lexer::scan_helper(Token *tok)
@@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok)
_tokenStart = _currentChar;
tok->byteOffset = _currentChar - _firstChar;
_tokenStartUtf16 = _currentCharUtf16;
tok->utf16charOffset = _currentCharUtf16;
if (_yychar) {
s._newlineExpected = false;
} else if (s._tokenKind) {
@@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok)
} else {
scanIdentifier(tok);
}
} else if (std::isalpha(ch) || ch == '_' || ch == '$') {
scanIdentifier(tok);
} else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
scanIdentifier(tok, _currentChar - _tokenStart - 1);
} else if (std::isdigit(ch)) {
scanNumericLiteral(tok);
} else {
@@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok)
void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
{
const char *yytext = _currentChar - 1 - extraProcessedChars;
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'
|| isByteOfMultiByteCodePoint(_yychar)) {
yyinp();
}
int yylen = _currentChar - yytext;
if (f._scanKeywords)
tok->f.kind = classify(yytext, yylen, _languageFeatures);
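
The hunks above rely on isByteOfMultiByteCodePoint(); its body is not shown
in this excerpt, but a plausible implementation just tests the high bit,
since every byte of a UTF-8 multi-byte sequence (lead or continuation) has
it set, while ASCII bytes never do:

    // Plausible implementation (assumption; the definition is not part of
    // the hunks shown above).
    static inline bool isByteOfMultiByteCodePoint(unsigned char byte)
    {
        return byte & 0x80;    // lead and continuation bytes are >= 0x80
    }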