C++: Introduce unicode char/strings support

Those are the types char16_t and char32_t along with the new char/string literals u'', U'', u"", u8"", and U"". This is particularly important for the use of QStringLiteral, since on some platforms it relies on expansions such as the above. Note: The string literal quickfixes still need some tuning. Task-number: QTCREATORBUG-7449 Change-Id: Iebcfea15677dc8e0ebb6143def89a5477e1be7d4 Reviewed-by: hjk <qthjk@ovi.com>
2012-06-06 13:41:22 +02:00
parent b88a5f5d38
commit 23c637c4f6
17 changed files with 242 additions and 146 deletions
--- a/src/libs/3rdparty/cplusplus/Bind.cpp
+++ b/src/libs/3rdparty/cplusplus/Bind.cpp
@@ -2750,6 +2750,18 @@ bool Bind::visit(SimpleSpecifierAST *ast)
            _type.setType(control()->integerType(IntegerType::Char));
            break;

+        case T_CHAR16_T:
+            if (_type)
+                translationUnit()->error(ast->specifier_token, "duplicate data type in declaration");
+            _type.setType(control()->integerType(IntegerType::Char16));
+            break;
+
+        case T_CHAR32_T:
+            if (_type)
+                translationUnit()->error(ast->specifier_token, "duplicate data type in declaration");
+            _type.setType(control()->integerType(IntegerType::Char32));
+            break;
+
        case T_WCHAR_T:
            if (_type)
                translationUnit()->error(ast->specifier_token, "duplicate data type in declaration");
--- a/src/libs/3rdparty/cplusplus/CoreTypes.h
+++ b/src/libs/3rdparty/cplusplus/CoreTypes.h
@@ -70,6 +70,8 @@ class CPLUSPLUS_EXPORT IntegerType: public Type
 public:
    enum Kind {
        Char,
+        Char16,
+        Char32,
        WideChar,
        Bool,
        Short,
--- a/src/libs/3rdparty/cplusplus/Keywords.cpp
+++ b/src/libs/3rdparty/cplusplus/Keywords.cpp
@@ -778,6 +778,28 @@ static inline int classify8(const char *s, bool q, bool x) {
          }
        }
      }
+    } else if (x && s[1] == 'h') {
+        if (s[2] == 'a') {
+            if (s[3] == 'r') {
+                if (s[4] == '1') {
+                    if (s[5] == '6') {
+                        if (s[6] == '_') {
+                            if (s[7] == 't') {
+                                return T_CHAR16_T;
+                            }
+                        }
+                    }
+                } else if (s[4] == '3') {
+                    if (s[5] == '2') {
+                        if (s[6] == '_') {
+                            if (s[7] == 't') {
+                                return T_CHAR32_T;
+                            }
+                        }
+                    }
+                }
+            }
+        }
    }
  }
  else if (x && s[0] == 'd') {
--- a/src/libs/3rdparty/cplusplus/Lexer.cpp
+++ b/src/libs/3rdparty/cplusplus/Lexer.cpp
@@ -224,37 +224,13 @@ void Lexer::scan_helper(Token *tok)
        }
        goto _Lagain;

-    case '"': case '\'': {
-        const char quote = ch;
+    case '"':
+        scanStringLiteral(tok);
+        break;

-        tok->f.kind = quote == '"'
-            ? T_STRING_LITERAL
-            : T_CHAR_LITERAL;
-
-        const char *yytext = _currentChar;
-
-        while (_yychar && _yychar != quote) {
-            if (_yychar == '\n')
-                break;
-            else if (_yychar != '\\')
-                yyinp();
-            else {
-                yyinp(); // skip `\\'
-
-                if (_yychar)
-                    yyinp();
-            }
-        }
-        // assert(_yychar == quote);
-
-        int yylen = _currentChar - yytext;
-
-        if (_yychar == quote)
-            yyinp();
-
-        if (control())
-            tok->string = control()->stringLiteral(yytext, yylen);
-    } break;
+    case '\'':
+        scanCharLiteral(tok);
+        break;

    case '{':
        tok->f.kind = T_LBRACE;
@@ -589,112 +565,148 @@ void Lexer::scan_helper(Token *tok)
                tok->f.kind = classifyObjCAtKeyword(yytext, yylen);
                break;
            } else if (ch == '@' && _yychar == '"') {
-                // objc @string literals
                yyinp();
-                tok->f.kind = T_AT_STRING_LITERAL;
-
-                const char *yytext = _currentChar;
-
-                while (_yychar && _yychar != '"') {
-                    if (_yychar != '\\')
-                        yyinp();
-                    else {
-                        yyinp(); // skip `\\'
-
-                        if (_yychar)
-                            yyinp();
-                    }
-                }
-                // assert(_yychar == '"');
-
-                int yylen = _currentChar - yytext;
-
-                if (_yychar == '"')
-                    yyinp();
-
-                if (control())
-                    tok->string = control()->stringLiteral(yytext, yylen);
-
+                scanStringLiteral(tok, '"');
                break;
            }
        }

-        if (ch == 'L' && (_yychar == '"' || _yychar == '\'')) {
-            // wide char/string literals
-            ch = _yychar;
-            yyinp();
-
-            const char quote = ch;
-
-            tok->f.kind = quote == '"'
-                ? T_WIDE_STRING_LITERAL
-                : T_WIDE_CHAR_LITERAL;
-
-            const char *yytext = _currentChar;
-
-            while (_yychar && _yychar != quote) {
-                if (_yychar != '\\')
-                    yyinp();
-                else {
-                    yyinp(); // skip `\\'
-
-                    if (_yychar)
-                        yyinp();
-                }
-            }
-            // assert(_yychar == quote);
-
-            int yylen = _currentChar - yytext;
-
-            if (_yychar == quote)
+        if (ch == 'L' || ch == 'u' || ch == 'U') {
+            // Either a literal or still an identifier.
+            if (_yychar == '"') {
                yyinp();
-
-            if (control())
-                tok->string = control()->stringLiteral(yytext, yylen);
-        } else if (std::isalpha(ch) || ch == '_' || ch == '$') {
-            const char *yytext = _currentChar - 1;
-            while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
+                scanStringLiteral(tok, ch);
+            } else if (_yychar == '\'') {
                yyinp();
-            int yylen = _currentChar - yytext;
-            if (f._scanKeywords)
-                tok->f.kind = classify(yytext, yylen, f._qtMocRunEnabled, f._cxx0xEnabled);
-            else
-                tok->f.kind = T_IDENTIFIER;
-
-            if (tok->f.kind == T_IDENTIFIER) {
-                tok->f.kind = classifyOperator(yytext, yylen);
-
-                if (control())
-                    tok->identifier = control()->identifier(yytext, yylen);
-            }
-            break;
-        } else if (std::isdigit(ch)) {
-            const char *yytext = _currentChar - 1;
-            while (_yychar) {
-                if (_yychar == 'e' || _yychar == 'E') {
-                    yyinp();
-                    if (_yychar == '-' || _yychar == '+') {
+                scanCharLiteral(tok, ch);
+            } else {
+                if (_yychar == '8') {
+                    unsigned char la = 0;
+                    if (_currentChar + 1 != _lastChar)
+                        la = *(_currentChar + 1);
+                    if (la == '"') {
                        yyinp();
-                        // ### assert(std::isdigit(_yychar));
+                        yyinp();
+                        scanStringLiteral(tok, '8');
+                    } else if (la == '\'') {
+                        yyinp();
+                        yyinp();
+                        scanCharLiteral(tok, '8');
+                    } else {
+                        scanIdentifier(tok);
                    }
-                } else if (std::isalnum(_yychar) || _yychar == '.') {
-                    yyinp();
                } else {
-                    break;
+                    scanIdentifier(tok);
                }
            }
-            int yylen = _currentChar - yytext;
-            tok->f.kind = T_NUMERIC_LITERAL;
-            if (control())
-                tok->number = control()->numericLiteral(yytext, yylen);
-            break;
+        } else if (std::isalpha(ch) || ch == '_' || ch == '$') {
+            scanIdentifier(tok);
+        } else if (std::isdigit(ch)) {
+            scanNumericLiteral(tok);
        } else {
            tok->f.kind = T_ERROR;
-            break;
        }
+        break;
    } // default

    } // switch
 }

+// Scans the body of a string literal and classifies the resulting token.
+//
+// The call sites in scan_helper() consume the literal's prefix and opening
+// '"' before calling this. `hint` carries the prefix that introduced the
+// literal:
+//   'L' -> wide, 'U' -> UTF-32, 'u' -> UTF-16, '8' -> UTF-8 (u8 prefix),
+//   '@' -> Objective-C string; any other value (including the default 0)
+//   yields a plain string literal.
+//
+// NOTE(review): the Objective-C @"..." call site in scan_helper() passes '"'
+// rather than '@', so those literals currently end up in the default case.
+void Lexer::scanStringLiteral(Token *tok, unsigned char hint)
+{
+    // Consume the text first; the token kind is assigned afterwards.
+    scanUntilQuote(tok, '"');
+
+    switch (hint) {
+    case 'L':
+        tok->f.kind = T_WIDE_STRING_LITERAL;
+        break;
+    case 'U':
+        tok->f.kind = T_UTF32_STRING_LITERAL;
+        break;
+    case 'u':
+        tok->f.kind = T_UTF16_STRING_LITERAL;
+        break;
+    case '8':
+        tok->f.kind = T_UTF8_STRING_LITERAL;
+        break;
+    case '@':
+        tok->f.kind = T_AT_STRING_LITERAL;
+        break;
+    default:
+        tok->f.kind = T_STRING_LITERAL;
+        break;
+    }
+}
+
+// Scans the body of a character literal and classifies the resulting token.
+//
+// The call sites in scan_helper() consume the literal's prefix and opening
+// '\'' before calling this. `hint` carries the prefix that introduced the
+// literal: 'L' -> wide, 'U' -> UTF-32, 'u' -> UTF-16. Every other value
+// (the default 0, and also the '8' passed for a u8 prefix) yields a plain
+// character literal.
+void Lexer::scanCharLiteral(Token *tok, unsigned char hint)
+{
+    // Consume the text first; the token kind is assigned afterwards.
+    scanUntilQuote(tok, '\'');
+
+    switch (hint) {
+    case 'L':
+        tok->f.kind = T_WIDE_CHAR_LITERAL;
+        break;
+    case 'U':
+        tok->f.kind = T_UTF32_CHAR_LITERAL;
+        break;
+    case 'u':
+        tok->f.kind = T_UTF16_CHAR_LITERAL;
+        break;
+    default:
+        tok->f.kind = T_CHAR_LITERAL;
+        break;
+    }
+}
+
+// Consumes the text of a char/string literal up to its closing `quote`.
+//
+// `quote` must be '"' or '\''. Scanning starts at the current position and
+// stops at the first unescaped `quote`, at an unescaped newline, or at the
+// end of input. A backslash escapes the character that follows it, so an
+// escaped quote or a backslash-newline continuation does not end the literal.
+// The closing quote is consumed when present but is not part of the stored
+// text. When control() is available the text is stored in tok->string.
+//
+// The token kind is not set here; callers assign it.
+void Lexer::scanUntilQuote(Token *tok, unsigned char quote)
+{
+    assert(quote == '"' || quote == '\'');
+
+    const char *yytext = _currentChar;
+    while (_yychar && _yychar != quote) {
+        // An unescaped newline ends an unterminated literal, so a missing
+        // closing quote cannot swallow the lines that follow it.
+        if (_yychar == '\n')
+            break;
+
+        if (_yychar != '\\') {
+            yyinp();
+        } else {
+            yyinp(); // skip the backslash
+
+            if (_yychar)
+                yyinp(); // skip the escaped character (possibly a newline)
+        }
+    }
+    int yylen = _currentChar - yytext;
+
+    if (_yychar == quote)
+        yyinp();
+
+    if (control())
+        tok->string = control()->stringLiteral(yytext, yylen);
+}
+
+// Scans a numeric literal whose first character has already been consumed
+// (the lexeme starts one character before the current position).
+//
+// Alphanumeric characters and '.' are accepted greedily. An 'e' or 'E' may
+// additionally be followed by one '+' or '-' sign. The token kind is set to
+// T_NUMERIC_LITERAL and, when control() is available, the lexeme is stored
+// in tok->number.
+void Lexer::scanNumericLiteral(Token *tok)
+{
+    const char *start = _currentChar - 1;
+
+    while (_yychar) {
+        const bool exponentMarker = (_yychar == 'e' || _yychar == 'E');
+        if (!exponentMarker && !std::isalnum(_yychar) && _yychar != '.')
+            break;
+
+        yyinp();
+
+        // Only an exponent marker may be followed by a sign.
+        // ### assert(std::isdigit(_yychar)) after the sign
+        if (exponentMarker && (_yychar == '-' || _yychar == '+'))
+            yyinp();
+    }
+
+    const int length = _currentChar - start;
+    tok->f.kind = T_NUMERIC_LITERAL;
+
+    if (control())
+        tok->number = control()->numericLiteral(start, length);
+}
+
+// Scans an identifier-like lexeme whose first character has already been
+// consumed (the lexeme starts one character before the current position).
+//
+// Alphanumerics, '_' and '$' are accepted. With keyword scanning enabled the
+// lexeme is run through classify(); otherwise it starts out as T_IDENTIFIER.
+// A lexeme that is still T_IDENTIFIER is then passed to classifyOperator()
+// and, when control() is available, recorded in tok->identifier.
+void Lexer::scanIdentifier(Token *tok)
+{
+    const char *start = _currentChar - 1;
+    for (;;) {
+        const bool partOfName = std::isalnum(_yychar) || _yychar == '_' || _yychar == '$';
+        if (!partOfName)
+            break;
+        yyinp();
+    }
+    const int length = _currentChar - start;
+
+    tok->f.kind = f._scanKeywords
+        ? classify(start, length, f._qtMocRunEnabled, f._cxx0xEnabled)
+        : T_IDENTIFIER;
+
+    // Keywords are complete at this point.
+    if (tok->f.kind != T_IDENTIFIER)
+        return;
+
+    tok->f.kind = classifyOperator(start, length);
+
+    if (control())
+        tok->identifier = control()->identifier(start, length);
+}
--- a/src/libs/3rdparty/cplusplus/Lexer.h
+++ b/src/libs/3rdparty/cplusplus/Lexer.h
@@ -90,6 +90,12 @@ private:
    static int classifyObjCAtKeyword(const char *s, int n);
    static int classifyOperator(const char *string, int length);

+    void scanStringLiteral(Token *tok, unsigned char hint = 0); // hint: literal prefix ('L', 'u', 'U', '8' for u8, '@'); anything else means a plain string
+    void scanCharLiteral(Token *tok, unsigned char hint = 0); // hint: literal prefix ('L', 'u', 'U'); anything else means a plain char
+    void scanUntilQuote(Token *tok, unsigned char quote); // quote is '"' or '\''; consumes the literal's text and closing quote
+    void scanNumericLiteral(Token *tok); // sets the token kind to T_NUMERIC_LITERAL
+    void scanIdentifier(Token *tok); // classifies the lexeme as keyword, operator name or identifier
+
    inline void yyinp()
    {
        if (++_currentChar == _lastChar)
--- a/src/libs/3rdparty/cplusplus/Parser.cpp
+++ b/src/libs/3rdparty/cplusplus/Parser.cpp
@@ -313,6 +313,8 @@ bool Parser::skipUntilStatement()
            case T_CATCH:
            case T_THROW:
            case T_CHAR:
+            case T_CHAR16_T:
+            case T_CHAR32_T:
            case T_WCHAR_T:
            case T_BOOL:
            case T_SHORT:
@@ -2811,12 +2813,21 @@ bool Parser::parseUnqualifiedName(NameAST *&node, bool acceptTemplateId)
 bool Parser::parseStringLiteral(ExpressionAST *&node)
 {
    DEBUG_THIS_RULE();
-    if (! (LA() == T_STRING_LITERAL || LA() == T_WIDE_STRING_LITERAL))
+    if (! (LA() == T_STRING_LITERAL
+           || LA() == T_WIDE_STRING_LITERAL
+           || LA() == T_UTF8_STRING_LITERAL
+           || LA() == T_UTF16_STRING_LITERAL
+           || LA() == T_UTF32_STRING_LITERAL)) {
        return false;
+    }

    StringLiteralAST **ast = reinterpret_cast<StringLiteralAST **> (&node);

-    while (LA() == T_STRING_LITERAL || LA() == T_WIDE_STRING_LITERAL) {
+    while (LA() == T_STRING_LITERAL
+           || LA() == T_WIDE_STRING_LITERAL
+           || LA() == T_UTF8_STRING_LITERAL
+           || LA() == T_UTF16_STRING_LITERAL
+           || LA() == T_UTF32_STRING_LITERAL) {
        *ast = new (_pool) StringLiteralAST;
        (*ast)->literal_token = consumeToken();
        ast = &(*ast)->next;
@@ -3541,6 +3552,8 @@ bool Parser::lookAtBuiltinTypeSpecifier() const
 {
    switch (LA()) {
    case T_CHAR:
+    case T_CHAR16_T:
+    case T_CHAR32_T:
    case T_WCHAR_T:
    case T_BOOL:
    case T_SHORT:
@@ -3982,7 +3995,9 @@ bool Parser::parseNumericLiteral(ExpressionAST *&node)
    DEBUG_THIS_RULE();
    if (LA() == T_NUMERIC_LITERAL  ||
        LA() == T_CHAR_LITERAL     ||
-        LA() == T_WIDE_CHAR_LITERAL) {
+        LA() == T_WIDE_CHAR_LITERAL ||
+        LA() == T_UTF16_CHAR_LITERAL ||
+        LA() == T_UTF32_CHAR_LITERAL) {
        NumericLiteralAST *ast = new (_pool) NumericLiteralAST;
        ast->literal_token = consumeToken();
        node = ast;
@@ -4021,6 +4036,9 @@ bool Parser::parsePrimaryExpression(ExpressionAST *&node)
    switch (LA()) {
    case T_STRING_LITERAL:
    case T_WIDE_STRING_LITERAL:
+    case T_UTF8_STRING_LITERAL:
+    case T_UTF16_STRING_LITERAL:
+    case T_UTF32_STRING_LITERAL:
        return parseStringLiteral(node);

    case T_NULLPTR:
@@ -4030,6 +4048,8 @@ bool Parser::parsePrimaryExpression(ExpressionAST *&node)

    case T_CHAR_LITERAL: // ### FIXME don't use NumericLiteral for chars
    case T_WIDE_CHAR_LITERAL:
+    case T_UTF16_CHAR_LITERAL:
+    case T_UTF32_CHAR_LITERAL:
    case T_NUMERIC_LITERAL:
        return parseNumericLiteral(node);

--- a/src/libs/3rdparty/cplusplus/Token.cpp
+++ b/src/libs/3rdparty/cplusplus/Token.cpp
@@ -29,8 +29,12 @@ static const char *token_names[] = {
    ("<C++ comment>"), ("<C++ doxy comment>"),
    ("<comment>"), ("<doxy comment>"),

-    ("<identifier>"), ("<numeric literal>"), ("<char literal>"),
-    ("<wide char literal>"), ("<string literal>"), ("<wide char literal>"),
+    ("<identifier>"),
+
+    ("<numeric literal>"),
+    ("<char literal>"), ("<wide char literal>"), ("<utf16 char literal>"), ("<utf32 char literal>"),
+    ("<string literal>"), ("<wide string literal>"), ("<utf8 string literal>"),
+    ("<utf16 string literal>"), ("<utf32 string literal>"),
    ("<@string literal>"), ("<angle string literal>"),

    ("&"), ("&&"), ("&="), ("->"), ("->*"), ("^"), ("^="), (":"), ("::"),
@@ -40,7 +44,8 @@ static const char *token_names[] = {
    ("|="), ("||"), ("+"), ("+="), ("++"), ("#"), ("##"), ("?"), ("}"),
    ("]"), (")"), (";"), ("*"), ("*="), ("~"), ("~="),

-    ("asm"), ("auto"), ("bool"), ("break"), ("case"), ("catch"), ("char"),
+    ("asm"), ("auto"), ("bool"), ("break"), ("case"), ("catch"),
+    ("char"), ("char16_t"), ("char32_t"),
    ("class"), ("const"), ("const_cast"), ("constexpr"), ("continue"),
    ("decltype"), ("default"),
    ("delete"), ("do"), ("double"), ("dynamic_cast"), ("else"), ("enum"),
@@ -92,11 +97,16 @@ const char *Token::spell() const

    case T_NUMERIC_LITERAL:
    case T_CHAR_LITERAL:
+    case T_WIDE_CHAR_LITERAL:
+    case T_UTF16_CHAR_LITERAL:
+    case T_UTF32_CHAR_LITERAL:
    case T_STRING_LITERAL:
+    case T_WIDE_STRING_LITERAL:
+    case T_UTF8_STRING_LITERAL:
+    case T_UTF16_STRING_LITERAL:
+    case T_UTF32_STRING_LITERAL:
    case T_AT_STRING_LITERAL:
    case T_ANGLE_STRING_LITERAL:
-    case T_WIDE_CHAR_LITERAL:
-    case T_WIDE_STRING_LITERAL:
        return literal->chars();

    default:
--- a/src/libs/3rdparty/cplusplus/Token.h
+++ b/src/libs/3rdparty/cplusplus/Token.h
@@ -40,10 +40,15 @@ enum Kind {
    T_FIRST_CHAR_LITERAL,
    T_CHAR_LITERAL = T_FIRST_CHAR_LITERAL,
    T_WIDE_CHAR_LITERAL,
-    T_LAST_CHAR_LITERAL = T_WIDE_CHAR_LITERAL,
+    T_UTF16_CHAR_LITERAL,
+    T_UTF32_CHAR_LITERAL,
+    T_LAST_CHAR_LITERAL = T_UTF32_CHAR_LITERAL,
    T_FIRST_STRING_LITERAL,
    T_STRING_LITERAL = T_FIRST_STRING_LITERAL,
    T_WIDE_STRING_LITERAL,
+    T_UTF8_STRING_LITERAL,
+    T_UTF16_STRING_LITERAL,
+    T_UTF32_STRING_LITERAL,
    T_AT_STRING_LITERAL,
    T_ANGLE_STRING_LITERAL,
    T_LAST_STRING_LITERAL = T_ANGLE_STRING_LITERAL,
@@ -112,6 +117,8 @@ enum Kind {
    T_CASE,
    T_CATCH,
    T_CHAR,
+    T_CHAR16_T,
+    T_CHAR32_T,
    T_CLASS,
    T_CONST,
    T_CONST_CAST,
--- a/src/libs/3rdparty/cplusplus/TranslationUnit.cpp
+++ b/src/libs/3rdparty/cplusplus/TranslationUnit.cpp
@@ -205,8 +205,8 @@ void TranslationUnit::tokenize()
                    unsigned line = (unsigned) strtoul(tk.spell(), 0, 0);
                    lex(&tk);
                    if (! tk.f.newline && tk.is(T_STRING_LITERAL)) {
-                        const StringLiteral *fileName = control()->stringLiteral(tk.string->chars(),
-                                                                                             tk.string->size());
+                        const StringLiteral *fileName =
+                                control()->stringLiteral(tk.string->chars(), tk.string->size());
                        pushPreprocessorLine(offset, line, fileName);
                        lex(&tk);
                    }