C++: clean up numeric literal parsing and add support for n3472.

Separate the messy pp-number parsing from the numeric literal parsing. The C/C++ preprocessor makes a grown man cry, but at least we have "proper" literal parsing when we want it, including C++1y binary literals. The next step is digit separators (n3781).

Change-Id: Ia069eef454ed5c056f77694a5b8a595d0b76adc4
Reviewed-by: Erik Verbruggen <erik.verbruggen@theqtcompany.com>
2014-02-07 15:24:30 +01:00
parent 16becbd29c
commit 242b3f4110
6 changed files with 269 additions and 27 deletions
--- a/src/libs/3rdparty/cplusplus/Lexer.cpp
+++ b/src/libs/3rdparty/cplusplus/Lexer.cpp
@@ -305,24 +305,27 @@ void Lexer::scan_helper(Token *tok)
                tok->f.kind = T_ERROR;
            }
        } else if (std::isdigit(_yychar)) {
+            if (f._ppMode) {
+                scanPreprocessorNumber(tok, true);
+                break;
+            }
+
            const char *yytext = _currentChar - 2;
-            do {
-                if (_yychar == 'e' || _yychar == 'E') {
+            yyinp();
+            scanDigitSequence(); // this is optional: we already skipped over the first digit
+            scanExponentPart();
+            scanOptionalFloatingSuffix();
+            if (std::isalnum(_yychar) || _yychar == '_') {
+                do {
                    yyinp();
-                    if (_yychar == '-' || _yychar == '+') {
-                        yyinp();
-                        // ### CPP_CHECK(std::isdigit(_yychar));
-                    }
-                } else if (std::isalnum(_yychar) || _yychar == '.') {
-                    yyinp();
-                } else {
-                    break;
-                }
-            } while (_yychar);
-            int yylen = _currentChar - yytext;
-            tok->f.kind = T_NUMERIC_LITERAL;
-            if (control())
-                tok->number = control()->numericLiteral(yytext, yylen);
+                } while (std::isalnum(_yychar) || _yychar == '_');
+                tok->f.kind = T_ERROR;
+            } else {
+                int yylen = _currentChar - yytext;
+                tok->f.kind = T_NUMERIC_LITERAL;
+                if (control())
+                    tok->number = control()->numericLiteral(yytext, yylen);
+            }
        } else {
            tok->f.kind = T_DOT;
        }
@@ -651,7 +654,10 @@ void Lexer::scan_helper(Token *tok)
        } else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
            scanIdentifier(tok, _currentChar - _tokenStart - 1);
        } else if (std::isdigit(ch)) {
-            scanNumericLiteral(tok);
+            if (f._ppMode)
+                scanPreprocessorNumber(tok, false);
+            else
+                scanNumericLiteral(tok);
        } else {
            tok->f.kind = T_ERROR;
        }
@@ -776,26 +782,141 @@ void Lexer::scanUntilQuote(Token *tok, unsigned char quote)
        tok->string = control()->stringLiteral(yytext, yylen);
 }

+bool Lexer::scanDigitSequence()
+{
+    if (!std::isdigit(_yychar))
+        return false;
+    yyinp();
+    while (std::isdigit(_yychar))
+        yyinp();
+    return true;
+}
+
+bool Lexer::scanExponentPart()
+{
+    if (_yychar != 'e' && _yychar != 'E')
+        return false;
+    yyinp();
+    if (_yychar == '+' || _yychar == '-')
+        yyinp();
+    return scanDigitSequence();
+}
+
+void Lexer::scanOptionalFloatingSuffix()
+{
+    if (_yychar == 'f' || _yychar == 'l' || _yychar == 'F' || _yychar == 'L')
+        yyinp();
+}
+
+void Lexer::scanOptionalIntegerSuffix(bool allowU)
+{
+    switch(_yychar) {
+    case 'u':
+    case 'U':
+        if (allowU) {
+            yyinp();
+            scanOptionalIntegerSuffix(false);
+        }
+        return;
+    case 'l':
+        yyinp();
+        if (_yychar == 'l')
+            yyinp();
+        return;
+    case 'L':
+        yyinp();
+        if (_yychar == 'L')
+            yyinp();
+        return;
+    default:
+        return;
+    }
+}
+
 void Lexer::scanNumericLiteral(Token *tok)
 {
    const char *yytext = _currentChar - 1;
+    if (*yytext == '0' && _yychar) {
+        if (_yychar == 'x' || _yychar == 'X') {
+            yyinp();
+            while (std::isdigit(_yychar) ||
+                   (_yychar >= 'a' && _yychar <= 'f') ||
+                   (_yychar >= 'A' && _yychar <= 'F')) {
+                yyinp();
+            }
+            scanOptionalIntegerSuffix();
+            goto theEnd;
+        } else if (_yychar == 'b' || _yychar == 'B') { // see n3472
+            yyinp();
+            while (_yychar == '0' || _yychar == '1')
+                yyinp();
+            scanOptionalIntegerSuffix();
+            goto theEnd;
+        } else if (_yychar >= '0' && _yychar <= '7') {
+            do {
+                yyinp();
+            } while (_yychar >= '0' && _yychar <= '7');
+            scanOptionalIntegerSuffix();
+            goto theEnd;
+        }
+    }
+
+    while (_yychar) {
+        if (_yychar == '.') {
+            yyinp();
+            scanDigitSequence(); // this is optional: "1." is a valid floating point number
+            scanExponentPart();
+            scanOptionalFloatingSuffix();
+            break;
+        } else if (_yychar == 'e' || _yychar == 'E') {
+            if (scanExponentPart())
+                scanOptionalFloatingSuffix();
+            break;
+        } else if (std::isdigit(_yychar)) {
+            yyinp();
+        } else {
+            scanOptionalIntegerSuffix();
+            break;
+        }
+    }
+
+theEnd:
+    if (std::isalnum(_yychar) || _yychar == '_') {
+        do {
+            yyinp();
+        } while (std::isalnum(_yychar) || _yychar == '_');
+        tok->f.kind = T_ERROR;
+    } else {
+        int yylen = _currentChar - yytext;
+        tok->f.kind = T_NUMERIC_LITERAL;
+        if (control())
+            tok->number = control()->numericLiteral(yytext, yylen);
+    }
+}
+
+void Lexer::scanPreprocessorNumber(Token *tok, bool dotAlreadySkipped)
+{
+    const char *yytext = _currentChar - (dotAlreadySkipped ? 2 : 1);
+    if (dotAlreadySkipped &&
+            (!_yychar || (_yychar && !std::isdigit(_yychar)))) {
+        tok->f.kind = T_DOT;
+        return;
+    }
+
    while (_yychar) {
        if (_yychar == 'e' || _yychar == 'E') {
            yyinp();
-            if (_yychar == '-' || _yychar == '+') {
+            if (_yychar == '+' || _yychar == '-')
                yyinp();
-                // ### CPP_CHECK(std::isdigit(_yychar));
-            }
-        } else if (std::isalnum(_yychar) || _yychar == '.') {
+        } else if (std::isalnum(_yychar) || _yychar == '_' || _yychar == '.') {
            yyinp();
        } else {
            break;
        }
    }
+
    int yylen = _currentChar - yytext;
-
    tok->f.kind = T_NUMERIC_LITERAL;
-
    if (control())
        tok->number = control()->numericLiteral(yytext, yylen);
 }
--- a/src/libs/3rdparty/cplusplus/Lexer.h
+++ b/src/libs/3rdparty/cplusplus/Lexer.h
@@ -61,6 +61,9 @@ public:
    LanguageFeatures languageFeatures() const { return _languageFeatures; }
    void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }

+    void setPreprocessorMode(bool onoff)
+    { f._ppMode = onoff; }
+
 public:
    static void yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar,
                           unsigned &utf16charCounter)
@@ -95,7 +98,12 @@ private:
    void scanRawStringLiteral(Token *tok, unsigned char hint = 0);
    void scanCharLiteral(Token *tok, unsigned char hint = 0);
    void scanUntilQuote(Token *tok, unsigned char quote);
+    bool scanDigitSequence();
+    bool scanExponentPart();
+    void scanOptionalFloatingSuffix();
+    void scanOptionalIntegerSuffix(bool allowU = true);
    void scanNumericLiteral(Token *tok);
+    void scanPreprocessorNumber(Token *tok, bool dotAlreadySkipped);
    void scanIdentifier(Token *tok, unsigned extraProcessedChars = 0);
    void scanBackslash(Kind type);
    void scanCppComment(Kind type);
@@ -115,6 +123,7 @@ private:
        unsigned _scanCommentTokens: 1;
        unsigned _scanKeywords: 1;
        unsigned _scanAngleStringLiteralTokens: 1;
+        unsigned _ppMode: 1;
    };

    struct State {
--- a/src/libs/cplusplus/SimpleLexer.cpp
+++ b/src/libs/cplusplus/SimpleLexer.cpp
@@ -41,7 +41,8 @@ using namespace CPlusPlus;
 SimpleLexer::SimpleLexer()
    : _lastState(0),
      _skipComments(false),
-      _endedJoined(false)
+      _endedJoined(false),
+      _ppMode(false)
 {}

 SimpleLexer::~SimpleLexer()
@@ -73,6 +74,7 @@ Tokens SimpleLexer::operator()(const QString &text, int state)
    Lexer lex(firstChar, lastChar);
    lex.setLanguageFeatures(_languageFeatures);
    lex.setStartWithNewline(true);
+    lex.setPreprocessorMode(_ppMode);

    if (! _skipComments)
        lex.setScanCommentTokens(true);
--- a/src/libs/cplusplus/SimpleLexer.h
+++ b/src/libs/cplusplus/SimpleLexer.h
@@ -51,6 +51,9 @@ public:
    bool skipComments() const;
    void setSkipComments(bool skipComments);

+    void setPreprocessorMode(bool ppMode)
+    { _ppMode = ppMode; }
+
    LanguageFeatures languageFeatures() const { return _languageFeatures; }
    void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }

@@ -74,6 +77,7 @@ private:
    LanguageFeatures _languageFeatures;
    bool _skipComments: 1;
    bool _endedJoined: 1;
+    bool _ppMode: 1;
 };

 } // namespace CPlusPlus
--- a/src/libs/cplusplus/pp-engine.cpp
+++ b/src/libs/cplusplus/pp-engine.cpp
@@ -401,6 +401,9 @@ protected:
            const char *end = spell + len;
            char *vend = const_cast<char *>(end);
            _value.set_long(strtol(spell, &vend, 0));
+            // TODO: if (vend != end) error(NaN)
+            // TODO: binary literals
+            // TODO: float literals
            ++(*_lex);
        } else if (isTokenDefined()) {
            ++(*_lex);
@@ -1388,6 +1391,7 @@ void Preprocessor::preprocess(const QString &fileName, const QByteArray &source,
    m_state.m_lexer = new Lexer(source.constBegin(), source.constEnd());
    m_state.m_lexer->setScanKeywords(false);
    m_state.m_lexer->setScanAngleStringLiteralTokens(false);
+    m_state.m_lexer->setPreprocessorMode(true);
    if (m_keepComments)
        m_state.m_lexer->setScanCommentTokens(true);
    m_state.m_result = result;
@@ -1803,6 +1807,7 @@ const PPToken Preprocessor::evalExpression(PPToken *tk, Value &result)
    PPToken lastConditionToken;
    const QByteArray expanded = expand(tk, &lastConditionToken);
    Lexer lexer(expanded.constData(), expanded.constData() + expanded.size());
+    lexer.setPreprocessorMode(true);
    std::vector<Token> buf;
    Token t;
    do {
--- a/tests/auto/cplusplus/lexer/tst_lexer.cpp
+++ b/tests/auto/cplusplus/lexer/tst_lexer.cpp
@@ -70,6 +70,10 @@ private slots:
    void basic_data();
    void incremental();
    void incremental_data();
+    void literals();
+    void literals_data();
+    void preprocessor();
+    void preprocessor_data();

    void bytes_and_utf16chars();
    void bytes_and_utf16chars_data();
@@ -82,7 +86,8 @@ private:
    void run(const QByteArray &source,
             const Tokens &expectedTokens,
             bool preserveState,
-             TokenCompareFlags compareFlags);
+             TokenCompareFlags compareFlags,
+             bool preprocessorMode = false);

    int _state;
 };
@@ -103,11 +108,13 @@ Tokens tst_SimpleLexer::toTokens(const TokenKindList &tokenKinds)
 void tst_SimpleLexer::run(const QByteArray &source,
                          const Tokens &expectedTokens,
                          bool preserveState,
-                          TokenCompareFlags compareFlags)
+                          TokenCompareFlags compareFlags,
+                          bool preprocessorMode)
 {
    QVERIFY(compareFlags);

    SimpleLexer lexer;
+    lexer.setPreprocessorMode(preprocessorMode);
    const Tokens tokens = lexer(source, preserveState ? _state : 0);
    if (preserveState)
        _state = lexer.state();
@@ -140,7 +147,10 @@ void tst_SimpleLexer::run(const QByteArray &source,
        if (compareFlags & CompareUtf16CharsEnd)
            QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd());
    }
-    QVERIFY2(i == expectedTokens.size(), "Less tokens than expected.");
+
+    QString msg = QLatin1String("Less tokens than expected: got %1, expected %2.");
+    msg = msg.arg(i).arg(expectedTokens.size());
+    QVERIFY2(i == expectedTokens.size(), msg.toUtf8().constData());
 }

 void tst_SimpleLexer::basic()
@@ -254,6 +264,97 @@ void tst_SimpleLexer::basic_data()
    QTest::newRow(source) << source << expectedTokenKindList;
 }

+void tst_SimpleLexer::literals()
+{
+    QFETCH(QByteArray, source);
+    QFETCH(TokenKindList, expectedTokenKindList);
+
+    run(source, toTokens(expectedTokenKindList), false, CompareKind);
+}
+
+void tst_SimpleLexer::literals_data()
+{
+    QTest::addColumn<QByteArray>("source");
+    QTest::addColumn<TokenKindList>("expectedTokenKindList");
+
+    QByteArray source;
+    TokenKindList expectedTokenKindList;
+
+    source =
+            "1.\n"
+            "1.1\n"
+            "1.23456789\n"
+            ".1\n"
+            ".3e8\n"
+            ".3e8f\n"
+            "1e1\n"
+            "1E1\n"
+            "-1e-1\n" // the first minus sign is a separate token!
+            "1e-1\n"
+            "1e+1\n"
+            "1e1L\n"
+            "1e1l\n"
+            "1e1f\n"
+            "1e1F\n"
+            "23.45x"
+            ".45x"
+            ;
+    expectedTokenKindList =
+            TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
+                            << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
+                            << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_MINUS
+                            << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
+                            << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
+                            << T_NUMERIC_LITERAL << T_ERROR << T_ERROR
+                               ;
+    QTest::newRow("float-literals") << source << expectedTokenKindList;
+
+    source = // these are all the same
+            "42\n"
+            "0b101010u\n"
+            "052ll\n"
+            "0x2aL\n"
+            "123FOO\n"
+            "0xfOo\n"
+            "33_\n"
+            ;
+    expectedTokenKindList =
+            TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
+                            << T_NUMERIC_LITERAL << T_ERROR << T_ERROR << T_ERROR
+                               ;
+    QTest::newRow("integer-literals") << source << expectedTokenKindList;
+}
+
+void tst_SimpleLexer::preprocessor()
+{
+    QFETCH(QByteArray, source);
+    QFETCH(TokenKindList, expectedTokenKindList);
+
+    run(source, toTokens(expectedTokenKindList), false, CompareKind, true);
+}
+
+void tst_SimpleLexer::preprocessor_data()
+{
+    QTest::addColumn<QByteArray>("source");
+    QTest::addColumn<TokenKindList>("expectedTokenKindList");
+
+    QByteArray source;
+    TokenKindList expectedTokenKindList;
+
+    source = // sad but true [2.10]
+            "1\n"
+            "1x.\n"
+            "1.y\n"
+            ".1_1.1.\n"
+            "1e-\n"
+            "01x1b2qWeRtty_Grumble+E-.\n"
+            ;
+    expectedTokenKindList =
+            TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
+                            << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL;
+    QTest::newRow("pp-number") << source << expectedTokenKindList;
+}
+
 void tst_SimpleLexer::bytes_and_utf16chars()
 {
    QFETCH(QByteArray, source);