C++: clean up numeric literal parsing and add support for n3472.

Separate the messy pp-number parsing from the numeric literal parsing.
The C/C++ preprocessor makes a grown man cry, but at least we have
"proper" literal parsing when we want it, including C++1y binary
literals.

Next step is digit separators (n3781).

Change-Id: Ia069eef454ed5c056f77694a5b8a595d0b76adc4
Reviewed-by: Erik Verbruggen <erik.verbruggen@theqtcompany.com>
This commit is contained in:
Erik Verbruggen
2014-02-07 15:24:30 +01:00
committed by Nikolai Kosjar
parent 16becbd29c
commit 242b3f4110
6 changed files with 269 additions and 27 deletions

View File

@@ -305,24 +305,27 @@ void Lexer::scan_helper(Token *tok)
tok->f.kind = T_ERROR; tok->f.kind = T_ERROR;
} }
} else if (std::isdigit(_yychar)) { } else if (std::isdigit(_yychar)) {
if (f._ppMode) {
scanPreprocessorNumber(tok, true);
break;
}
const char *yytext = _currentChar - 2; const char *yytext = _currentChar - 2;
do { yyinp();
if (_yychar == 'e' || _yychar == 'E') { scanDigitSequence(); // this is optional: we already skipped over the first digit
scanExponentPart();
scanOptionalFloatingSuffix();
if (std::isalnum(_yychar) || _yychar == '_') {
do {
yyinp(); yyinp();
if (_yychar == '-' || _yychar == '+') { } while (std::isalnum(_yychar) || _yychar == '_');
yyinp(); tok->f.kind = T_ERROR;
// ### CPP_CHECK(std::isdigit(_yychar)); } else {
} int yylen = _currentChar - yytext;
} else if (std::isalnum(_yychar) || _yychar == '.') { tok->f.kind = T_NUMERIC_LITERAL;
yyinp(); if (control())
} else { tok->number = control()->numericLiteral(yytext, yylen);
break; }
}
} while (_yychar);
int yylen = _currentChar - yytext;
tok->f.kind = T_NUMERIC_LITERAL;
if (control())
tok->number = control()->numericLiteral(yytext, yylen);
} else { } else {
tok->f.kind = T_DOT; tok->f.kind = T_DOT;
} }
@@ -651,7 +654,10 @@ void Lexer::scan_helper(Token *tok)
} else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) { } else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
scanIdentifier(tok, _currentChar - _tokenStart - 1); scanIdentifier(tok, _currentChar - _tokenStart - 1);
} else if (std::isdigit(ch)) { } else if (std::isdigit(ch)) {
scanNumericLiteral(tok); if (f._ppMode)
scanPreprocessorNumber(tok, false);
else
scanNumericLiteral(tok);
} else { } else {
tok->f.kind = T_ERROR; tok->f.kind = T_ERROR;
} }
@@ -776,26 +782,141 @@ void Lexer::scanUntilQuote(Token *tok, unsigned char quote)
tok->string = control()->stringLiteral(yytext, yylen); tok->string = control()->stringLiteral(yytext, yylen);
} }
// Consumes a run of decimal digits beginning at the current character.
// Returns true when at least one digit was consumed; returns false, without
// advancing, when the current character is not a digit.
bool Lexer::scanDigitSequence()
{
    bool sawDigit = false;
    for (; std::isdigit(_yychar); yyinp())
        sawDigit = true;
    return sawDigit;
}
// Scans an exponent part: 'e' or 'E', an optional '+' or '-', then digits.
// Returns false without advancing when the current character is not an
// exponent marker. Otherwise returns the result of scanDigitSequence(); note
// that the marker and the sign stay consumed even when no digit follows.
bool Lexer::scanExponentPart()
{
    const bool atMarker = (_yychar == 'e' || _yychar == 'E');
    if (atMarker) {
        yyinp();
        switch (_yychar) {
        case '+':
        case '-':
            yyinp();
            break;
        default:
            break;
        }
        return scanDigitSequence();
    }
    return false;
}
// Consumes a single floating-point suffix character (f, F, l or L) if the
// current character is one; otherwise leaves the input untouched.
void Lexer::scanOptionalFloatingSuffix()
{
    switch (_yychar) {
    case 'f':
    case 'F':
    case 'l':
    case 'L':
        yyinp();
        break;
    default:
        break;
    }
}
// Consumes an optional integer suffix at the current position.
//
// Accepted spellings are an unsigned suffix (u/U), a long suffix (l/L), a
// long-long suffix (ll/LL), and an unsigned suffix combined with a long or
// long-long suffix in either order (e.g. "ul", "LLU", "lu"). The two letters
// of a long-long suffix must have the same case, so "lL" only consumes "l".
//
// allowU is false when an unsigned suffix has already been consumed, so that
// a second 'u'/'U' is left in the input for the caller to report.
void Lexer::scanOptionalIntegerSuffix(bool allowU)
{
    switch (_yychar) {
    case 'u':
    case 'U':
        if (allowU) {
            yyinp();
            scanOptionalIntegerSuffix(false);
        }
        return;
    case 'l':
        yyinp();
        if (_yychar == 'l')
            yyinp();
        break;
    case 'L':
        yyinp();
        if (_yychar == 'L')
            yyinp();
        break;
    default:
        return;
    }

    // A long/long-long suffix may be followed by the unsigned suffix
    // ("1lu", "1LLU"), unless one was already seen before it.
    if (allowU && (_yychar == 'u' || _yychar == 'U'))
        yyinp();
}
// Scans a numeric literal whose first digit has already been consumed (the
// literal starts one character before _currentChar).
//
// Recognised forms:
//   - hexadecimal integers ("0x"/"0X" + hex digits),
//   - binary integers ("0b"/"0B" + 0/1 digits, see n3472),
//   - octal and decimal integers,
//   - floating point numbers with optional fraction, exponent and suffix.
//
// On success tok becomes a T_NUMERIC_LITERAL and, when a Control is available,
// tok->number is set to the interned spelling. The token becomes T_ERROR when
// the spelling is malformed: a prefix without digits, an exponent marker
// without digits, the digits 8/9 in an integer with a leading zero, or any
// identifier character directly following the literal (those trailing
// characters are consumed as part of the error token).
void Lexer::scanNumericLiteral(Token *tok)
{
    const char *yytext = _currentChar - 1;
    const bool leadingZero = (*yytext == '0');
    bool malformed = false;

    if (leadingZero && (_yychar == 'x' || _yychar == 'X')) {
        yyinp();
        bool sawDigit = false;
        while (std::isdigit(_yychar) ||
               (_yychar >= 'a' && _yychar <= 'f') ||
               (_yychar >= 'A' && _yychar <= 'F')) {
            sawDigit = true;
            yyinp();
        }
        malformed = !sawDigit; // "0x" on its own is not a literal
        scanOptionalIntegerSuffix();
    } else if (leadingZero && (_yychar == 'b' || _yychar == 'B')) { // see n3472
        yyinp();
        bool sawDigit = false;
        while (_yychar == '0' || _yychar == '1') {
            sawDigit = true;
            yyinp();
        }
        malformed = !sawDigit; // "0b" on its own is not a literal
        scanOptionalIntegerSuffix();
    } else {
        // Consume every leading digit first and decide afterwards whether this
        // is an integer or a floating point number: "01.5" and "09e1" are
        // valid floating point literals even though they start with a zero.
        bool nonOctalDigit = false;
        while (std::isdigit(_yychar)) {
            if (leadingZero && _yychar >= '8')
                nonOctalDigit = true;
            yyinp();
        }

        if (_yychar == '.') {
            yyinp();
            scanDigitSequence(); // this is optional: "1." is a valid floating point number
            if (_yychar == 'e' || _yychar == 'E')
                malformed = !scanExponentPart(); // marker without digits
            scanOptionalFloatingSuffix();
        } else if (_yychar == 'e' || _yychar == 'E') {
            if (scanExponentPart())
                scanOptionalFloatingSuffix();
            else
                malformed = true; // marker without digits
        } else {
            // An integer with a leading zero is octal, so 8 and 9 are invalid.
            malformed = nonOctalDigit;
            scanOptionalIntegerSuffix();
        }
    }

    // Anything identifier-like glued to the literal makes it invalid; swallow
    // it so that the whole run is reported as a single error token.
    if (std::isalnum(_yychar) || _yychar == '_') {
        do {
            yyinp();
        } while (std::isalnum(_yychar) || _yychar == '_');
        malformed = true;
    }

    if (malformed) {
        tok->f.kind = T_ERROR;
    } else {
        int yylen = _currentChar - yytext;
        tok->f.kind = T_NUMERIC_LITERAL;
        if (control())
            tok->number = control()->numericLiteral(yytext, yylen);
    }
}
// Scans a preprocessing number, which is far more permissive than a real
// numeric literal: after the start, any run of alphanumerics, '_' and '.' is
// accepted, and an 'e'/'E' may be followed by a sign.
//
// dotAlreadySkipped tells whether the token began with a '.' that was consumed
// before the call (the token then starts two characters back instead of one).
// In that case a digit must follow, otherwise the token is just T_DOT.
//
// The result is always a T_NUMERIC_LITERAL; when a Control is available,
// tok->number is set to the interned spelling.
void Lexer::scanPreprocessorNumber(Token *tok, bool dotAlreadySkipped)
{
    const int alreadyConsumed = dotAlreadySkipped ? 2 : 1;
    const char *yytext = _currentChar - alreadyConsumed;

    // Covers both "end of input" and "something other than a digit".
    if (dotAlreadySkipped && !std::isdigit(_yychar)) {
        tok->f.kind = T_DOT;
        return;
    }

    while (_yychar) {
        const bool exponentMarker = (_yychar == 'e' || _yychar == 'E');
        if (!exponentMarker
                && !std::isalnum(_yychar) && _yychar != '_' && _yychar != '.') {
            break;
        }
        yyinp();
        // Only an exponent marker may be followed by a sign.
        if (exponentMarker && (_yychar == '+' || _yychar == '-'))
            yyinp();
    }

    int yylen = _currentChar - yytext;
    tok->f.kind = T_NUMERIC_LITERAL;
    if (control())
        tok->number = control()->numericLiteral(yytext, yylen);
}

View File

@@ -61,6 +61,9 @@ public:
LanguageFeatures languageFeatures() const { return _languageFeatures; } LanguageFeatures languageFeatures() const { return _languageFeatures; }
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; } void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
// Turns the preprocessor-mode flag (f._ppMode) on or off.
void setPreprocessorMode(bool onoff) { f._ppMode = onoff; }
public: public:
static void yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar, static void yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar,
unsigned &utf16charCounter) unsigned &utf16charCounter)
@@ -95,7 +98,12 @@ private:
void scanRawStringLiteral(Token *tok, unsigned char hint = 0); void scanRawStringLiteral(Token *tok, unsigned char hint = 0);
void scanCharLiteral(Token *tok, unsigned char hint = 0); void scanCharLiteral(Token *tok, unsigned char hint = 0);
void scanUntilQuote(Token *tok, unsigned char quote); void scanUntilQuote(Token *tok, unsigned char quote);
bool scanDigitSequence();
bool scanExponentPart();
void scanOptionalFloatingSuffix();
void scanOptionalIntegerSuffix(bool allowU = true);
void scanNumericLiteral(Token *tok); void scanNumericLiteral(Token *tok);
void scanPreprocessorNumber(Token *tok, bool dotAlreadySkipped);
void scanIdentifier(Token *tok, unsigned extraProcessedChars = 0); void scanIdentifier(Token *tok, unsigned extraProcessedChars = 0);
void scanBackslash(Kind type); void scanBackslash(Kind type);
void scanCppComment(Kind type); void scanCppComment(Kind type);
@@ -115,6 +123,7 @@ private:
unsigned _scanCommentTokens: 1; unsigned _scanCommentTokens: 1;
unsigned _scanKeywords: 1; unsigned _scanKeywords: 1;
unsigned _scanAngleStringLiteralTokens: 1; unsigned _scanAngleStringLiteralTokens: 1;
unsigned _ppMode: 1;
}; };
struct State { struct State {

View File

@@ -41,7 +41,8 @@ using namespace CPlusPlus;
SimpleLexer::SimpleLexer() SimpleLexer::SimpleLexer()
: _lastState(0), : _lastState(0),
_skipComments(false), _skipComments(false),
_endedJoined(false) _endedJoined(false),
_ppMode(false)
{} {}
SimpleLexer::~SimpleLexer() SimpleLexer::~SimpleLexer()
@@ -73,6 +74,7 @@ Tokens SimpleLexer::operator()(const QString &text, int state)
Lexer lex(firstChar, lastChar); Lexer lex(firstChar, lastChar);
lex.setLanguageFeatures(_languageFeatures); lex.setLanguageFeatures(_languageFeatures);
lex.setStartWithNewline(true); lex.setStartWithNewline(true);
lex.setPreprocessorMode(_ppMode);
if (! _skipComments) if (! _skipComments)
lex.setScanCommentTokens(true); lex.setScanCommentTokens(true);

View File

@@ -51,6 +51,9 @@ public:
bool skipComments() const; bool skipComments() const;
void setSkipComments(bool skipComments); void setSkipComments(bool skipComments);
// Stores the preprocessor-mode setting in _ppMode.
void setPreprocessorMode(bool ppMode) { _ppMode = ppMode; }
LanguageFeatures languageFeatures() const { return _languageFeatures; } LanguageFeatures languageFeatures() const { return _languageFeatures; }
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; } void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
@@ -74,6 +77,7 @@ private:
LanguageFeatures _languageFeatures; LanguageFeatures _languageFeatures;
bool _skipComments: 1; bool _skipComments: 1;
bool _endedJoined: 1; bool _endedJoined: 1;
bool _ppMode: 1;
}; };
} // namespace CPlusPlus } // namespace CPlusPlus

View File

@@ -401,6 +401,9 @@ protected:
const char *end = spell + len; const char *end = spell + len;
char *vend = const_cast<char *>(end); char *vend = const_cast<char *>(end);
_value.set_long(strtol(spell, &vend, 0)); _value.set_long(strtol(spell, &vend, 0));
// TODO: if (vend != end) error(NaN)
// TODO: binary literals
// TODO: float literals
++(*_lex); ++(*_lex);
} else if (isTokenDefined()) { } else if (isTokenDefined()) {
++(*_lex); ++(*_lex);
@@ -1388,6 +1391,7 @@ void Preprocessor::preprocess(const QString &fileName, const QByteArray &source,
m_state.m_lexer = new Lexer(source.constBegin(), source.constEnd()); m_state.m_lexer = new Lexer(source.constBegin(), source.constEnd());
m_state.m_lexer->setScanKeywords(false); m_state.m_lexer->setScanKeywords(false);
m_state.m_lexer->setScanAngleStringLiteralTokens(false); m_state.m_lexer->setScanAngleStringLiteralTokens(false);
m_state.m_lexer->setPreprocessorMode(true);
if (m_keepComments) if (m_keepComments)
m_state.m_lexer->setScanCommentTokens(true); m_state.m_lexer->setScanCommentTokens(true);
m_state.m_result = result; m_state.m_result = result;
@@ -1803,6 +1807,7 @@ const PPToken Preprocessor::evalExpression(PPToken *tk, Value &result)
PPToken lastConditionToken; PPToken lastConditionToken;
const QByteArray expanded = expand(tk, &lastConditionToken); const QByteArray expanded = expand(tk, &lastConditionToken);
Lexer lexer(expanded.constData(), expanded.constData() + expanded.size()); Lexer lexer(expanded.constData(), expanded.constData() + expanded.size());
lexer.setPreprocessorMode(true);
std::vector<Token> buf; std::vector<Token> buf;
Token t; Token t;
do { do {

View File

@@ -70,6 +70,10 @@ private slots:
void basic_data(); void basic_data();
void incremental(); void incremental();
void incremental_data(); void incremental_data();
void literals();
void literals_data();
void preprocessor();
void preprocessor_data();
void bytes_and_utf16chars(); void bytes_and_utf16chars();
void bytes_and_utf16chars_data(); void bytes_and_utf16chars_data();
@@ -82,7 +86,8 @@ private:
void run(const QByteArray &source, void run(const QByteArray &source,
const Tokens &expectedTokens, const Tokens &expectedTokens,
bool preserveState, bool preserveState,
TokenCompareFlags compareFlags); TokenCompareFlags compareFlags,
bool preprocessorMode = false);
int _state; int _state;
}; };
@@ -103,11 +108,13 @@ Tokens tst_SimpleLexer::toTokens(const TokenKindList &tokenKinds)
void tst_SimpleLexer::run(const QByteArray &source, void tst_SimpleLexer::run(const QByteArray &source,
const Tokens &expectedTokens, const Tokens &expectedTokens,
bool preserveState, bool preserveState,
TokenCompareFlags compareFlags) TokenCompareFlags compareFlags,
bool preprocessorMode)
{ {
QVERIFY(compareFlags); QVERIFY(compareFlags);
SimpleLexer lexer; SimpleLexer lexer;
lexer.setPreprocessorMode(preprocessorMode);
const Tokens tokens = lexer(source, preserveState ? _state : 0); const Tokens tokens = lexer(source, preserveState ? _state : 0);
if (preserveState) if (preserveState)
_state = lexer.state(); _state = lexer.state();
@@ -140,7 +147,10 @@ void tst_SimpleLexer::run(const QByteArray &source,
if (compareFlags & CompareUtf16CharsEnd) if (compareFlags & CompareUtf16CharsEnd)
QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd()); QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd());
} }
QVERIFY2(i == expectedTokens.size(), "Less tokens than expected.");
QString msg = QLatin1String("Less tokens than expected: got %1, expected %2.");
msg = msg.arg(i).arg(expectedTokens.size());
QVERIFY2(i == expectedTokens.size(), msg.toUtf8().constData());
} }
void tst_SimpleLexer::basic() void tst_SimpleLexer::basic()
@@ -254,6 +264,97 @@ void tst_SimpleLexer::basic_data()
QTest::newRow(source) << source << expectedTokenKindList; QTest::newRow(source) << source << expectedTokenKindList;
} }
void tst_SimpleLexer::literals()
{
QFETCH(QByteArray, source);
QFETCH(TokenKindList, expectedTokenKindList);
run(source, toTokens(expectedTokenKindList), false, CompareKind);
}
void tst_SimpleLexer::literals_data()
{
QTest::addColumn<QByteArray>("source");
QTest::addColumn<TokenKindList>("expectedTokenKindList");
QByteArray source;
TokenKindList expectedTokenKindList;
source =
"1.\n"
"1.1\n"
"1.23456789\n"
".1\n"
".3e8\n"
".3e8f\n"
"1e1\n"
"1E1\n"
"-1e-1\n" // the first minus sign is a separate token!
"1e-1\n"
"1e+1\n"
"1e1L\n"
"1e1l\n"
"1e1f\n"
"1e1F\n"
"23.45x"
".45x"
;
expectedTokenKindList =
TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_MINUS
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_ERROR << T_ERROR
;
QTest::newRow("float-literals") << source << expectedTokenKindList;
source = // these are all the same
"42\n"
"0b101010u\n"
"052ll\n"
"0x2aL\n"
"123FOO\n"
"0xfOo\n"
"33_\n"
;
expectedTokenKindList =
TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_ERROR << T_ERROR << T_ERROR
;
QTest::newRow("integer-literals") << source << expectedTokenKindList;
}
void tst_SimpleLexer::preprocessor()
{
QFETCH(QByteArray, source);
QFETCH(TokenKindList, expectedTokenKindList);
run(source, toTokens(expectedTokenKindList), false, CompareKind, true);
}
void tst_SimpleLexer::preprocessor_data()
{
QTest::addColumn<QByteArray>("source");
QTest::addColumn<TokenKindList>("expectedTokenKindList");
QByteArray source;
TokenKindList expectedTokenKindList;
source = // sad but true [2.10]
"1\n"
"1x.\n"
"1.y\n"
".1_1.1.\n"
"1e-\n"
"01x1b2qWeRtty_Grumble+E-.\n"
;
expectedTokenKindList =
TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL;
QTest::newRow("pp-number") << source << expectedTokenKindList;
}
void tst_SimpleLexer::bytes_and_utf16chars() void tst_SimpleLexer::bytes_and_utf16chars()
{ {
QFETCH(QByteArray, source); QFETCH(QByteArray, source);