forked from qt-creator/qt-creator
Those are the types char16_t and char32_t along with the new char/string literals u'', U'', u"", u8"", and U"". This is particularly important for the use of QStringLiteral, since on some platforms it relies on an expansion such as the above. Note: The string literal quick fixes still need some tuning. Task-number: QTCREATORBUG-7449 Change-Id: Iebcfea15677dc8e0ebb6143def89a5477e1be7d4 Reviewed-by: hjk <qthjk@ovi.com>
713 lines
18 KiB
C++
713 lines
18 KiB
C++
// Copyright (c) 2008 Roberto Raggi <roberto.raggi@gmail.com>
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in
|
|
// all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
// THE SOFTWARE.
|
|
|
|
#include "Lexer.h"
|
|
#include "Control.h"
|
|
#include "TranslationUnit.h"
|
|
#include "Literals.h"
|
|
#include <cctype>
|
|
#include <cassert>
|
|
|
|
using namespace CPlusPlus;
|
|
|
|
// Creates a lexer that reads the source range exposed by the given
// translation unit. Keyword scanning is enabled by default.
Lexer::Lexer(TranslationUnit *unit)
    : _translationUnit(unit)
    , _state(State_Default)
    , _flags(0)
    , _currentLine(1)
{
    f._scanKeywords = true;

    const char *const sourceBegin = _translationUnit->firstSourceChar();
    const char *const sourceEnd = _translationUnit->lastSourceChar();
    setSource(sourceBegin, sourceEnd);
}
|
|
|
|
// Creates a lexer over the raw character range [firstChar, lastChar) that is
// not attached to any translation unit (control() will therefore return 0).
// Keyword scanning is enabled by default.
Lexer::Lexer(const char *firstChar, const char *lastChar)
    : _translationUnit(0)
    , _state(State_Default)
    , _flags(0)
    , _currentLine(1)
{
    f._scanKeywords = true;
    setSource(firstChar, lastChar);
}
|
|
|
|
// Empty destructor body; no cleanup is performed here.
Lexer::~Lexer()
{
}
|
|
|
|
// Returns the translation unit this lexer was constructed with, or 0 when it
// was constructed from a raw character range.
TranslationUnit *Lexer::translationUnit() const
{
    return _translationUnit;
}
|
|
|
|
// Returns the Control of the attached translation unit, or 0 when the lexer
// has no translation unit.
Control *Lexer::control() const
{
    return _translationUnit ? _translationUnit->control() : 0;
}
|
|
|
|
void Lexer::setSource(const char *firstChar, const char *lastChar)
|
|
{
|
|
_firstChar = firstChar;
|
|
_lastChar = lastChar;
|
|
_currentChar = _firstChar - 1;
|
|
_tokenStart = _currentChar;
|
|
_yychar = '\n';
|
|
}
|
|
|
|
void Lexer::setStartWithNewline(bool enabled)
|
|
{
|
|
if (enabled)
|
|
_yychar = '\n';
|
|
else
|
|
_yychar = ' ';
|
|
}
|
|
|
|
int Lexer::state() const
|
|
{ return _state; }
|
|
|
|
void Lexer::setState(int state)
|
|
{ _state = state; }
|
|
|
|
bool Lexer::qtMocRunEnabled() const
|
|
{ return f._qtMocRunEnabled; }
|
|
|
|
void Lexer::setQtMocRunEnabled(bool onoff)
|
|
{ f._qtMocRunEnabled = onoff; }
|
|
|
|
bool Lexer::cxx0xEnabled() const
|
|
{ return f._cxx0xEnabled; }
|
|
|
|
void Lexer::setCxxOxEnabled(bool onoff)
|
|
{ f._cxx0xEnabled = onoff; }
|
|
|
|
bool Lexer::objCEnabled() const
|
|
{ return f._objCEnabled; }
|
|
|
|
void Lexer::setObjCEnabled(bool onoff)
|
|
{ f._objCEnabled = onoff; }
|
|
|
|
bool Lexer::isIncremental() const
|
|
{ return f._isIncremental; }
|
|
|
|
void Lexer::setIncremental(bool isIncremental)
|
|
{ f._isIncremental = isIncremental; }
|
|
|
|
bool Lexer::scanCommentTokens() const
|
|
{ return f._scanCommentTokens; }
|
|
|
|
void Lexer::setScanCommentTokens(bool onoff)
|
|
{ f._scanCommentTokens = onoff; }
|
|
|
|
bool Lexer::scanKeywords() const
|
|
{ return f._scanKeywords; }
|
|
|
|
void Lexer::setScanKeywords(bool onoff)
|
|
{ f._scanKeywords = onoff; }
|
|
|
|
void Lexer::setScanAngleStringLiteralTokens(bool onoff)
|
|
{ f._scanAngleStringLiteralTokens = onoff; }
|
|
|
|
void Lexer::pushLineStartOffset()
|
|
{
|
|
++_currentLine;
|
|
|
|
if (_translationUnit)
|
|
_translationUnit->pushLineOffset(_currentChar - _firstChar);
|
|
}
|
|
|
|
unsigned Lexer::tokenOffset() const
|
|
{ return _tokenStart - _firstChar; }
|
|
|
|
unsigned Lexer::tokenLength() const
|
|
{ return _currentChar - _tokenStart; }
|
|
|
|
const char *Lexer::tokenBegin() const
|
|
{ return _tokenStart; }
|
|
|
|
const char *Lexer::tokenEnd() const
|
|
{ return _currentChar; }
|
|
|
|
unsigned Lexer::currentLine() const
|
|
{ return _currentLine; }
|
|
|
|
// Scans the next token into *tok: the token is reset, filled in by
// scan_helper(), and finally given its length (cursor minus token start).
void Lexer::scan(Token *tok)
{
    tok->reset();
    scan_helper(tok);

    // tokenEnd()/tokenBegin() are the cursor and the token start respectively.
    tok->f.length = tokenEnd() - tokenBegin();
}
|
|
|
|
// Scans one token from the input and fills in *tok.
//
// Leading whitespace is skipped first and recorded on the token through its
// newline/whitespace/joined flags. The token's offset is then set, along with
// its line number when no translation unit is attached. The token kind is
// decided from the first character. When a Control is available, literal and
// identifier tokens also get their interned value attached.
//
// Comments are returned as tokens only when f._scanCommentTokens is set;
// otherwise they are skipped and scanning restarts at _Lagain. A block comment
// that runs into the end of the input leaves _state in one of the multi-line
// comment states, so a later call continues inside the comment.
//
// The token length is not set here; scan() computes it afterwards.
void Lexer::scan_helper(Token *tok)
{
_Lagain:
    // Skip whitespace. A newline resets the "joined" flag and marks the token
    // as starting a new line; any other whitespace sets the whitespace flag.
    while (_yychar && std::isspace(_yychar)) {
        if (_yychar == '\n') {
            tok->f.joined = false;
            tok->f.newline = true;
        } else {
            tok->f.whitespace = true;
        }
        yyinp();
    }

    if (! _translationUnit)
        tok->lineno = _currentLine;

    _tokenStart = _currentChar;
    tok->offset = _currentChar - _firstChar;

    // Continue a block comment that was left open by a previous scan.
    if (_state == State_MultiLineComment || _state == State_MultiLineDoxyComment) {
        const int originalState = _state;

        if (! _yychar) {
            tok->f.kind = T_EOF_SYMBOL;
            return;
        }

        while (_yychar) {
            if (_yychar != '*')
                yyinp();
            else {
                yyinp();
                if (_yychar == '/') {
                    yyinp();
                    _state = State_Default;
                    break;
                }
            }
        }

        if (! f._scanCommentTokens)
            goto _Lagain;

        else if (originalState == State_MultiLineComment)
            tok->f.kind = T_COMMENT;
        else
            tok->f.kind = T_DOXY_COMMENT;
        return; // done
    }

    if (! _yychar) {
        tok->f.kind = T_EOF_SYMBOL;
        return;
    }

    // `ch' is the first character of the token; after this yyinp() it sits at
    // _currentChar - 1 and _yychar is the lookahead character.
    unsigned char ch = _yychar;
    yyinp();

    switch (ch) {
    case '\\':
        // Line continuation: skip trailing blanks, swallow the newline and
        // mark the next token as joined instead of starting a new line.
        while (_yychar != '\n' && std::isspace(_yychar))
            yyinp();
        // ### assert(! _yychar || _yychar == '\n');
        if (_yychar == '\n') {
            tok->f.joined = true;
            tok->f.newline = false;
            yyinp();
        }
        goto _Lagain;

    case '"':
        scanStringLiteral(tok);
        break;

    case '\'':
        scanCharLiteral(tok);
        break;

    case '{':
        tok->f.kind = T_LBRACE;
        break;

    case '}':
        tok->f.kind = T_RBRACE;
        break;

    case '[':
        tok->f.kind = T_LBRACKET;
        break;

    case ']':
        tok->f.kind = T_RBRACKET;
        break;

    case '#':
        if (_yychar == '#') {
            tok->f.kind = T_POUND_POUND;
            yyinp();
        } else {
            tok->f.kind = T_POUND;
        }
        break;

    case '(':
        tok->f.kind = T_LPAREN;
        break;

    case ')':
        tok->f.kind = T_RPAREN;
        break;

    case ';':
        tok->f.kind = T_SEMICOLON;
        break;

    case ':':
        if (_yychar == ':') {
            yyinp();
            tok->f.kind = T_COLON_COLON;
        } else {
            tok->f.kind = T_COLON;
        }
        break;

    case '.':
        if (_yychar == '*') {
            yyinp();
            tok->f.kind = T_DOT_STAR;
        } else if (_yychar == '.') {
            yyinp();
            // ### assert(_yychar);
            if (_yychar == '.') {
                yyinp();
                tok->f.kind = T_DOT_DOT_DOT;
            } else {
                tok->f.kind = T_ERROR;
            }
        } else if (std::isdigit(_yychar)) {
            // Numeric literal starting with '.', e.g. ".5". The '.' is the
            // character just consumed, so the literal text starts at
            // _currentChar - 1 (same convention as scanNumericLiteral()).
            const char *yytext = _currentChar - 1;
            do {
                if (_yychar == 'e' || _yychar == 'E') {
                    yyinp();
                    if (_yychar == '-' || _yychar == '+') {
                        yyinp();
                        // ### assert(std::isdigit(_yychar));
                    }
                } else if (std::isalnum(_yychar) || _yychar == '.') {
                    yyinp();
                } else {
                    break;
                }
            } while (_yychar);
            int yylen = _currentChar - yytext;
            tok->f.kind = T_NUMERIC_LITERAL;
            if (control())
                tok->number = control()->numericLiteral(yytext, yylen);
        } else {
            tok->f.kind = T_DOT;
        }
        break;

    case '?':
        tok->f.kind = T_QUESTION;
        break;

    case '+':
        if (_yychar == '+') {
            yyinp();
            tok->f.kind = T_PLUS_PLUS;
        } else if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_PLUS_EQUAL;
        } else {
            tok->f.kind = T_PLUS;
        }
        break;

    case '-':
        if (_yychar == '-') {
            yyinp();
            tok->f.kind = T_MINUS_MINUS;
        } else if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_MINUS_EQUAL;
        } else if (_yychar == '>') {
            yyinp();
            if (_yychar == '*') {
                yyinp();
                tok->f.kind = T_ARROW_STAR;
            } else {
                tok->f.kind = T_ARROW;
            }
        } else {
            tok->f.kind = T_MINUS;
        }
        break;

    case '*':
        if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_STAR_EQUAL;
        } else {
            tok->f.kind = T_STAR;
        }
        break;

    case '/':
        if (_yychar == '/') {
            // Line comment. "///" or "//!" (optionally followed by '<') and
            // then a non-newline whitespace character marks a doxygen comment.
            yyinp();

            bool doxy = false;

            if (_yychar == '/' || _yychar == '!') {
                yyinp();

                if (_yychar == '<')
                    yyinp();

                if (_yychar != '\n' && std::isspace(_yychar))
                    doxy = true;
            }

            while (_yychar && _yychar != '\n')
                yyinp();

            if (! f._scanCommentTokens)
                goto _Lagain;

            tok->f.kind = doxy ? T_CPP_DOXY_COMMENT : T_CPP_COMMENT;

        } else if (_yychar == '*') {
            // Block comment. "/**" or "/*!" (optionally followed by '<') and
            // then whitespace or end of input marks a doxygen comment.
            yyinp();

            bool doxy = false;

            if (_yychar == '*' || _yychar == '!') {
                const char ch = _yychar;

                yyinp();

                // "/**/" is an empty ordinary comment.
                if (ch == '*' && _yychar == '/')
                    goto _Ldone;

                if (_yychar == '<')
                    yyinp();

                if (! _yychar || std::isspace(_yychar))
                    doxy = true;
            }

            while (_yychar) {
                if (_yychar != '*') {
                    yyinp();
                } else {
                    yyinp();
                    if (_yychar == '/')
                        break;
                }
            }

        _Ldone:
            // Either consume the closing '/', or remember that the comment is
            // still open so the next scan continues inside it.
            if (_yychar)
                yyinp();
            else
                _state = doxy ? State_MultiLineDoxyComment : State_MultiLineComment;

            if (! f._scanCommentTokens)
                goto _Lagain;

            tok->f.kind = doxy ? T_DOXY_COMMENT : T_COMMENT;

        } else if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_SLASH_EQUAL;
        } else {
            tok->f.kind = T_SLASH;
        }
        break;

    case '%':
        if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_PERCENT_EQUAL;
        } else {
            tok->f.kind = T_PERCENT;
        }
        break;

    case '^':
        if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_CARET_EQUAL;
        } else {
            tok->f.kind = T_CARET;
        }
        break;

    case '&':
        if (_yychar == '&') {
            yyinp();
            tok->f.kind = T_AMPER_AMPER;
        } else if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_AMPER_EQUAL;
        } else {
            tok->f.kind = T_AMPER;
        }
        break;

    case '|':
        if (_yychar == '|') {
            yyinp();
            tok->f.kind = T_PIPE_PIPE;
        } else if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_PIPE_EQUAL;
        } else {
            tok->f.kind = T_PIPE;
        }
        break;

    case '~':
        if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_TILDE_EQUAL;
        } else {
            tok->f.kind = T_TILDE;
        }
        break;

    case '!':
        if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_EXCLAIM_EQUAL;
        } else {
            tok->f.kind = T_EXCLAIM;
        }
        break;

    case '=':
        if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_EQUAL_EQUAL;
        } else {
            tok->f.kind = T_EQUAL;
        }
        break;

    case '<':
        if (f._scanAngleStringLiteralTokens) {
            // Everything up to (not including) the next '>' is the literal.
            const char *yytext = _currentChar;
            while (_yychar && _yychar != '>')
                yyinp();
            int yylen = _currentChar - yytext;
            // ### assert(_yychar == '>');
            if (_yychar == '>')
                yyinp();
            if (control())
                tok->string = control()->stringLiteral(yytext, yylen);
            tok->f.kind = T_ANGLE_STRING_LITERAL;
        } else if (_yychar == '<') {
            yyinp();
            if (_yychar == '=') {
                yyinp();
                tok->f.kind = T_LESS_LESS_EQUAL;
            } else {
                tok->f.kind = T_LESS_LESS;
            }
        } else if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_LESS_EQUAL;
        } else {
            tok->f.kind = T_LESS;
        }
        break;

    case '>':
        if (_yychar == '>') {
            yyinp();
            // ">>=" must keep its own kind; only a plain ">>" is
            // T_GREATER_GREATER.
            if (_yychar == '=') {
                yyinp();
                tok->f.kind = T_GREATER_GREATER_EQUAL;
            } else {
                tok->f.kind = T_GREATER_GREATER;
            }
        } else if (_yychar == '=') {
            yyinp();
            tok->f.kind = T_GREATER_EQUAL;
        } else {
            tok->f.kind = T_GREATER;
        }
        break;

    case ',':
        tok->f.kind = T_COMMA;
        break;

    default: {
        if (f._objCEnabled) {
            if (ch == '@' && _yychar >= 'a' && _yychar <= 'z') {
                // Objective-C '@' keyword; the text excludes the '@'.
                const char *yytext = _currentChar;

                do {
                    yyinp();
                    if (! (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'))
                        break;
                } while (_yychar);

                const int yylen = _currentChar - yytext;
                tok->f.kind = classifyObjCAtKeyword(yytext, yylen);
                break;
            } else if (ch == '@' && _yychar == '"') {
                // Objective-C @"..." string. The '@' hint is what makes
                // scanStringLiteral() produce T_AT_STRING_LITERAL.
                yyinp();
                scanStringLiteral(tok, '@');
                break;
            }
        }

        if (ch == 'L' || ch == 'u' || ch == 'U') {
            // Either a prefixed literal or still an identifier.
            if (_yychar == '"') {
                yyinp();
                scanStringLiteral(tok, ch);
            } else if (_yychar == '\'') {
                yyinp();
                scanCharLiteral(tok, ch);
            } else if (ch == 'u' && _yychar == '8') {
                // Possible u8 prefix: peek one character past the '8'. Only a
                // lowercase 'u' may introduce it; "L8" and "U8" are ordinary
                // identifiers.
                unsigned char la = 0;
                if (_currentChar + 1 != _lastChar)
                    la = *(_currentChar + 1);
                if (la == '"') {
                    yyinp();
                    yyinp();
                    scanStringLiteral(tok, '8');
                } else if (la == '\'') {
                    yyinp();
                    yyinp();
                    // scanCharLiteral() has no dedicated kind for the '8'
                    // hint, so this yields T_CHAR_LITERAL.
                    scanCharLiteral(tok, '8');
                } else {
                    scanIdentifier(tok);
                }
            } else {
                scanIdentifier(tok);
            }
        } else if (std::isalpha(ch) || ch == '_' || ch == '$') {
            scanIdentifier(tok);
        } else if (std::isdigit(ch)) {
            scanNumericLiteral(tok);
        } else {
            tok->f.kind = T_ERROR;
        }
        break;
    } // default

    } // switch
}
|
|
|
|
// Scans the remainder of a string literal (the opening '"' has already been
// consumed) and picks the token kind from the prefix hint:
//   'L' wide, 'U' UTF-32, 'u' UTF-16, '8' UTF-8, '@' Objective-C string,
//   anything else a plain string literal.
void Lexer::scanStringLiteral(Token *tok, unsigned char hint)
{
    scanUntilQuote(tok, '"');

    switch (hint) {
    case 'L':
        tok->f.kind = T_WIDE_STRING_LITERAL;
        break;
    case 'U':
        tok->f.kind = T_UTF32_STRING_LITERAL;
        break;
    case 'u':
        tok->f.kind = T_UTF16_STRING_LITERAL;
        break;
    case '8':
        tok->f.kind = T_UTF8_STRING_LITERAL;
        break;
    case '@':
        tok->f.kind = T_AT_STRING_LITERAL;
        break;
    default:
        tok->f.kind = T_STRING_LITERAL;
        break;
    }
}
|
|
|
|
// Scans the remainder of a character literal (the opening quote has already
// been consumed) and picks the token kind from the prefix hint:
//   'L' wide, 'U' UTF-32, 'u' UTF-16, anything else a plain char literal.
void Lexer::scanCharLiteral(Token *tok, unsigned char hint)
{
    scanUntilQuote(tok, '\'');

    switch (hint) {
    case 'L':
        tok->f.kind = T_WIDE_CHAR_LITERAL;
        break;
    case 'U':
        tok->f.kind = T_UTF32_CHAR_LITERAL;
        break;
    case 'u':
        tok->f.kind = T_UTF16_CHAR_LITERAL;
        break;
    default:
        tok->f.kind = T_CHAR_LITERAL;
        break;
    }
}
|
|
|
|
// Consumes characters up to and including the closing `quote' (either '"' or
// '\''), treating a backslash as escaping the character that follows it.
// Scanning also stops at the end of the input. When a Control is available the
// text between the quotes is interned and stored in tok->string.
void Lexer::scanUntilQuote(Token *tok, unsigned char quote)
{
    assert(quote == '"' || quote == '\'');

    const char *const literalBegin = _currentChar;

    for (;;) {
        if (! _yychar || _yychar == quote)
            break;

        const bool isEscape = (_yychar == '\\');
        yyinp(); // the character itself (or the `\\')

        // After a backslash, also swallow the escaped character, if any.
        if (isEscape && _yychar)
            yyinp();
    }

    // Length of the contents, measured before the closing quote is consumed.
    int literalLength = _currentChar - literalBegin;

    if (_yychar == quote)
        yyinp();

    if (control())
        tok->string = control()->stringLiteral(literalBegin, literalLength);
}
|
|
|
|
// Scans the rest of a numeric literal whose first digit has already been
// consumed. Alphanumeric characters and '.' are accepted, and an 'e'/'E' may
// be followed by a sign. When a Control is available the literal text is
// interned and stored in tok->number.
void Lexer::scanNumericLiteral(Token *tok)
{
    // The first digit sits just behind the cursor.
    const char *const literalBegin = _currentChar - 1;

    for (; _yychar; ) {
        if (_yychar == 'e' || _yychar == 'E') {
            yyinp();
            if (_yychar == '-' || _yychar == '+') {
                yyinp();
                // ### assert(std::isdigit(_yychar));
            }
            continue;
        }

        if (! (std::isalnum(_yychar) || _yychar == '.'))
            break;

        yyinp();
    }

    int literalLength = _currentChar - literalBegin;

    tok->f.kind = T_NUMERIC_LITERAL;

    if (control())
        tok->number = control()->numericLiteral(literalBegin, literalLength);
}
|
|
|
|
// Scans the rest of an identifier whose first character has already been
// consumed (alphanumerics, '_' and '$' are accepted). With keyword scanning
// enabled the text is first run through classify(). Anything that is still
// T_IDENTIFIER afterwards is run through classifyOperator() and, when a
// Control is available, interned into tok->identifier.
void Lexer::scanIdentifier(Token *tok)
{
    // The first character sits just behind the cursor.
    const char *const nameBegin = _currentChar - 1;

    for (;;) {
        const bool isNameChar = std::isalnum(_yychar) || _yychar == '_' || _yychar == '$';
        if (! isNameChar)
            break;
        yyinp();
    }

    int nameLength = _currentChar - nameBegin;

    if (! f._scanKeywords)
        tok->f.kind = T_IDENTIFIER;
    else
        tok->f.kind = classify(nameBegin, nameLength, f._qtMocRunEnabled, f._cxx0xEnabled);

    if (tok->f.kind != T_IDENTIFIER)
        return;

    tok->f.kind = classifyOperator(nameBegin, nameLength);

    if (control())
        tok->identifier = control()->identifier(nameBegin, nameLength);
}
|