Files
qt-creator/src/libs/cplusplus/SimpleLexer.cpp
Nikolai Kosjar 70122b3061 C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.

API-wise the following functions are added to Token. In follow-up
patches these will become handy in combination with QStrings.
    utf16chars() - equivalent of bytes()
    utf16charsBegin() - equivalent of bytesBegin()
    utf16charsEnd() - equivalent of bytesEnd()

Next steps:
 * Adapt functions from TranslationUnit. They should work with utf16
   chars in order to calculate lines and columns correctly also for
   UTF-8 multi-byte code points.
 * Adapt the higher level clients:
    * Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
    * Cpp{Tools,Editor}: When dealing with identifiers on the
      QString/QTextDocument layer, code points
      represented by two QChars need to be respected, too.
 * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
   offsets usable in CppEditor/CppTools.

Addresses QTCREATORBUG-7356.

Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
2014-05-23 14:23:15 +02:00

155 lines
4.8 KiB
C++

/****************************************************************************
**
** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies).
** Contact: http://www.qt-project.org/legal
**
** This file is part of Qt Creator.
**
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and Digia. For licensing terms and
** conditions see http://qt.digia.com/licensing. For further information
** use the contact form at http://qt.digia.com/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Digia gives you certain additional
** rights. These rights are described in the Digia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
****************************************************************************/
#include "SimpleLexer.h"
#include <cplusplus/ObjectiveCTypeQualifiers.h>
#include <cplusplus/Lexer.h>
#include <cplusplus/Token.h>
#include <QDebug>
using namespace CPlusPlus;
// Creates a lexer whose stored state is 0, that does not skip comments and
// that has not ended on a joined line.
SimpleLexer::SimpleLexer()
    : _lastState(0)
    , _skipComments(false)
    , _endedJoined(false)
{
}
// Nothing to release explicitly; the destructor body is intentionally empty.
SimpleLexer::~SimpleLexer()
{
}
// Returns the current value of the skip-comments flag. While the flag is
// false, operator() asks the underlying Lexer to scan comment tokens.
bool SimpleLexer::skipComments() const
{
    return _skipComments;
}
// Stores \a skipComments as the skip-comments flag; it is consulted by the
// next call to operator().
void SimpleLexer::setSkipComments(bool skipComments)
{
    _skipComments = skipComments;
}
// Returns the joined() flag of the end-of-file token seen by the most recent
// call to operator() (false if operator() has not been called yet).
bool SimpleLexer::endedJoined() const
{
    return _endedJoined;
}
// Tokenizes \a text and returns every token produced before the end-of-file
// token.
//
// \a text is converted to UTF-8 when \a convertToUtf8 is true and to Latin-1
// otherwise; the Lexer scans the resulting bytes. Unless \a state is -1, its
// lowest eight bits are handed to the Lexer as the start state.
//
// Side effects: _endedJoined receives joined() of the end-of-file token and
// _lastState receives the Lexer's state after scanning.
QList<Token> SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8)
{
    QList<Token> tokens;

    const QByteArray bytes = convertToUtf8 ? text.toUtf8() : text.toLatin1();
    const char *firstChar = bytes.constData();
    const char *lastChar = firstChar + bytes.size();

    Lexer lex(firstChar, lastChar);
    lex.setLanguageFeatures(_languageFeatures);
    lex.setStartWithNewline(true);

    if (! _skipComments)
        lex.setScanCommentTokens(true);

    if (state != -1)
        lex.setState(state & 0xff);

    bool inPreproc = false;

    for (;;) {
        Token tk;
        lex(&tk);
        if (tk.is(T_EOF_SYMBOL)) {
            _endedJoined = tk.joined();
            break;
        }

        // Angle string literal scanning is switched off again for every token
        // and only re-enabled right after an include-like directive name.
        lex.setScanAngleStringLiteralTokens(false);

        if (tk.newline() && tk.is(T_POUND)) {
            inPreproc = true;
        } else if (inPreproc && tokens.size() == 1 && tk.is(T_IDENTIFIER)) {
            // Token offsets are byte offsets into the buffer given to the
            // Lexer, so the spelling is read from that buffer. Using them as
            // indices into the UTF-16 \a text would pick the wrong characters
            // as soon as a multi-byte UTF-8 sequence precedes the token.
            // fromRawData() does not copy; 'bytes' outlives 'spell'.
            const QByteArray spell = QByteArray::fromRawData(firstChar + tk.bytesBegin(),
                                                             static_cast<int>(tk.bytes()));
            const bool isIncludeLike = spell == "include"
                    || spell == "include_next"
                    || (_languageFeatures.objCEnabled && spell == "import");
            if (isIncludeLike)
                lex.setScanAngleStringLiteralTokens(true);
        }

        tokens.append(tk);
    }

    _lastState = lex.state();
    return tokens;
}
int SimpleLexer::tokenAt(const QList<Token> &tokens, unsigned offset)
{
for (int index = tokens.size() - 1; index >= 0; --index) {
const Token &tk = tokens.at(index);
if (tk.bytesBegin() <= offset && tk.bytesEnd() >= offset)
return index;
}
return -1;
}
// Tokenizes \a text, starting from \a state, with a temporary SimpleLexer and
// returns the token that tokenAt(tokens, offset) selects for \a offset, or a
// default-constructed Token if there is none.
//
// \a qtMocRunEnabled is used as the value of all five language feature flags
// set below.
Token SimpleLexer::tokenAt(const QString &text,
                           unsigned offset,
                           int state,
                           bool qtMocRunEnabled)
{
    // FIXME: Check default values.
    LanguageFeatures languageFeatures;
    languageFeatures.qtMocRunEnabled = qtMocRunEnabled;
    languageFeatures.qtEnabled = qtMocRunEnabled;
    languageFeatures.qtKeywordsEnabled = qtMocRunEnabled;
    languageFeatures.objCEnabled = qtMocRunEnabled;
    languageFeatures.cxx11Enabled = qtMocRunEnabled;

    SimpleLexer lexer;
    lexer.setLanguageFeatures(languageFeatures);

    const QList<Token> lexed = lexer(text, state);
    const int hit = tokenAt(lexed, offset);
    if (hit == -1)
        return Token();
    return lexed.at(hit);
}
// Returns the index of the last token in \a tokens whose bytesBegin() is not
// greater than \a offset, or -1 if every token begins after \a offset (or the
// list is empty).
int SimpleLexer::tokenBefore(const QList<Token> &tokens, unsigned offset)
{
    int i = tokens.size();
    while (--i >= 0) {
        if (tokens.at(i).bytesBegin() <= offset)
            return i;
    }

    return -1;
}