Files
qt-creator/src/libs/3rdparty/cplusplus/Lexer.h
Nikolai Kosjar 70122b3061 C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.

API-wise the following functions are added to Token. In follow-up
patches these will become handy in combination with QStrings.
    utf16chars() - equivalent of bytes()
    utf16charsBegin() - equivalent of bytesBegin()
    utf16charsEnd() - equivalent of bytesEnd()

Next steps:
 * Adapt functions from TranslationUnit. They should work with utf16
   chars in order to calculate lines and columns correctly also for
   UTF-8 multi-byte code points.
 * Adapt the higher level clients:
    * Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
    * Cpp{Tools,Editor}: When dealing with identifiers on the
      QString/QTextDocument layer, code points
      represented by two QChars need to be respected, too.
 * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
   offsets usable in CppEditor/CppTools.

Addresses QTCREATORBUG-7356.

Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
2014-05-23 14:23:15 +02:00

146 lines
4.6 KiB
C++

// Copyright (c) 2008 Roberto Raggi <roberto.raggi@gmail.com>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef CPLUSPLUS_LEXER_H
#define CPLUSPLUS_LEXER_H
#include "CPlusPlusForwardDeclarations.h"
#include "Token.h"
namespace CPlusPlus {
class CPLUSPLUS_EXPORT Lexer
{
    // Copy construction and copy assignment are declared in the private
    // section and given no body in this header, so client code cannot copy
    // a Lexer.
    Lexer(const Lexer &other);
    void operator =(const Lexer &other);

public:
    // Construct a lexer for a translation unit, or for a raw character range.
    // NOTE(review): whether lastChar addresses the final character or the
    // position just behind it is not visible in this header - check the
    // definition of setSource().
    Lexer(TranslationUnit *unit);
    Lexer(const char *firstChar, const char *lastChar);
    ~Lexer();

    // Accessor for the stored Control pointer.
    Control *control() const
    {
        return _control;
    }

    TranslationUnit *translationUnit() const;

    // Main entry point; takes the Token to work on. Defined out of line.
    void scan(Token *tok);

    // Function-call syntax for scan(): lexer(&tok) is the same as
    // lexer.scan(&tok).
    inline void operator()(Token *tok)
    {
        scan(tok);
    }

    // Getter/setter pairs for the scanning options (defined out of line).
    bool scanCommentTokens() const;
    void setScanCommentTokens(bool onoff);

    bool scanKeywords() const;
    void setScanKeywords(bool onoff);

    bool scanAngleStringLiteralTokens() const;
    void setScanAngleStringLiteralTokens(bool onoff);

    void setStartWithNewline(bool enabled);

    // Lexer state as a plain integer (defined out of line).
    int state() const;
    void setState(int state);

    // Language feature set stored in this lexer.
    LanguageFeatures languageFeatures() const
    {
        return _languageFeatures;
    }

    void setLanguageFeatures(LanguageFeatures features)
    {
        _languageFeatures = features;
    }

private:
    // Invoked by yyinp() each time the newly loaded character is '\n'.
    void pushLineStartOffset();

    void scan_helper(Token *tok);
    void setSource(const char *firstChar, const char *lastChar);

    // Classification helpers operating on a (pointer, length) pair.
    static int classify(const char *string, int length, LanguageFeatures features);
    static int classifyObjCAtKeyword(const char *s, int n);
    static int classifyOperator(const char *string, int length);

    // Scanners for the individual token categories (defined out of line).
    void scanStringLiteral(Token *tok, unsigned char hint = 0);
    void scanRawStringLiteral(Token *tok, unsigned char hint = 0);
    void scanCharLiteral(Token *tok, unsigned char hint = 0);
    void scanUntilQuote(Token *tok, unsigned char quote);
    void scanNumericLiteral(Token *tok);
    void scanIdentifier(Token *tok, unsigned extraProcessedChars = 0);
    void scanBackslash(Kind type);
    void scanCppComment(Kind type);

    // True when the most significant bit of the byte is set. In UTF-8 that
    // is the case for every byte of a multi-byte sequence (lead byte and
    // continuation bytes alike); single-byte code points are 7-bit values.
    static bool isByteOfMultiByteCodePoint(unsigned char byte)
    {
        return byte & 0x80;
    }

    // Move on to the next code point and load its first byte into _yychar.
    //
    // _currentCharUtf16 grows by the number of UTF-16 code units the code
    // point being left behind occupies: one, or two when it is encoded with
    // four UTF-8 bytes (code points >= 0x00010000).
    //
    // NOTE(review): the sequence length is taken from the lead byte alone and
    // nothing here compares the resulting position with _lastChar, so a
    // malformed or truncated sequence could step beyond the buffer. Confirm
    // that callers only supply well-formed UTF-8.
    void yyinp()
    {
        // Every code point accounts for at least one UTF-16 code unit.
        ++_currentCharUtf16;

        if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
            // Multi-byte sequence. The lead byte has the form 110xxxxx,
            // 1110xxxx or 11110xxx; after dropping its two top bits, each
            // further leading 1 bit announces one more continuation byte
            // beyond the first one.
            unsigned continuationBytes = 1;
            unsigned char remainingBits = _yychar << 2;
            while (isByteOfMultiByteCodePoint(remainingBits)) {
                ++continuationBytes;
                remainingBits <<= 1;
            }

            // Four-byte sequences need a surrogate pair in UTF-16.
            if (continuationBytes >= 3)
                ++_currentCharUtf16;

            // Step over the lead byte and all of its continuation bytes.
            _currentChar += continuationBytes + 1;
        } else {
            // Single-byte (7-bit) code point.
            ++_currentChar;
        }

        _yychar = *_currentChar;

        if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
            pushLineStartOffset();
    }

private:
    // One-bit switches; the names match the public scan*() option accessors.
    struct Flags {
        unsigned _scanCommentTokens: 1;
        unsigned _scanKeywords: 1;
        unsigned _scanAngleStringLiteralTokens: 1;
    };

    // Token kind (7 bits) and a newline-expected flag (1 bit), packed into
    // a single unsigned char.
    struct State {
        unsigned char _tokenKind : 7;
        unsigned char _newlineExpected : 1;
    };

    TranslationUnit *_translationUnit;
    Control *_control;

    // Pointers into the source buffer. yyinp() advances _currentChar.
    const char *_firstChar;
    const char *_currentChar;
    const char *_lastChar;
    const char *_tokenStart;

    // Byte at *_currentChar, as loaded by yyinp().
    unsigned char _yychar;

    // Positions counted in UTF-16 code units; _currentCharUtf16 is kept in
    // step with _currentChar by yyinp().
    unsigned _currentCharUtf16;
    unsigned _tokenStartUtf16;

    // The packed State bits share storage with a plain byte.
    union {
        unsigned char _state;
        State s;
    };

    // The packed Flags bits share storage with a plain unsigned.
    union {
        unsigned _flags;
        Flags f;
    };

    unsigned _currentLine;
    LanguageFeatures _languageFeatures;
};
} // namespace CPlusPlus
#endif // CPLUSPLUS_LEXER_H