forked from qt-creator/qt-creator
C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.
API-wise, the following functions are added to Token. In follow-up
patches these will come in handy in combination with QStrings.
utf16chars() - equivalent of bytes()
utf16charsBegin() - equivalent of bytesBegin()
utf16charsEnd() - equivalent of bytesEnd()
Next steps:
* Adapt functions from TranslationUnit. They should work with utf16
chars in order to calculate lines and columns correctly also for
UTF-8 multi-byte code points.
* Adapt the higher level clients:
* Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
* Cpp{Tools,Editor}: When dealing with identifiers on the
QString/QTextDocument layer, code points
represented by two QChars need to be respected, too.
* Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
offsets usable in CppEditor/CppTools.
Addresses QTCREATORBUG-7356.
Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
This commit is contained in:
20
src/libs/3rdparty/cplusplus/Lexer.cpp
vendored
20
src/libs/3rdparty/cplusplus/Lexer.cpp
vendored
@@ -29,6 +29,13 @@
|
||||
|
||||
using namespace CPlusPlus;
|
||||
|
||||
/*!
|
||||
\class Lexer
|
||||
\brief The Lexer generates tokens from a UTF-8 encoded source text.
|
||||
|
||||
\sa Token
|
||||
*/
|
||||
|
||||
Lexer::Lexer(TranslationUnit *unit)
|
||||
: _translationUnit(unit),
|
||||
_control(unit->control()),
|
||||
@@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar)
|
||||
_firstChar = firstChar;
|
||||
_lastChar = lastChar;
|
||||
_currentChar = _firstChar - 1;
|
||||
_currentCharUtf16 = -1;
|
||||
_tokenStart = _currentChar;
|
||||
_yychar = '\n';
|
||||
}
|
||||
@@ -109,6 +117,7 @@ void Lexer::scan(Token *tok)
|
||||
tok->reset();
|
||||
scan_helper(tok);
|
||||
tok->f.bytes = _currentChar - _tokenStart;
|
||||
tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16;
|
||||
}
|
||||
|
||||
void Lexer::scan_helper(Token *tok)
|
||||
@@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok)
|
||||
_tokenStart = _currentChar;
|
||||
tok->byteOffset = _currentChar - _firstChar;
|
||||
|
||||
_tokenStartUtf16 = _currentCharUtf16;
|
||||
tok->utf16charOffset = _currentCharUtf16;
|
||||
|
||||
if (_yychar) {
|
||||
s._newlineExpected = false;
|
||||
} else if (s._tokenKind) {
|
||||
@@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok)
|
||||
} else {
|
||||
scanIdentifier(tok);
|
||||
}
|
||||
} else if (std::isalpha(ch) || ch == '_' || ch == '$') {
|
||||
scanIdentifier(tok);
|
||||
} else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
|
||||
scanIdentifier(tok, _currentChar - _tokenStart - 1);
|
||||
} else if (std::isdigit(ch)) {
|
||||
scanNumericLiteral(tok);
|
||||
} else {
|
||||
@@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok)
|
||||
void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
|
||||
{
|
||||
const char *yytext = _currentChar - 1 - extraProcessedChars;
|
||||
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
|
||||
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'
|
||||
|| isByteOfMultiByteCodePoint(_yychar)) {
|
||||
yyinp();
|
||||
}
|
||||
int yylen = _currentChar - yytext;
|
||||
if (f._scanKeywords)
|
||||
tok->f.kind = classify(yytext, yylen, _languageFeatures);
|
||||
|
||||
31
src/libs/3rdparty/cplusplus/Lexer.h
vendored
31
src/libs/3rdparty/cplusplus/Lexer.h
vendored
@@ -62,6 +62,7 @@ public:
|
||||
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
|
||||
|
||||
private:
|
||||
void pushLineStartOffset();
|
||||
void scan_helper(Token *tok);
|
||||
void setSource(const char *firstChar, const char *lastChar);
|
||||
static int classify(const char *string, int length, LanguageFeatures features);
|
||||
@@ -77,15 +78,32 @@ private:
|
||||
void scanBackslash(Kind type);
|
||||
void scanCppComment(Kind type);
|
||||
|
||||
inline void yyinp()
|
||||
static bool isByteOfMultiByteCodePoint(unsigned char byte)
|
||||
{ return byte & 0x80; } // Check if most significant bit is set
|
||||
|
||||
void yyinp()
|
||||
{
|
||||
_yychar = *++_currentChar;
|
||||
++_currentCharUtf16;
|
||||
|
||||
// Process multi-byte UTF-8 code point (non-latin1)
|
||||
if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
|
||||
unsigned trailingBytesCurrentCodePoint = 1;
|
||||
for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
|
||||
++trailingBytesCurrentCodePoint;
|
||||
// Code points >= 0x00010000 are represented by two UTF16 code units
|
||||
if (trailingBytesCurrentCodePoint >= 3)
|
||||
++_currentCharUtf16;
|
||||
_yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1);
|
||||
|
||||
// Process single-byte UTF-8 code point (latin1)
|
||||
} else {
|
||||
_yychar = *++_currentChar;
|
||||
}
|
||||
|
||||
if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
|
||||
pushLineStartOffset();
|
||||
}
|
||||
|
||||
void pushLineStartOffset();
|
||||
|
||||
private:
|
||||
struct Flags {
|
||||
unsigned _scanCommentTokens: 1;
|
||||
@@ -105,6 +123,10 @@ private:
|
||||
const char *_lastChar;
|
||||
const char *_tokenStart;
|
||||
unsigned char _yychar;
|
||||
|
||||
unsigned _currentCharUtf16;
|
||||
unsigned _tokenStartUtf16;
|
||||
|
||||
union {
|
||||
unsigned char _state;
|
||||
State s;
|
||||
@@ -113,6 +135,7 @@ private:
|
||||
unsigned _flags;
|
||||
Flags f;
|
||||
};
|
||||
|
||||
unsigned _currentLine;
|
||||
LanguageFeatures _languageFeatures;
|
||||
};
|
||||
|
||||
1
src/libs/3rdparty/cplusplus/Token.cpp
vendored
1
src/libs/3rdparty/cplusplus/Token.cpp
vendored
@@ -85,6 +85,7 @@ void Token::reset()
|
||||
{
|
||||
flags = 0;
|
||||
byteOffset = 0;
|
||||
utf16charOffset = 0;
|
||||
ptr = 0;
|
||||
}
|
||||
|
||||
|
||||
20
src/libs/3rdparty/cplusplus/Token.h
vendored
20
src/libs/3rdparty/cplusplus/Token.h
vendored
@@ -285,7 +285,7 @@ enum Kind {
|
||||
class CPLUSPLUS_EXPORT Token
|
||||
{
|
||||
public:
|
||||
Token() : flags(0), byteOffset(0), ptr(0) {}
|
||||
Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {}
|
||||
|
||||
inline bool is(unsigned k) const { return f.kind == k; }
|
||||
inline bool isNot(unsigned k) const { return f.kind != k; }
|
||||
@@ -298,13 +298,14 @@ public:
|
||||
inline bool joined() const { return f.joined; }
|
||||
inline bool expanded() const { return f.expanded; }
|
||||
inline bool generated() const { return f.generated; }
|
||||
|
||||
inline unsigned bytes() const { return f.bytes; }
|
||||
inline unsigned bytesBegin() const { return byteOffset; }
|
||||
inline unsigned bytesEnd() const { return byteOffset + f.bytes; }
|
||||
|
||||
inline unsigned bytesBegin() const
|
||||
{ return byteOffset; }
|
||||
|
||||
inline unsigned bytesEnd() const
|
||||
{ return byteOffset + f.bytes; }
|
||||
inline unsigned utf16chars() const { return f.utf16chars; }
|
||||
inline unsigned utf16charsBegin() const { return utf16charOffset; }
|
||||
inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; }
|
||||
|
||||
inline bool isLiteral() const
|
||||
{ return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
|
||||
@@ -354,15 +355,17 @@ public:
|
||||
unsigned generated : 1;
|
||||
// Unused...
|
||||
unsigned pad : 3;
|
||||
// The token length in bytes.
|
||||
// The token length in bytes and UTF16 chars.
|
||||
unsigned bytes : 16;
|
||||
unsigned utf16chars : 16;
|
||||
};
|
||||
union {
|
||||
unsigned flags;
|
||||
unsigned long flags;
|
||||
Flags f;
|
||||
};
|
||||
|
||||
unsigned byteOffset;
|
||||
unsigned utf16charOffset;
|
||||
|
||||
union {
|
||||
void *ptr;
|
||||
@@ -393,5 +396,4 @@ struct LanguageFeatures
|
||||
|
||||
} // namespace CPlusPlus
|
||||
|
||||
|
||||
#endif // CPLUSPLUS_TOKEN_H
|
||||
|
||||
Reference in New Issue
Block a user