forked from qt-creator/qt-creator
C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus avoid loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.
API-wise the following functions are added to Token. In follow-up
patches these will come in handy in combination with QStrings.
utf16chars() - equivalent of bytes()
utf16charsBegin() - equivalent of bytesBegin()
utf16charsEnd() - equivalent of bytesEnd()
Next steps:
* Adapt functions from TranslationUnit. They should work with utf16
chars in order to calculate lines and columns correctly also for
UTF-8 multi-byte code points.
* Adapt the higher level clients:
* Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
* Cpp{Tools,Editor}: When dealing with identifiers on the
QString/QTextDocument layer, code points
represented by two QChars need to be respected, too.
* Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
offsets usable in CppEditor/CppTools.
Addresses QTCREATORBUG-7356.
Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
This commit is contained in:
20
src/libs/3rdparty/cplusplus/Lexer.cpp
vendored
20
src/libs/3rdparty/cplusplus/Lexer.cpp
vendored
@@ -29,6 +29,13 @@
|
|||||||
|
|
||||||
using namespace CPlusPlus;
|
using namespace CPlusPlus;
|
||||||
|
|
||||||
|
/*!
|
||||||
|
\class Lexer
|
||||||
|
\brief The Lexer generates tokens from a UTF-8 encoded source text.
|
||||||
|
|
||||||
|
\sa Token
|
||||||
|
*/
|
||||||
|
|
||||||
Lexer::Lexer(TranslationUnit *unit)
|
Lexer::Lexer(TranslationUnit *unit)
|
||||||
: _translationUnit(unit),
|
: _translationUnit(unit),
|
||||||
_control(unit->control()),
|
_control(unit->control()),
|
||||||
@@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar)
|
|||||||
_firstChar = firstChar;
|
_firstChar = firstChar;
|
||||||
_lastChar = lastChar;
|
_lastChar = lastChar;
|
||||||
_currentChar = _firstChar - 1;
|
_currentChar = _firstChar - 1;
|
||||||
|
_currentCharUtf16 = -1;
|
||||||
_tokenStart = _currentChar;
|
_tokenStart = _currentChar;
|
||||||
_yychar = '\n';
|
_yychar = '\n';
|
||||||
}
|
}
|
||||||
@@ -109,6 +117,7 @@ void Lexer::scan(Token *tok)
|
|||||||
tok->reset();
|
tok->reset();
|
||||||
scan_helper(tok);
|
scan_helper(tok);
|
||||||
tok->f.bytes = _currentChar - _tokenStart;
|
tok->f.bytes = _currentChar - _tokenStart;
|
||||||
|
tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Lexer::scan_helper(Token *tok)
|
void Lexer::scan_helper(Token *tok)
|
||||||
@@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok)
|
|||||||
_tokenStart = _currentChar;
|
_tokenStart = _currentChar;
|
||||||
tok->byteOffset = _currentChar - _firstChar;
|
tok->byteOffset = _currentChar - _firstChar;
|
||||||
|
|
||||||
|
_tokenStartUtf16 = _currentCharUtf16;
|
||||||
|
tok->utf16charOffset = _currentCharUtf16;
|
||||||
|
|
||||||
if (_yychar) {
|
if (_yychar) {
|
||||||
s._newlineExpected = false;
|
s._newlineExpected = false;
|
||||||
} else if (s._tokenKind) {
|
} else if (s._tokenKind) {
|
||||||
@@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok)
|
|||||||
} else {
|
} else {
|
||||||
scanIdentifier(tok);
|
scanIdentifier(tok);
|
||||||
}
|
}
|
||||||
} else if (std::isalpha(ch) || ch == '_' || ch == '$') {
|
} else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
|
||||||
scanIdentifier(tok);
|
scanIdentifier(tok, _currentChar - _tokenStart - 1);
|
||||||
} else if (std::isdigit(ch)) {
|
} else if (std::isdigit(ch)) {
|
||||||
scanNumericLiteral(tok);
|
scanNumericLiteral(tok);
|
||||||
} else {
|
} else {
|
||||||
@@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok)
|
|||||||
void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
|
void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
|
||||||
{
|
{
|
||||||
const char *yytext = _currentChar - 1 - extraProcessedChars;
|
const char *yytext = _currentChar - 1 - extraProcessedChars;
|
||||||
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
|
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'
|
||||||
|
|| isByteOfMultiByteCodePoint(_yychar)) {
|
||||||
yyinp();
|
yyinp();
|
||||||
|
}
|
||||||
int yylen = _currentChar - yytext;
|
int yylen = _currentChar - yytext;
|
||||||
if (f._scanKeywords)
|
if (f._scanKeywords)
|
||||||
tok->f.kind = classify(yytext, yylen, _languageFeatures);
|
tok->f.kind = classify(yytext, yylen, _languageFeatures);
|
||||||
|
|||||||
31
src/libs/3rdparty/cplusplus/Lexer.h
vendored
31
src/libs/3rdparty/cplusplus/Lexer.h
vendored
@@ -62,6 +62,7 @@ public:
|
|||||||
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
|
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void pushLineStartOffset();
|
||||||
void scan_helper(Token *tok);
|
void scan_helper(Token *tok);
|
||||||
void setSource(const char *firstChar, const char *lastChar);
|
void setSource(const char *firstChar, const char *lastChar);
|
||||||
static int classify(const char *string, int length, LanguageFeatures features);
|
static int classify(const char *string, int length, LanguageFeatures features);
|
||||||
@@ -77,15 +78,32 @@ private:
|
|||||||
void scanBackslash(Kind type);
|
void scanBackslash(Kind type);
|
||||||
void scanCppComment(Kind type);
|
void scanCppComment(Kind type);
|
||||||
|
|
||||||
inline void yyinp()
|
static bool isByteOfMultiByteCodePoint(unsigned char byte)
|
||||||
|
{ return byte & 0x80; } // Check if most significant bit is set
|
||||||
|
|
||||||
|
void yyinp()
|
||||||
{
|
{
|
||||||
_yychar = *++_currentChar;
|
++_currentCharUtf16;
|
||||||
|
|
||||||
|
// Process multi-byte UTF-8 code point (non-latin1)
|
||||||
|
if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
|
||||||
|
unsigned trailingBytesCurrentCodePoint = 1;
|
||||||
|
for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
|
||||||
|
++trailingBytesCurrentCodePoint;
|
||||||
|
// Code points >= 0x00010000 are represented by two UTF16 code units
|
||||||
|
if (trailingBytesCurrentCodePoint >= 3)
|
||||||
|
++_currentCharUtf16;
|
||||||
|
_yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1);
|
||||||
|
|
||||||
|
// Process single-byte UTF-8 code point (latin1)
|
||||||
|
} else {
|
||||||
|
_yychar = *++_currentChar;
|
||||||
|
}
|
||||||
|
|
||||||
if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
|
if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
|
||||||
pushLineStartOffset();
|
pushLineStartOffset();
|
||||||
}
|
}
|
||||||
|
|
||||||
void pushLineStartOffset();
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct Flags {
|
struct Flags {
|
||||||
unsigned _scanCommentTokens: 1;
|
unsigned _scanCommentTokens: 1;
|
||||||
@@ -105,6 +123,10 @@ private:
|
|||||||
const char *_lastChar;
|
const char *_lastChar;
|
||||||
const char *_tokenStart;
|
const char *_tokenStart;
|
||||||
unsigned char _yychar;
|
unsigned char _yychar;
|
||||||
|
|
||||||
|
unsigned _currentCharUtf16;
|
||||||
|
unsigned _tokenStartUtf16;
|
||||||
|
|
||||||
union {
|
union {
|
||||||
unsigned char _state;
|
unsigned char _state;
|
||||||
State s;
|
State s;
|
||||||
@@ -113,6 +135,7 @@ private:
|
|||||||
unsigned _flags;
|
unsigned _flags;
|
||||||
Flags f;
|
Flags f;
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned _currentLine;
|
unsigned _currentLine;
|
||||||
LanguageFeatures _languageFeatures;
|
LanguageFeatures _languageFeatures;
|
||||||
};
|
};
|
||||||
|
|||||||
1
src/libs/3rdparty/cplusplus/Token.cpp
vendored
1
src/libs/3rdparty/cplusplus/Token.cpp
vendored
@@ -85,6 +85,7 @@ void Token::reset()
|
|||||||
{
|
{
|
||||||
flags = 0;
|
flags = 0;
|
||||||
byteOffset = 0;
|
byteOffset = 0;
|
||||||
|
utf16charOffset = 0;
|
||||||
ptr = 0;
|
ptr = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
20
src/libs/3rdparty/cplusplus/Token.h
vendored
20
src/libs/3rdparty/cplusplus/Token.h
vendored
@@ -285,7 +285,7 @@ enum Kind {
|
|||||||
class CPLUSPLUS_EXPORT Token
|
class CPLUSPLUS_EXPORT Token
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Token() : flags(0), byteOffset(0), ptr(0) {}
|
Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {}
|
||||||
|
|
||||||
inline bool is(unsigned k) const { return f.kind == k; }
|
inline bool is(unsigned k) const { return f.kind == k; }
|
||||||
inline bool isNot(unsigned k) const { return f.kind != k; }
|
inline bool isNot(unsigned k) const { return f.kind != k; }
|
||||||
@@ -298,13 +298,14 @@ public:
|
|||||||
inline bool joined() const { return f.joined; }
|
inline bool joined() const { return f.joined; }
|
||||||
inline bool expanded() const { return f.expanded; }
|
inline bool expanded() const { return f.expanded; }
|
||||||
inline bool generated() const { return f.generated; }
|
inline bool generated() const { return f.generated; }
|
||||||
|
|
||||||
inline unsigned bytes() const { return f.bytes; }
|
inline unsigned bytes() const { return f.bytes; }
|
||||||
|
inline unsigned bytesBegin() const { return byteOffset; }
|
||||||
|
inline unsigned bytesEnd() const { return byteOffset + f.bytes; }
|
||||||
|
|
||||||
inline unsigned bytesBegin() const
|
inline unsigned utf16chars() const { return f.utf16chars; }
|
||||||
{ return byteOffset; }
|
inline unsigned utf16charsBegin() const { return utf16charOffset; }
|
||||||
|
inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; }
|
||||||
inline unsigned bytesEnd() const
|
|
||||||
{ return byteOffset + f.bytes; }
|
|
||||||
|
|
||||||
inline bool isLiteral() const
|
inline bool isLiteral() const
|
||||||
{ return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
|
{ return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
|
||||||
@@ -354,15 +355,17 @@ public:
|
|||||||
unsigned generated : 1;
|
unsigned generated : 1;
|
||||||
// Unused...
|
// Unused...
|
||||||
unsigned pad : 3;
|
unsigned pad : 3;
|
||||||
// The token length in bytes.
|
// The token length in bytes and UTF16 chars.
|
||||||
unsigned bytes : 16;
|
unsigned bytes : 16;
|
||||||
|
unsigned utf16chars : 16;
|
||||||
};
|
};
|
||||||
union {
|
union {
|
||||||
unsigned flags;
|
unsigned long flags;
|
||||||
Flags f;
|
Flags f;
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned byteOffset;
|
unsigned byteOffset;
|
||||||
|
unsigned utf16charOffset;
|
||||||
|
|
||||||
union {
|
union {
|
||||||
void *ptr;
|
void *ptr;
|
||||||
@@ -393,5 +396,4 @@ struct LanguageFeatures
|
|||||||
|
|
||||||
} // namespace CPlusPlus
|
} // namespace CPlusPlus
|
||||||
|
|
||||||
|
|
||||||
#endif // CPLUSPLUS_TOKEN_H
|
#endif // CPLUSPLUS_TOKEN_H
|
||||||
|
|||||||
@@ -61,11 +61,11 @@ bool SimpleLexer::endedJoined() const
|
|||||||
return _endedJoined;
|
return _endedJoined;
|
||||||
}
|
}
|
||||||
|
|
||||||
QList<Token> SimpleLexer::operator()(const QString &text, int state)
|
QList<Token> SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8)
|
||||||
{
|
{
|
||||||
QList<Token> tokens;
|
QList<Token> tokens;
|
||||||
|
|
||||||
const QByteArray bytes = text.toLatin1();
|
const QByteArray bytes = convertToUtf8 ? text.toUtf8() : text.toLatin1();
|
||||||
const char *firstChar = bytes.constData();
|
const char *firstChar = bytes.constData();
|
||||||
const char *lastChar = firstChar + bytes.size();
|
const char *lastChar = firstChar + bytes.size();
|
||||||
|
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ public:
|
|||||||
|
|
||||||
bool endedJoined() const;
|
bool endedJoined() const;
|
||||||
|
|
||||||
QList<Token> operator()(const QString &text, int state = 0);
|
QList<Token> operator()(const QString &text, int state = 0, bool convertToUtf8 = false);
|
||||||
|
|
||||||
int state() const
|
int state() const
|
||||||
{ return _lastState; }
|
{ return _lastState; }
|
||||||
|
|||||||
@@ -12,4 +12,5 @@ SUBDIRS = \
|
|||||||
misc \
|
misc \
|
||||||
cxx11 \
|
cxx11 \
|
||||||
checksymbols \
|
checksymbols \
|
||||||
lexer
|
lexer \
|
||||||
|
translationunit
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ Project {
|
|||||||
"misc/misc.qbs",
|
"misc/misc.qbs",
|
||||||
"preprocessor/preprocessor.qbs",
|
"preprocessor/preprocessor.qbs",
|
||||||
"semantic/semantic.qbs",
|
"semantic/semantic.qbs",
|
||||||
|
"translationunit/translationunit.qbs",
|
||||||
"typeprettyprinter/typeprettyprinter.qbs"
|
"typeprettyprinter/typeprettyprinter.qbs"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -52,28 +52,49 @@ class tst_SimpleLexer: public QObject
|
|||||||
public:
|
public:
|
||||||
tst_SimpleLexer() : _state(0) {}
|
tst_SimpleLexer() : _state(0) {}
|
||||||
|
|
||||||
|
enum TokenCompareFlag {
|
||||||
|
CompareKind = 1 << 1,
|
||||||
|
CompareBytes = 1 << 2,
|
||||||
|
CompareBytesBegin = 1 << 3,
|
||||||
|
CompareBytesEnd = 1 << 4,
|
||||||
|
CompareUtf16Chars = 1 << 5,
|
||||||
|
CompareUtf16CharsBegin = 1 << 6,
|
||||||
|
CompareUtf16CharsEnd = 1 << 7
|
||||||
|
};
|
||||||
|
Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
|
||||||
|
|
||||||
private slots:
|
private slots:
|
||||||
void basic();
|
void basic();
|
||||||
void basic_data();
|
void basic_data();
|
||||||
void incremental();
|
void incremental();
|
||||||
void incremental_data();
|
void incremental_data();
|
||||||
|
|
||||||
|
//
|
||||||
|
// The following "non-latin1" code points are used in the tests following this comment:
|
||||||
|
//
|
||||||
|
// U+00FC - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
|
||||||
|
// U+4E8C - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
|
||||||
|
// U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
|
||||||
|
//
|
||||||
|
|
||||||
|
void bytes_and_utf16chars();
|
||||||
|
void bytes_and_utf16chars_data();
|
||||||
|
void offsets();
|
||||||
|
void offsets_data();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static TokenList toTokenList(const TokenKindList &tokenKinds);
|
static TokenList toTokenList(const TokenKindList &tokenKinds);
|
||||||
|
|
||||||
enum TokenCompareFlag {
|
|
||||||
CompareKind = 1 << 1
|
|
||||||
};
|
|
||||||
Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
|
|
||||||
|
|
||||||
void run(const QByteArray &source,
|
void run(const QByteArray &source,
|
||||||
const TokenList &expectedTokenList,
|
const TokenList &expectedTokenList,
|
||||||
bool preserveState,
|
bool preserveState,
|
||||||
TokenCompareFlag compareFlags);
|
TokenCompareFlags compareFlags);
|
||||||
|
|
||||||
int _state;
|
int _state;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Q_DECLARE_OPERATORS_FOR_FLAGS(tst_SimpleLexer::TokenCompareFlags)
|
||||||
|
|
||||||
TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
|
TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
|
||||||
{
|
{
|
||||||
TokenList tokens;
|
TokenList tokens;
|
||||||
@@ -88,10 +109,13 @@ TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
|
|||||||
void tst_SimpleLexer::run(const QByteArray &source,
|
void tst_SimpleLexer::run(const QByteArray &source,
|
||||||
const TokenList &expectedTokenList,
|
const TokenList &expectedTokenList,
|
||||||
bool preserveState,
|
bool preserveState,
|
||||||
TokenCompareFlag compareFlags)
|
TokenCompareFlags compareFlags)
|
||||||
{
|
{
|
||||||
|
QVERIFY(compareFlags);
|
||||||
|
|
||||||
SimpleLexer lexer;
|
SimpleLexer lexer;
|
||||||
const QList<Token> tokenList = lexer(source, preserveState ? _state : 0);
|
const QList<Token> tokenList = lexer(source, preserveState ? _state : 0,
|
||||||
|
/*convertToUtf8=*/ true);
|
||||||
if (preserveState)
|
if (preserveState)
|
||||||
_state = lexer.state();
|
_state = lexer.state();
|
||||||
|
|
||||||
@@ -108,6 +132,20 @@ void tst_SimpleLexer::run(const QByteArray &source,
|
|||||||
#endif
|
#endif
|
||||||
if (compareFlags & CompareKind)
|
if (compareFlags & CompareKind)
|
||||||
QCOMPARE(token.kind(), expectedToken.kind());
|
QCOMPARE(token.kind(), expectedToken.kind());
|
||||||
|
|
||||||
|
if (compareFlags & CompareBytes)
|
||||||
|
QCOMPARE(token.bytes(), expectedToken.bytes());
|
||||||
|
if (compareFlags & CompareBytesBegin)
|
||||||
|
QCOMPARE(token.bytesBegin(), expectedToken.bytesBegin());
|
||||||
|
if (compareFlags & CompareBytesEnd)
|
||||||
|
QCOMPARE(token.bytesEnd(), expectedToken.bytesEnd());
|
||||||
|
|
||||||
|
if (compareFlags & CompareUtf16Chars)
|
||||||
|
QCOMPARE(token.utf16chars(), expectedToken.utf16chars());
|
||||||
|
if (compareFlags & CompareUtf16CharsBegin)
|
||||||
|
QCOMPARE(token.utf16charsBegin(), expectedToken.utf16charsBegin());
|
||||||
|
if (compareFlags & CompareUtf16CharsEnd)
|
||||||
|
QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd());
|
||||||
}
|
}
|
||||||
QVERIFY2(i == expectedTokenList.size(), "Less tokens than expected.");
|
QVERIFY2(i == expectedTokenList.size(), "Less tokens than expected.");
|
||||||
}
|
}
|
||||||
@@ -221,7 +259,168 @@ void tst_SimpleLexer::basic_data()
|
|||||||
<< T_LBRACKET << T_RBRACKET << T_LBRACE << T_RBRACE
|
<< T_LBRACKET << T_RBRACKET << T_LBRACE << T_RBRACE
|
||||||
<< T_IDENTIFIER << T_QUESTION << T_IDENTIFIER << T_COLON << T_IDENTIFIER;
|
<< T_IDENTIFIER << T_QUESTION << T_IDENTIFIER << T_COLON << T_IDENTIFIER;
|
||||||
QTest::newRow(source) << source << expectedTokenKindList;
|
QTest::newRow(source) << source << expectedTokenKindList;
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_SimpleLexer::bytes_and_utf16chars()
|
||||||
|
{
|
||||||
|
QFETCH(QByteArray, source);
|
||||||
|
QFETCH(QList<Token>, expectedTokenList);
|
||||||
|
|
||||||
|
const TokenCompareFlags compareFlags = CompareKind | CompareBytes | CompareUtf16Chars;
|
||||||
|
run(source, expectedTokenList, false, compareFlags);
|
||||||
|
}
|
||||||
|
|
||||||
|
static QList<Token> createToken(unsigned kind, unsigned bytes, unsigned utf16chars)
|
||||||
|
{
|
||||||
|
Token t;
|
||||||
|
t.f.kind = kind;
|
||||||
|
t.f.bytes = bytes;
|
||||||
|
t.f.utf16chars = utf16chars;
|
||||||
|
return QList<Token>() << t;
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_SimpleLexer::bytes_and_utf16chars_data()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QByteArray>("source");
|
||||||
|
QTest::addColumn<QList<Token> >("expectedTokenList");
|
||||||
|
|
||||||
|
typedef QByteArray _;
|
||||||
|
|
||||||
|
// LATIN1 Identifier
|
||||||
|
QTest::newRow("latin1 identifier")
|
||||||
|
<< _("var") << createToken(T_IDENTIFIER, 3, 3);
|
||||||
|
|
||||||
|
// NON-LATIN1 identifier (code point with 2 UTF8 code units)
|
||||||
|
QTest::newRow("non-latin1 identifier (2-byte code unit at start)")
|
||||||
|
<< _("\u00FC_var") << createToken(T_IDENTIFIER, 6, 5);
|
||||||
|
QTest::newRow("non-latin1 identifier (2-byte code unit in center)")
|
||||||
|
<< _("_v\u00FCr_") << createToken(T_IDENTIFIER, 6, 5);
|
||||||
|
QTest::newRow("non-latin1 identifier (2-byte code unit at end)")
|
||||||
|
<< _("var_\u00FC") << createToken(T_IDENTIFIER, 6, 5);
|
||||||
|
QTest::newRow("non-latin1 identifier (2-byte code unit only)")
|
||||||
|
<< _("\u00FC") << createToken(T_IDENTIFIER, 2, 1);
|
||||||
|
|
||||||
|
// NON-LATIN1 identifier (code point with 3 UTF8 code units)
|
||||||
|
QTest::newRow("non-latin1 identifier (3-byte code unit at start)")
|
||||||
|
<< _("\u4E8C_var") << createToken(T_IDENTIFIER, 7, 5);
|
||||||
|
QTest::newRow("non-latin1 identifier (3-byte code unit in center)")
|
||||||
|
<< _("_v\u4E8Cr_") << createToken(T_IDENTIFIER, 7, 5);
|
||||||
|
QTest::newRow("non-latin1 identifier (3-byte code unit at end)")
|
||||||
|
<< _("var_\u4E8C") << createToken(T_IDENTIFIER, 7, 5);
|
||||||
|
QTest::newRow("non-latin1 identifier (3-byte code unit only)")
|
||||||
|
<< _("\u4E8C") << createToken(T_IDENTIFIER, 3, 1);
|
||||||
|
|
||||||
|
// NON-LATIN1 identifier (code point with 4 UTF8 code units)
|
||||||
|
QTest::newRow("non-latin1 identifier (4-byte code unit at start)")
|
||||||
|
<< _("\U00010302_var") << createToken(T_IDENTIFIER, 8, 6);
|
||||||
|
QTest::newRow("non-latin1 identifier (4-byte code unit in center)")
|
||||||
|
<< _("_v\U00010302r_") << createToken(T_IDENTIFIER, 8, 6);
|
||||||
|
QTest::newRow("non-latin1 identifier (4-byte code unit at end)")
|
||||||
|
<< _("var_\U00010302") << createToken(T_IDENTIFIER, 8, 6);
|
||||||
|
QTest::newRow("non-latin1 identifier (4-byte code unit only)")
|
||||||
|
<< _("\U00010302") << createToken(T_IDENTIFIER, 4, 2);
|
||||||
|
|
||||||
|
// NON-LATIN1 identifier (code points with several multi-byte UTF8 code units)
|
||||||
|
QTest::newRow("non-latin1 identifier (mixed multi-byte code units at start)")
|
||||||
|
<< _("\u00FC\u4E8C\U00010302_var") << createToken(T_IDENTIFIER, 13, 8);
|
||||||
|
QTest::newRow("non-latin1 identifier (mixed multi-byte code units in center)")
|
||||||
|
<< _("_v\u00FC\u4E8C\U00010302r_") << createToken(T_IDENTIFIER, 13, 8);
|
||||||
|
QTest::newRow("non-latin1 identifier (mixed multi-byte code units at end)")
|
||||||
|
<< _("var_\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 13, 8);
|
||||||
|
QTest::newRow("non-latin1 identifier (mixed multi-byte code units only)")
|
||||||
|
<< _("\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 9, 4);
|
||||||
|
|
||||||
|
// Comments
|
||||||
|
QTest::newRow("ascii comment /* ... */")
|
||||||
|
<< _("/* hello world */") << createToken(T_COMMENT, 17, 17);
|
||||||
|
QTest::newRow("latin1 comment //")
|
||||||
|
<< _("// hello world") << createToken(T_CPP_COMMENT, 14, 14);
|
||||||
|
QTest::newRow("non-latin1 comment /* ... */ (1)")
|
||||||
|
<< _("/* \u00FC\u4E8C\U00010302 */") << createToken(T_COMMENT, 15, 10);
|
||||||
|
QTest::newRow("non-latin1 comment /* ... */ (2)")
|
||||||
|
<< _("/*\u00FC\u4E8C\U00010302*/") << createToken(T_COMMENT, 13, 8);
|
||||||
|
QTest::newRow("non-latin1 comment // (1)")
|
||||||
|
<< _("// \u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 12, 7);
|
||||||
|
QTest::newRow("non-latin1 comment // (2)")
|
||||||
|
<< _("//\u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 11, 6);
|
||||||
|
|
||||||
|
// String Literals
|
||||||
|
QTest::newRow("latin1 string literal")
|
||||||
|
<< _("\"hello\"") << createToken(T_STRING_LITERAL, 7, 7);
|
||||||
|
QTest::newRow("non-latin1 string literal")
|
||||||
|
<< _("\"\u00FC\u4E8C\U00010302\"") << createToken(T_STRING_LITERAL, 11, 6);
|
||||||
|
}
|
||||||
|
|
||||||
|
static Token createToken(unsigned kind, unsigned byteOffset, unsigned bytes,
|
||||||
|
unsigned utf16charsOffset, unsigned utf16chars)
|
||||||
|
{
|
||||||
|
Token t;
|
||||||
|
t.f.kind = kind;
|
||||||
|
t.byteOffset = byteOffset;
|
||||||
|
t.f.bytes = bytes;
|
||||||
|
t.utf16charOffset = utf16charsOffset;
|
||||||
|
t.f.utf16chars = utf16chars;
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_SimpleLexer::offsets()
|
||||||
|
{
|
||||||
|
QFETCH(QByteArray, source);
|
||||||
|
QFETCH(QList<Token>, expectedTokenList);
|
||||||
|
|
||||||
|
const TokenCompareFlags compareFlags = CompareKind
|
||||||
|
| CompareBytesBegin
|
||||||
|
| CompareBytesEnd
|
||||||
|
| CompareUtf16CharsBegin
|
||||||
|
| CompareUtf16CharsEnd
|
||||||
|
;
|
||||||
|
run(source, expectedTokenList, false, compareFlags);
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_SimpleLexer::offsets_data()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QByteArray>("source");
|
||||||
|
QTest::addColumn<QList<Token> >("expectedTokenList");
|
||||||
|
|
||||||
|
typedef QByteArray _;
|
||||||
|
|
||||||
|
// LATIN1 Identifier
|
||||||
|
QTest::newRow("latin1 identifiers")
|
||||||
|
<< _("var var") << (QList<Token>()
|
||||||
|
<< createToken(T_IDENTIFIER, 0, 3, 0, 3)
|
||||||
|
<< createToken(T_IDENTIFIER, 4, 3, 4, 3)
|
||||||
|
);
|
||||||
|
|
||||||
|
// NON-LATIN1 identifier
|
||||||
|
QTest::newRow("non-latin1 identifiers 1")
|
||||||
|
<< _("var_\u00FC var_\u00FC") << (QList<Token>()
|
||||||
|
<< createToken(T_IDENTIFIER, 0, 6, 0, 5)
|
||||||
|
<< createToken(T_IDENTIFIER, 7, 6, 6, 5)
|
||||||
|
);
|
||||||
|
QTest::newRow("non-latin1 identifiers 2")
|
||||||
|
<< _("\u00FC\u4E8C\U00010302 \u00FC\u4E8C\U00010302") << (QList<Token>()
|
||||||
|
<< createToken(T_IDENTIFIER, 0, 9, 0, 4)
|
||||||
|
<< createToken(T_IDENTIFIER, 10, 9, 5, 4)
|
||||||
|
);
|
||||||
|
|
||||||
|
QTest::newRow("non-latin1 identifiers 3") // first code unit on line: <bytes> / <utf16char>
|
||||||
|
<< _("class v\u00FC\u4E8C\U00010302\n" // 0 / 0
|
||||||
|
"{\n" // 17 / 12
|
||||||
|
"public:\n" // 19 / 14
|
||||||
|
" v\u00FC\u4E8C\U00010302();\n" // 27 / 22
|
||||||
|
"};\n") << (QList<Token>() // 45 / 35
|
||||||
|
<< createToken(T_CLASS, 0, 5, 0, 5) // class
|
||||||
|
<< createToken(T_IDENTIFIER, 6, 10, 6, 5) // non-latin1 id
|
||||||
|
<< createToken(T_LBRACE, 17, 1, 12, 1) // {
|
||||||
|
<< createToken(T_PUBLIC, 19, 6, 14, 6) // public
|
||||||
|
<< createToken(T_COLON, 25, 1, 20, 1) // :
|
||||||
|
<< createToken(T_IDENTIFIER, 31, 10, 26, 5) // id
|
||||||
|
<< createToken(T_LPAREN, 41, 1, 31, 1) // (
|
||||||
|
<< createToken(T_RPAREN, 42, 1, 32, 1) // )
|
||||||
|
<< createToken(T_SEMICOLON, 43, 1, 33, 1) // ;
|
||||||
|
<< createToken(T_RBRACE, 45, 1, 35, 1) // }
|
||||||
|
<< createToken(T_SEMICOLON, 46, 1, 36, 1) // ;
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
void tst_SimpleLexer::incremental()
|
void tst_SimpleLexer::incremental()
|
||||||
|
|||||||
2
tests/auto/cplusplus/translationunit/translationunit.pro
Normal file
2
tests/auto/cplusplus/translationunit/translationunit.pro
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
include(../shared/shared.pri)
|
||||||
|
SOURCES += tst_translationunit.cpp
|
||||||
7
tests/auto/cplusplus/translationunit/translationunit.qbs
Normal file
7
tests/auto/cplusplus/translationunit/translationunit.qbs
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
import qbs
|
||||||
|
import "../cplusplusautotest.qbs" as CPlusPlusAutotest
|
||||||
|
|
||||||
|
CPlusPlusAutotest {
|
||||||
|
name: "CPlusPlus translation unit autotest"
|
||||||
|
files: "tst_translationunit.cpp"
|
||||||
|
}
|
||||||
225
tests/auto/cplusplus/translationunit/tst_translationunit.cpp
Normal file
225
tests/auto/cplusplus/translationunit/tst_translationunit.cpp
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
/****************************************************************************
|
||||||
|
**
|
||||||
|
** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies).
|
||||||
|
** Contact: http://www.qt-project.org/legal
|
||||||
|
**
|
||||||
|
** This file is part of Qt Creator.
|
||||||
|
**
|
||||||
|
** Commercial License Usage
|
||||||
|
** Licensees holding valid commercial Qt licenses may use this file in
|
||||||
|
** accordance with the commercial license agreement provided with the
|
||||||
|
** Software or, alternatively, in accordance with the terms contained in
|
||||||
|
** a written agreement between you and Digia. For licensing terms and
|
||||||
|
** conditions see http://qt.digia.com/licensing. For further information
|
||||||
|
** use the contact form at http://qt.digia.com/contact-us.
|
||||||
|
**
|
||||||
|
** GNU Lesser General Public License Usage
|
||||||
|
** Alternatively, this file may be used under the terms of the GNU Lesser
|
||||||
|
** General Public License version 2.1 as published by the Free Software
|
||||||
|
** Foundation and appearing in the file LICENSE.LGPL included in the
|
||||||
|
** packaging of this file. Please review the following information to
|
||||||
|
** ensure the GNU Lesser General Public License version 2.1 requirements
|
||||||
|
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
|
||||||
|
**
|
||||||
|
** In addition, as a special exception, Digia gives you certain additional
|
||||||
|
** rights. These rights are described in the Digia Qt LGPL Exception
|
||||||
|
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
|
||||||
|
**
|
||||||
|
****************************************************************************/
|
||||||
|
|
||||||
|
#include <cplusplus/PreprocessorClient.h>
|
||||||
|
#include <cplusplus/PreprocessorEnvironment.h>
|
||||||
|
#include <cplusplus/Token.h>
|
||||||
|
#include <cplusplus/TranslationUnit.h>
|
||||||
|
#include <cplusplus/pp-engine.h>
|
||||||
|
|
||||||
|
#include <QtTest>
|
||||||
|
#include <QDebug>
|
||||||
|
|
||||||
|
//TESTED_COMPONENT=src/libs/cplusplus
|
||||||
|
using namespace CPlusPlus;
|
||||||
|
|
||||||
|
class tst_TranslationUnit: public QObject
|
||||||
|
{
|
||||||
|
Q_OBJECT
|
||||||
|
private slots:
|
||||||
|
|
||||||
|
//
|
||||||
|
// The following "non-latin1" code points are used in the tests following this comment:
|
||||||
|
//
|
||||||
|
// U+00FC - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
|
||||||
|
// U+4E8C - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
|
||||||
|
// U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
|
||||||
|
//
|
||||||
|
|
||||||
|
void unicodeIdentifier();
|
||||||
|
void unicodeIdentifier_data();
|
||||||
|
|
||||||
|
void unicodeStringLiteral();
|
||||||
|
void unicodeStringLiteral_data();
|
||||||
|
|
||||||
|
private:
|
||||||
|
class Document
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
typedef QSharedPointer<Document> Ptr;
|
||||||
|
|
||||||
|
static Document::Ptr create(const QByteArray &source)
|
||||||
|
{
|
||||||
|
LanguageFeatures features;
|
||||||
|
features.objCEnabled = true;
|
||||||
|
features.qtEnabled = false;
|
||||||
|
features.qtKeywordsEnabled = false;
|
||||||
|
features.qtMocRunEnabled = false;
|
||||||
|
|
||||||
|
Document::Ptr document = Document::Ptr(new Document);
|
||||||
|
document->translationUnit()->setLanguageFeatures(features);
|
||||||
|
const QByteArray preprocessedSource = preprocess(source);
|
||||||
|
document->translationUnit()->setSource(preprocessedSource.constData(),
|
||||||
|
preprocessedSource.length());
|
||||||
|
document->translationUnit()->parse();
|
||||||
|
|
||||||
|
if (document->hasParsingErrors())
|
||||||
|
return Document::Ptr();
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
Document()
|
||||||
|
: m_translationUnit(&m_control, m_control.stringLiteral("testFile"))
|
||||||
|
{
|
||||||
|
m_control.setDiagnosticClient(&m_diagnosticClient);
|
||||||
|
}
|
||||||
|
|
||||||
|
TranslationUnit *translationUnit()
|
||||||
|
{ return &m_translationUnit; }
|
||||||
|
|
||||||
|
bool hasParsingErrors() const
|
||||||
|
{ return m_diagnosticClient.errorCount != 0; }
|
||||||
|
|
||||||
|
const Identifier *lastIdentifier() const
|
||||||
|
{ return *(m_control.lastIdentifier() - 1); }
|
||||||
|
|
||||||
|
const StringLiteral *lastStringLiteral() const
|
||||||
|
{ return *(m_control.lastStringLiteral() - 1); }
|
||||||
|
|
||||||
|
private:
|
||||||
|
static QByteArray preprocess(const QByteArray &source)
|
||||||
|
{
|
||||||
|
Client *client = 0; // no client.
|
||||||
|
Environment env;
|
||||||
|
Preprocessor preprocess(client, &env);
|
||||||
|
preprocess.setKeepComments(true);
|
||||||
|
return preprocess.run(QLatin1String("<stdin>"), source);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
Control m_control;
|
||||||
|
TranslationUnit m_translationUnit;
|
||||||
|
|
||||||
|
class Diagnostic: public DiagnosticClient {
|
||||||
|
public:
|
||||||
|
int errorCount;
|
||||||
|
|
||||||
|
Diagnostic() : errorCount(0) {}
|
||||||
|
|
||||||
|
void report(int /*level*/, const StringLiteral *fileName, unsigned line,
|
||||||
|
unsigned column, const char *format, va_list ap)
|
||||||
|
{
|
||||||
|
++errorCount;
|
||||||
|
qDebug() << fileName->chars() << ':' << line << ':' << column
|
||||||
|
<< ' ' << QString().vsprintf(format, ap);
|
||||||
|
}
|
||||||
|
} m_diagnosticClient;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
void tst_TranslationUnit::unicodeIdentifier()
|
||||||
|
{
|
||||||
|
QFETCH(QByteArray, identifierText);
|
||||||
|
|
||||||
|
Document::Ptr document = Document::create("void " + identifierText + ";");
|
||||||
|
QVERIFY(document);
|
||||||
|
|
||||||
|
const Identifier *actual = document->lastIdentifier();
|
||||||
|
QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()),
|
||||||
|
QString::fromUtf8(identifierText));
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_TranslationUnit::unicodeIdentifier_data()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QByteArray>("identifierText");
|
||||||
|
|
||||||
|
typedef QByteArray _;
|
||||||
|
|
||||||
|
QTest::newRow("latin1 identifier") << _("var");
|
||||||
|
|
||||||
|
QTest::newRow("non-latin1 identifier 1") << _("prefix\u00FC\u4E8C\U00010302");
|
||||||
|
QTest::newRow("non-latin1 identifier 2") << _("prefix\U00010302\u00FC\u4E8C");
|
||||||
|
QTest::newRow("non-latin1 identifier 3") << _("\U00010302\u00FC\u4E8C");
|
||||||
|
QTest::newRow("non-latin1 identifier 4") << _("\u4E8C\U00010302\u00FC");
|
||||||
|
QTest::newRow("non-latin1 identifier 5") << _("\u4E8C\U00010302\u00FCsuffix");
|
||||||
|
QTest::newRow("non-latin1 identifier 6") << _("\U00010302\u00FC\u4E8Csuffix");
|
||||||
|
|
||||||
|
// Some special cases (different code path inside lexer)
|
||||||
|
QTest::newRow("non-latin1 identifier 7") << _("LR\U00010302\u00FC\u4E8C");
|
||||||
|
QTest::newRow("non-latin1 identifier 8") << _("u8R\U00010302\u00FC\u4E8C");
|
||||||
|
QTest::newRow("non-latin1 identifier 9") << _("u8\U00010302\u00FC\u4E8C");
|
||||||
|
QTest::newRow("non-latin1 identifier 10") << _("u\U00010302\u00FC\u4E8C");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the contents of the string literal \a literal without its
// delimiters: everything up to and including the first '"' (for example an
// encoding prefix such as L, u8, u or U) is dropped, then trailing '"' and
// whitespace bytes are removed. Returns an empty array if \a literal contains
// no '"' at all.
static QByteArray stripQuotesFromLiteral(const QByteArray &literal)
{
    // Strip front: locate the opening quote directly instead of copying the
    // array once per skipped byte.
    const int openingQuote = literal.indexOf('"');
    if (openingQuote == -1)
        return QByteArray();

    QByteArray result = literal.mid(openingQuote + 1);

    // Strip end. Checking for "not empty" (rather than "at least two bytes")
    // makes the empty literal "" yield an empty result instead of a lone '"'.
    while (!result.isEmpty()) {
        const char last = result.at(result.size() - 1);
        // UTF-8 continuation bytes are negative where char is signed; they
        // must be converted to unsigned char before calling std::isspace,
        // otherwise the behavior is undefined.
        if (last != '"' && !std::isspace(static_cast<unsigned char>(last)))
            break;
        result.chop(1);
    }

    return result;
}
|
||||||
|
|
||||||
|
void tst_TranslationUnit::unicodeStringLiteral()
|
||||||
|
{
|
||||||
|
QFETCH(QByteArray, literalText);
|
||||||
|
|
||||||
|
Document::Ptr document = Document::create("char t[] = " + literalText + ";");
|
||||||
|
QVERIFY(document);
|
||||||
|
|
||||||
|
const StringLiteral *actual = document->lastStringLiteral();
|
||||||
|
QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()),
|
||||||
|
QString::fromUtf8(stripQuotesFromLiteral(literalText)));
|
||||||
|
}
|
||||||
|
|
||||||
|
void tst_TranslationUnit::unicodeStringLiteral_data()
|
||||||
|
{
|
||||||
|
QTest::addColumn<QByteArray>("literalText");
|
||||||
|
|
||||||
|
typedef QByteArray _;
|
||||||
|
|
||||||
|
QTest::newRow("latin1 literal") << _("\"var\"");
|
||||||
|
|
||||||
|
QTest::newRow("non-latin1 literal 1") << _("\"prefix\u00FC\u4E8C\U00010302\"");
|
||||||
|
QTest::newRow("non-latin1 literal 2") << _("\"prefix\U00010302\u00FC\u4E8C\"");
|
||||||
|
QTest::newRow("non-latin1 literal 3") << _("\"\U00010302\u00FC\u4E8C\"");
|
||||||
|
QTest::newRow("non-latin1 literal 4") << _("\"\u4E8C\U00010302\u00FC\"");
|
||||||
|
QTest::newRow("non-latin1 literal 5") << _("\"\u4E8C\U00010302\u00FCsuffix\"");
|
||||||
|
QTest::newRow("non-latin1 literal 6") << _("\"\U00010302\u00FC\u4E8Csuffix\"");
|
||||||
|
|
||||||
|
QTest::newRow("non-latin1 literal 7") << _("L\"\U00010302\u00FC\u4E8C\"");
|
||||||
|
QTest::newRow("non-latin1 literal 8") << _("u8\"\U00010302\u00FC\u4E8C\"");
|
||||||
|
QTest::newRow("non-latin1 literal 9") << _("u\"\U00010302\u00FC\u4E8C\"");
|
||||||
|
QTest::newRow("non-latin1 literal 10") << _("U\"\U00010302\u00FC\u4E8C\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
QTEST_APPLESS_MAIN(tst_TranslationUnit)
|
||||||
|
#include "tst_translationunit.moc"
|
||||||
Reference in New Issue
Block a user