forked from qt-creator/qt-creator
C++: Support for UTF-8 in the lexer
This will save us toLatin1() conversations in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.
API-wise the following functions are added to Token. In follow-up
patches these will become handy in combination with QStrings.
utf16chars() - aequivalent of bytes()
utf16charsBegin() - aequivalent of bytesBegin()
utf16charsEnd() - aequivalent of bytesEnd()
Next steps:
* Adapt functions from TranslationUnit. They should work with utf16
chars in order to calculate lines and columns correctly also for
UTF-8 multi-byte code points.
* Adapt the higher level clients:
* Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
* Cpp{Tools,Editor}: When dealing with identifiers on the
QString/QTextDocument layer, code points
represendet by two QChars need to be respected, too.
* Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
offsets usable in CppEditor/CppTools.
Addresses QTCREATORBUG-7356.
Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
This commit is contained in:
@@ -52,28 +52,49 @@ class tst_SimpleLexer: public QObject
|
||||
public:
|
||||
tst_SimpleLexer() : _state(0) {}
|
||||
|
||||
enum TokenCompareFlag {
|
||||
CompareKind = 1 << 1,
|
||||
CompareBytes = 1 << 2,
|
||||
CompareBytesBegin = 1 << 3,
|
||||
CompareBytesEnd = 1 << 4,
|
||||
CompareUtf16Chars = 1 << 5,
|
||||
CompareUtf16CharsBegin = 1 << 6,
|
||||
CompareUtf16CharsEnd = 1 << 7
|
||||
};
|
||||
Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
|
||||
|
||||
private slots:
|
||||
void basic();
|
||||
void basic_data();
|
||||
void incremental();
|
||||
void incremental_data();
|
||||
|
||||
//
|
||||
// The following "non-latin1" code points are used in the tests following this comment:
|
||||
//
|
||||
// U+00FC - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
|
||||
// U+4E8C - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
|
||||
// U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
|
||||
//
|
||||
|
||||
void bytes_and_utf16chars();
|
||||
void bytes_and_utf16chars_data();
|
||||
void offsets();
|
||||
void offsets_data();
|
||||
|
||||
private:
|
||||
static TokenList toTokenList(const TokenKindList &tokenKinds);
|
||||
|
||||
enum TokenCompareFlag {
|
||||
CompareKind = 1 << 1
|
||||
};
|
||||
Q_DECLARE_FLAGS(TokenCompareFlags, TokenCompareFlag)
|
||||
|
||||
void run(const QByteArray &source,
|
||||
const TokenList &expectedTokenList,
|
||||
bool preserveState,
|
||||
TokenCompareFlag compareFlags);
|
||||
TokenCompareFlags compareFlags);
|
||||
|
||||
int _state;
|
||||
};
|
||||
|
||||
Q_DECLARE_OPERATORS_FOR_FLAGS(tst_SimpleLexer::TokenCompareFlags)
|
||||
|
||||
TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
|
||||
{
|
||||
TokenList tokens;
|
||||
@@ -88,10 +109,13 @@ TokenList tst_SimpleLexer::toTokenList(const TokenKindList &tokenKinds)
|
||||
void tst_SimpleLexer::run(const QByteArray &source,
|
||||
const TokenList &expectedTokenList,
|
||||
bool preserveState,
|
||||
TokenCompareFlag compareFlags)
|
||||
TokenCompareFlags compareFlags)
|
||||
{
|
||||
QVERIFY(compareFlags);
|
||||
|
||||
SimpleLexer lexer;
|
||||
const QList<Token> tokenList = lexer(source, preserveState ? _state : 0);
|
||||
const QList<Token> tokenList = lexer(source, preserveState ? _state : 0,
|
||||
/*convertToUtf8=*/ true);
|
||||
if (preserveState)
|
||||
_state = lexer.state();
|
||||
|
||||
@@ -108,6 +132,20 @@ void tst_SimpleLexer::run(const QByteArray &source,
|
||||
#endif
|
||||
if (compareFlags & CompareKind)
|
||||
QCOMPARE(token.kind(), expectedToken.kind());
|
||||
|
||||
if (compareFlags & CompareBytes)
|
||||
QCOMPARE(token.bytes(), expectedToken.bytes());
|
||||
if (compareFlags & CompareBytesBegin)
|
||||
QCOMPARE(token.bytesBegin(), expectedToken.bytesBegin());
|
||||
if (compareFlags & CompareBytesEnd)
|
||||
QCOMPARE(token.bytesEnd(), expectedToken.bytesEnd());
|
||||
|
||||
if (compareFlags & CompareUtf16Chars)
|
||||
QCOMPARE(token.utf16chars(), expectedToken.utf16chars());
|
||||
if (compareFlags & CompareUtf16CharsBegin)
|
||||
QCOMPARE(token.utf16charsBegin(), expectedToken.utf16charsBegin());
|
||||
if (compareFlags & CompareUtf16CharsEnd)
|
||||
QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd());
|
||||
}
|
||||
QVERIFY2(i == expectedTokenList.size(), "Less tokens than expected.");
|
||||
}
|
||||
@@ -221,7 +259,168 @@ void tst_SimpleLexer::basic_data()
|
||||
<< T_LBRACKET << T_RBRACKET << T_LBRACE << T_RBRACE
|
||||
<< T_IDENTIFIER << T_QUESTION << T_IDENTIFIER << T_COLON << T_IDENTIFIER;
|
||||
QTest::newRow(source) << source << expectedTokenKindList;
|
||||
}
|
||||
|
||||
void tst_SimpleLexer::bytes_and_utf16chars()
|
||||
{
|
||||
QFETCH(QByteArray, source);
|
||||
QFETCH(QList<Token>, expectedTokenList);
|
||||
|
||||
const TokenCompareFlags compareFlags = CompareKind | CompareBytes | CompareUtf16Chars;
|
||||
run(source, expectedTokenList, false, compareFlags);
|
||||
}
|
||||
|
||||
static QList<Token> createToken(unsigned kind, unsigned bytes, unsigned utf16chars)
|
||||
{
|
||||
Token t;
|
||||
t.f.kind = kind;
|
||||
t.f.bytes = bytes;
|
||||
t.f.utf16chars = utf16chars;
|
||||
return QList<Token>() << t;
|
||||
}
|
||||
|
||||
void tst_SimpleLexer::bytes_and_utf16chars_data()
|
||||
{
|
||||
QTest::addColumn<QByteArray>("source");
|
||||
QTest::addColumn<QList<Token> >("expectedTokenList");
|
||||
|
||||
typedef QByteArray _;
|
||||
|
||||
// LATIN1 Identifier
|
||||
QTest::newRow("latin1 identifier")
|
||||
<< _("var") << createToken(T_IDENTIFIER, 3, 3);
|
||||
|
||||
// NON-LATIN1 identifier (code point with 2 UTF8 code units)
|
||||
QTest::newRow("non-latin1 identifier (2-byte code unit at start)")
|
||||
<< _("\u00FC_var") << createToken(T_IDENTIFIER, 6, 5);
|
||||
QTest::newRow("non-latin1 identifier (2-byte code unit in center)")
|
||||
<< _("_v\u00FCr_") << createToken(T_IDENTIFIER, 6, 5);
|
||||
QTest::newRow("non-latin1 identifier (2-byte code unit at end)")
|
||||
<< _("var_\u00FC") << createToken(T_IDENTIFIER, 6, 5);
|
||||
QTest::newRow("non-latin1 identifier (2-byte code unit only)")
|
||||
<< _("\u00FC") << createToken(T_IDENTIFIER, 2, 1);
|
||||
|
||||
// NON-LATIN1 identifier (code point with 3 UTF8 code units)
|
||||
QTest::newRow("non-latin1 identifier (3-byte code unit at start)")
|
||||
<< _("\u4E8C_var") << createToken(T_IDENTIFIER, 7, 5);
|
||||
QTest::newRow("non-latin1 identifier (3-byte code unit in center)")
|
||||
<< _("_v\u4E8Cr_") << createToken(T_IDENTIFIER, 7, 5);
|
||||
QTest::newRow("non-latin1 identifier (3-byte code unit at end)")
|
||||
<< _("var_\u4E8C") << createToken(T_IDENTIFIER, 7, 5);
|
||||
QTest::newRow("non-latin1 identifier (3-byte code unit only)")
|
||||
<< _("\u4E8C") << createToken(T_IDENTIFIER, 3, 1);
|
||||
|
||||
// NON-LATIN1 identifier (code point with 4 UTF8 code units)
|
||||
QTest::newRow("non-latin1 identifier (4-byte code unit at start)")
|
||||
<< _("\U00010302_var") << createToken(T_IDENTIFIER, 8, 6);
|
||||
QTest::newRow("non-latin1 identifier (4-byte code unit in center)")
|
||||
<< _("_v\U00010302r_") << createToken(T_IDENTIFIER, 8, 6);
|
||||
QTest::newRow("non-latin1 identifier (4-byte code unit at end)")
|
||||
<< _("var_\U00010302") << createToken(T_IDENTIFIER, 8, 6);
|
||||
QTest::newRow("non-latin1 identifier (4-byte code unit only)")
|
||||
<< _("\U00010302") << createToken(T_IDENTIFIER, 4, 2);
|
||||
|
||||
// NON-LATIN1 identifier (code points with several multi-byte UTF8 code units)
|
||||
QTest::newRow("non-latin1 identifier (mixed multi-byte code units at start)")
|
||||
<< _("\u00FC\u4E8C\U00010302_var") << createToken(T_IDENTIFIER, 13, 8);
|
||||
QTest::newRow("non-latin1 identifier (mixed multi-byte code units in center)")
|
||||
<< _("_v\u00FC\u4E8C\U00010302r_") << createToken(T_IDENTIFIER, 13, 8);
|
||||
QTest::newRow("non-latin1 identifier (mixed multi-byte code units at end)")
|
||||
<< _("var_\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 13, 8);
|
||||
QTest::newRow("non-latin1 identifier (mixed multi-byte code units only)")
|
||||
<< _("\u00FC\u4E8C\U00010302") << createToken(T_IDENTIFIER, 9, 4);
|
||||
|
||||
// Comments
|
||||
QTest::newRow("ascii comment /* ... */")
|
||||
<< _("/* hello world */") << createToken(T_COMMENT, 17, 17);
|
||||
QTest::newRow("latin1 comment //")
|
||||
<< _("// hello world") << createToken(T_CPP_COMMENT, 14, 14);
|
||||
QTest::newRow("non-latin1 comment /* ... */ (1)")
|
||||
<< _("/* \u00FC\u4E8C\U00010302 */") << createToken(T_COMMENT, 15, 10);
|
||||
QTest::newRow("non-latin1 comment /* ... */ (2)")
|
||||
<< _("/*\u00FC\u4E8C\U00010302*/") << createToken(T_COMMENT, 13, 8);
|
||||
QTest::newRow("non-latin1 comment // (1)")
|
||||
<< _("// \u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 12, 7);
|
||||
QTest::newRow("non-latin1 comment // (2)")
|
||||
<< _("//\u00FC\u4E8C\U00010302") << createToken(T_CPP_COMMENT, 11, 6);
|
||||
|
||||
// String Literals
|
||||
QTest::newRow("latin1 string literal")
|
||||
<< _("\"hello\"") << createToken(T_STRING_LITERAL, 7, 7);
|
||||
QTest::newRow("non-latin1 string literal")
|
||||
<< _("\"\u00FC\u4E8C\U00010302\"") << createToken(T_STRING_LITERAL, 11, 6);
|
||||
}
|
||||
|
||||
static Token createToken(unsigned kind, unsigned byteOffset, unsigned bytes,
|
||||
unsigned utf16charsOffset, unsigned utf16chars)
|
||||
{
|
||||
Token t;
|
||||
t.f.kind = kind;
|
||||
t.byteOffset = byteOffset;
|
||||
t.f.bytes = bytes;
|
||||
t.utf16charOffset = utf16charsOffset;
|
||||
t.f.utf16chars = utf16chars;
|
||||
return t;
|
||||
}
|
||||
|
||||
void tst_SimpleLexer::offsets()
|
||||
{
|
||||
QFETCH(QByteArray, source);
|
||||
QFETCH(QList<Token>, expectedTokenList);
|
||||
|
||||
const TokenCompareFlags compareFlags = CompareKind
|
||||
| CompareBytesBegin
|
||||
| CompareBytesEnd
|
||||
| CompareUtf16CharsBegin
|
||||
| CompareUtf16CharsEnd
|
||||
;
|
||||
run(source, expectedTokenList, false, compareFlags);
|
||||
}
|
||||
|
||||
void tst_SimpleLexer::offsets_data()
|
||||
{
|
||||
QTest::addColumn<QByteArray>("source");
|
||||
QTest::addColumn<QList<Token> >("expectedTokenList");
|
||||
|
||||
typedef QByteArray _;
|
||||
|
||||
// LATIN1 Identifier
|
||||
QTest::newRow("latin1 identifiers")
|
||||
<< _("var var") << (QList<Token>()
|
||||
<< createToken(T_IDENTIFIER, 0, 3, 0, 3)
|
||||
<< createToken(T_IDENTIFIER, 4, 3, 4, 3)
|
||||
);
|
||||
|
||||
// NON-LATIN1 identifier
|
||||
QTest::newRow("non-latin1 identifiers 1")
|
||||
<< _("var_\u00FC var_\u00FC") << (QList<Token>()
|
||||
<< createToken(T_IDENTIFIER, 0, 6, 0, 5)
|
||||
<< createToken(T_IDENTIFIER, 7, 6, 6, 5)
|
||||
);
|
||||
QTest::newRow("non-latin1 identifiers 2")
|
||||
<< _("\u00FC\u4E8C\U00010302 \u00FC\u4E8C\U00010302") << (QList<Token>()
|
||||
<< createToken(T_IDENTIFIER, 0, 9, 0, 4)
|
||||
<< createToken(T_IDENTIFIER, 10, 9, 5, 4)
|
||||
);
|
||||
|
||||
QTest::newRow("non-latin1 identifiers 3") // first code unit on line: <bytes> / <utf16char>
|
||||
<< _("class v\u00FC\u4E8C\U00010302\n" // 0 / 0
|
||||
"{\n" // 17 / 12
|
||||
"public:\n" // 19 / 14
|
||||
" v\u00FC\u4E8C\U00010302();\n" // 27 / 22
|
||||
"};\n") << (QList<Token>() // 45 / 35
|
||||
<< createToken(T_CLASS, 0, 5, 0, 5) // class
|
||||
<< createToken(T_IDENTIFIER, 6, 10, 6, 5) // non-latin1 id
|
||||
<< createToken(T_LBRACE, 17, 1, 12, 1) // {
|
||||
<< createToken(T_PUBLIC, 19, 6, 14, 6) // public
|
||||
<< createToken(T_COLON, 25, 1, 20, 1) // :
|
||||
<< createToken(T_IDENTIFIER, 31, 10, 26, 5) // id
|
||||
<< createToken(T_LPAREN, 41, 1, 31, 1) // (
|
||||
<< createToken(T_RPAREN, 42, 1, 32, 1) // )
|
||||
<< createToken(T_SEMICOLON, 43, 1, 33, 1) // ;
|
||||
<< createToken(T_RBRACE, 45, 1, 35, 1) // }
|
||||
<< createToken(T_SEMICOLON, 46, 1, 36, 1) // ;
|
||||
);
|
||||
}
|
||||
|
||||
void tst_SimpleLexer::incremental()
|
||||
|
||||
Reference in New Issue
Block a user