C++: Support for UTF-8 in the lexer

This will save us toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.

API-wise the following functions are added to Token. In follow-up
patches these will come in handy in combination with QStrings.
    utf16chars() - equivalent of bytes()
    utf16charsBegin() - equivalent of bytesBegin()
    utf16charsEnd() - equivalent of bytesEnd()

Next steps:
 * Adapt functions from TranslationUnit. They should work with utf16
   chars in order to calculate lines and columns correctly also for
   UTF-8 multi-byte code points.
 * Adapt the higher level clients:
    * Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
    * Cpp{Tools,Editor}: When dealing with identifiers on the
      QString/QTextDocument layer, code points
      represented by two QChars need to be respected, too.
 * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
   offsets usable in CppEditor/CppTools.

Addresses QTCREATORBUG-7356.

Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
This commit is contained in:
Nikolai Kosjar
2014-02-25 13:44:11 -03:00
parent 4fefb1ca2a
commit 70122b3061
12 changed files with 503 additions and 28 deletions

View File

@@ -285,7 +285,7 @@ enum Kind {
class CPLUSPLUS_EXPORT Token
{
public:
Token() : flags(0), byteOffset(0), ptr(0) {}
Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {}
inline bool is(unsigned k) const { return f.kind == k; }
inline bool isNot(unsigned k) const { return f.kind != k; }
@@ -298,13 +298,14 @@ public:
inline bool joined() const { return f.joined; }
inline bool expanded() const { return f.expanded; }
inline bool generated() const { return f.generated; }
inline unsigned bytes() const { return f.bytes; }
inline unsigned bytesBegin() const { return byteOffset; }
inline unsigned bytesEnd() const { return byteOffset + f.bytes; }
inline unsigned bytesBegin() const
{ return byteOffset; }
inline unsigned bytesEnd() const
{ return byteOffset + f.bytes; }
inline unsigned utf16chars() const { return f.utf16chars; }
inline unsigned utf16charsBegin() const { return utf16charOffset; }
inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; }
inline bool isLiteral() const
{ return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
@@ -354,15 +355,17 @@ public:
unsigned generated : 1;
// Unused...
unsigned pad : 3;
// The token length in bytes.
// The token length in bytes and UTF16 chars.
unsigned bytes : 16;
unsigned utf16chars : 16;
};
union {
unsigned flags;
unsigned long flags;
Flags f;
};
unsigned byteOffset;
unsigned utf16charOffset;
union {
void *ptr;
@@ -393,5 +396,4 @@ struct LanguageFeatures
} // namespace CPlusPlus
#endif // CPLUSPLUS_TOKEN_H