C++: Add utf16 indices to Macro and Document::MacroUse

In most cases we need to work with the utf16 indices. Only in
cppfindreferences the byte interface is still needed since there we read
in files and work on a QByteArray to save memory.

Change-Id: I6ef6a93fc1875a8c9a305c075d51a9ca034c41bb
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
This commit is contained in:
Nikolai Kosjar
2014-05-09 10:04:13 -04:00
parent bb7da966b8
commit c6358e5d38
24 changed files with 345 additions and 215 deletions

View File

@@ -36,6 +36,21 @@ using namespace CPlusPlus;
\sa Token
*/
/*!
\fn static void Lexer::yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar, unsigned &utf16charCounter)
Processes a single Unicode code point in a UTF-8 encoded source.
\a currentSourceChar points to the UTF-8 encoded source.
\a yychar must be the byte pointed to by \a currentSourceChar.
Points \a currentSourceChar to the first byte of the next code point
and sets \a yychar to the byte pointed to by the updated
\a currentSourceChar. \a utf16charCounter is incremented by
the number of UTF-16 code units needed to represent that code
point.
*/
Lexer::Lexer(TranslationUnit *unit)
: _translationUnit(unit),
_control(unit->control()),

View File

@@ -61,6 +61,28 @@ public:
LanguageFeatures languageFeatures() const { return _languageFeatures; }
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
public:
// Advances past one code point of a UTF-8 encoded source.
//
// currentSourceChar: in/out. Points at the lead byte of the code point on
//                    entry and at the first byte after it on return.
// yychar:            in/out. Must equal *currentSourceChar on entry; holds
//                    the byte at the updated position on return.
// utf16charCounter:  incremented by the number of UTF-16 code units the code
//                    point occupies: 1, or 2 when the lead byte announces
//                    three or more trailing bytes (code points >= 0x10000).
//
// Trailing bytes are consumed one at a time and only while they really are
// continuation bytes (10xxxxxx). A truncated or malformed sequence therefore
// ends at the first non-continuation byte, including a terminating '\0',
// instead of jumping blindly over it and reading past the end of the input.
static void yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar,
                       unsigned &utf16charCounter)
{
    ++utf16charCounter;

    // Multi-byte UTF-8 code point (non-ASCII); rare in typical source code.
    if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(yychar))) {
        // The lead byte announces the sequence length: every leading 1-bit
        // after the first two stands for one more trailing byte.
        unsigned trailingBytesCurrentCodePoint = 1;
        for (unsigned char c = yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
            ++trailingBytesCurrentCodePoint;

        // Code points >= 0x00010000 are represented by two UTF-16 code units
        if (trailingBytesCurrentCodePoint >= 3)
            ++utf16charCounter;

        // Step over the lead byte, then over at most the announced number of
        // continuation bytes. Stopping at the first byte that is not of the
        // form 10xxxxxx keeps malformed input from skipping the end marker.
        yychar = *++currentSourceChar;
        while (trailingBytesCurrentCodePoint > 0 && (yychar & 0xC0) == 0x80) {
            yychar = *++currentSourceChar;
            --trailingBytesCurrentCodePoint;
        }

    // Single-byte UTF-8 code point (ASCII)
    } else {
        yychar = *++currentSourceChar;
    }
}
private:
void pushLineStartOffset();
void scan_helper(Token *tok);
@@ -83,23 +105,7 @@ private:
// Consumes the current code point of the lexer's input and makes the next
// one current.
void yyinp()
{
    // yyinp_utf8() advances _currentChar, refreshes _yychar and increments
    // _currentCharUtf16 on its own. Decoding the code point inline here as
    // well would consume two code points per call and count the UTF-16
    // offset twice, so all of that work is left to the helper.
    yyinp_utf8(_currentChar, _yychar, _currentCharUtf16);

    // A newline has just become the current character; let
    // pushLineStartOffset() register it.
    if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
        pushLineStartOffset();
}

View File

@@ -264,7 +264,7 @@ void TranslationUnit::tokenize()
currentExpanded = true;
const std::pair<unsigned, unsigned> &p = lineColumn[lineColumnIdx];
if (p.first)
_expandedLineColumn.insert(std::make_pair(tk.bytesBegin(), p));
_expandedLineColumn.insert(std::make_pair(tk.utf16charsBegin(), p));
else
currentGenerated = true;