CPaster: Fix fetching from pastebin

Parsing HTML with QXmlStreamReader is a bad idea, as the HTML
may not be well-formed XML.
Replace the XML parsing with regular expressions.

Change-Id: I230e9d2b8e13d8bd736cb3f05eb6de7f812aab5b
Reviewed-by: Christian Kandeler <christian.kandeler@qt.io>
This commit is contained in:
Christian Stenger
2020-09-16 15:15:01 +02:00
parent 707a3cfaf3
commit 5cdcb872df

View File

@@ -29,9 +29,8 @@
#include <utils/qtcassert.h>
#include <QDebug>
#include <QRegularExpression>
#include <QStringList>
#include <QXmlStreamReader>
#include <QXmlStreamAttributes>
#include <QByteArray>
#include <QNetworkReply>
@@ -218,38 +217,77 @@ enum ParseState
WithinTableElement, WithinTableElementAnchor, ParseError
};
QDebug operator<<(QDebug d, const QXmlStreamAttributes &al)
// Returns a copy of \a original in which numeric character references
// ("&#NNN;" decimal or "&#xHHH;" hexadecimal) are replaced by the character
// they denote. Named entities such as "&amp;" are not matched by the pattern
// below and are therefore left untouched.
static QString replaceEntities(const QString &original)
{
QDebug nospace = d.nospace();
foreach (const QXmlStreamAttribute &a, al)
nospace << a.name().toString() << '=' << a.value().toString() << ' ';
return d;
QString result(original);
// Group 1 holds the number, including the leading 'x' for the hexadecimal
// form. Only a lowercase 'x' is accepted by this pattern.
static const QRegularExpression regex("&#((x[[:xdigit:]]+)|(\\d+));");
// Matches are searched in the unmodified input; the edits go to the copy.
QRegularExpressionMatchIterator it = regex.globalMatch(original);
while (it.hasNext()) {
const QRegularExpressionMatch match = it.next();
const QString value = match.captured(1);
// NOTE(review): replace() is given the matched text, so it substitutes
// every occurrence of that text in result, not only this match.
// NOTE(review): QChar is a single UTF-16 code unit, so references above
// U+FFFF presumably do not survive this conversion - confirm.
if (value.startsWith('x'))
result.replace(match.captured(0), QChar(value.midRef(1).toInt(nullptr, 16)));
else
result.replace(match.captured(0), QChar(value.toInt(nullptr, 10)));
}
static inline ParseState nextOpeningState(ParseState current, const QXmlStreamReader &reader)
return result;
}
namespace {
// A single name/value pair taken from the attribute part of a tag.
struct Attribute {
// Attribute name as captured from the tag text.
QString name;
// Attribute value with the surrounding quotes removed.
QString value;
};
}
// Splits the attribute part of a start tag (for example ` class="maintable"`)
// into name/value pairs, in the order in which they appear.
//
// Only attributes that are preceded by whitespace, have a purely alphabetic
// name and a single- or double-quoted value are recognized; anything else in
// \a attributes is skipped. The returned values have their quotes stripped.
static QList<Attribute> toAttributes(const QStringView &attributes)
{
    // Compiled once instead of on every call: this helper is invoked
    // repeatedly while the tags of the page are being scanned.
    static const QRegularExpression att("\\s+([a-zA-Z]+)\\s*=\\s*('.*?'|\".*?\")");

    QList<Attribute> result;
    QRegularExpressionMatchIterator it = att.globalMatch(attributes.toString());
    while (it.hasNext()) {
        const QRegularExpressionMatch match = it.next();
        const QString quoted = match.captured(2); // still wrapped in its quotes
        // Drop the surrounding quotes; an empty quoted value yields a null string.
        const QString value = quoted.size() > 2 ? quoted.mid(1, quoted.size() - 2)
                                                : QString();
        result.append(Attribute{match.captured(1), value});
    }
    return result;
}
static inline ParseState nextOpeningState(ParseState current, const QStringView &tagView,
const QStringView &attributesView)
{
const auto element = reader.name();
switch (current) {
case OutSideTable:
// Trigger on main table only.
if (element == QLatin1String("table")
&& reader.attributes().value(QLatin1String("class")) == QLatin1String("maintable"))
if (tagView == QLatin1String("table")) {
const QList<Attribute> attributes = toAttributes(attributesView);
for (const Attribute &att : attributes) {
if (att.name == "class" && att.value == "maintable")
return WithinTable;
}
}
return OutSideTable;
case WithinTable:
if (element == QLatin1String("tr"))
if (tagView == QLatin1String("tr"))
return WithinTableRow;
break;
case WithinTableRow:
if (element == QLatin1String("td"))
if (tagView == QLatin1String("td"))
return WithinTableElement;
if (element == QLatin1String("th"))
if (tagView == QLatin1String("th"))
return WithinTableHeaderElement;
break;
case WithinTableElement:
if (element == QLatin1String("img"))
if (tagView == QLatin1String("img"))
return WithinTableElement;
if (element == QLatin1String("a"))
if (tagView == QLatin1String("a"))
return WithinTableElementAnchor;
break;
case WithinTableHeaderElement:
@@ -257,10 +295,12 @@ static inline ParseState nextOpeningState(ParseState current, const QXmlStreamRe
case ParseError:
break;
}
if (tagView == QString("div") || tagView == QString("span") || tagView == QString("tbody"))
return current; // silently ignore
return ParseError;
}
static inline ParseState nextClosingState(ParseState current, const Utils::StringView &element)
static inline ParseState nextClosingState(ParseState current, const QStringView &element)
{
switch (current) {
case OutSideTable:
@@ -278,6 +318,8 @@ static inline ParseState nextClosingState(ParseState current, const Utils::Strin
return WithinTableRow;
if (element == QLatin1String("img"))
return WithinTableElement;
if (element == QString("tr")) // html file may have wrong XML syntax, but browsers ignore
return WithinTable;
break;
case WithinTableHeaderElement:
if (element == QLatin1String("th"))
@@ -290,6 +332,8 @@ static inline ParseState nextClosingState(ParseState current, const Utils::Strin
case ParseError:
break;
}
if (element == QString("div") || element == QString("span") || element == QString("tbody"))
return current; // silently ignore
return ParseError;
}
@@ -307,20 +351,36 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
return rc;
}
data.remove(0, tablePos);
QXmlStreamReader reader(data);
ParseState state = OutSideTable;
int tableRow = 0;
int tableColumn = 0;
const QString hrefAttribute = QLatin1String("href");
QString link;
QString title;
QString age;
while (!reader.atEnd()) {
switch (reader.readNext()) {
case QXmlStreamReader::StartElement:
state = nextOpeningState(state, reader);
QString dataStr = QString::fromUtf8(data);
// remove comments if any
const QRegularExpression comment("<!--.*--!>", QRegularExpression::MultilineOption);
for ( ;; ) {
const QRegularExpressionMatch match = comment.match(dataStr);
if (!match.hasMatch())
break;
dataStr.remove(match.capturedStart(), match.capturedLength());
}
const QRegularExpression tag("<(/?)\\s*([a-zA-Z][a-zA-Z0-9]*)(.*?)(/?)\\s*>",
QRegularExpression::MultilineOption);
const QRegularExpression wsOnly("^\\s+$", QRegularExpression::MultilineOption);
QRegularExpressionMatchIterator it = tag.globalMatch(dataStr);
while (it.hasNext()) {
const QRegularExpressionMatch match = it.next();
bool startElement = match.captured(4).length() == 0 && match.captured(1).length() == 0;
if (startElement) {
state = nextOpeningState(state, match.capturedView(2), match.capturedView(3));
switch (state) {
case WithinTableRow:
tableColumn = 0;
@@ -330,19 +390,24 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
case WithinTableHeaderElement:
case WithinTableElement:
break;
case WithinTableElementAnchor: // 'href="/svb5K8wS"'
case WithinTableElementAnchor:
if (tableColumn == 0) {
link = reader.attributes().value(hrefAttribute).toString();
if (link.startsWith(QLatin1Char('/')))
const QList<Attribute> attributes = toAttributes(match.capturedView(3));
for (const Attribute &att : attributes) {
if (att.name == "href") {
link = att.value;
if (link.startsWith('/'))
link.remove(0, 1);
break;
}
}
}
break;
case ParseError:
return rc;
} // switch startelement state
break;
case QXmlStreamReader::EndElement:
state = nextClosingState(state, reader.name());
}
} else { // not a start element
state = nextClosingState(state, match.capturedView(2));
switch (state) {
case OutSideTable:
if (tableRow) // Seen the table, bye.
@@ -375,28 +440,33 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
break;
case ParseError:
return rc;
} // switch endelement state
break;
case QXmlStreamReader::Characters:
}
}
// check and handle pure text
if (match.capturedEnd() + 1 < dataStr.size() - 1) {
int nextStartTag = dataStr.indexOf(tag, match.capturedEnd() + 1);
if (nextStartTag != -1) {
const QString text = replaceEntities(
dataStr.mid(match.capturedEnd(), nextStartTag - match.capturedEnd()));
if (!wsOnly.match(text).hasMatch()) {
switch (state) {
case WithinTableElement:
if (tableColumn == 1)
age = reader.text().toString();
age = text;
break;
case WithinTableElementAnchor:
if (tableColumn == 0)
title = reader.text().toString();
title = text;
break;
default:
break;
} // switch characters read state
break;
default:
break;
} // switch reader state
}
if (reader.hasError())
*errorMessage = QString::fromLatin1("Error at line %1:%2").arg(reader.lineNumber()).arg(reader.errorString());
}
}
}
return rc;
}
@@ -407,6 +477,12 @@ void PasteBinDotComProtocol::listFinished()
if (debug)
qDebug() << "listFinished: error" << m_listReply->errorString();
} else {
if (m_listReply->hasRawHeader("Content-Type")) {
// if the content type changes to xhtml we should switch back to QXmlStreamReader
const QByteArray contentType = m_listReply->rawHeader("Content-Type");
if (!contentType.startsWith("text/html"))
qWarning() << "Content type has changed to" << contentType;
}
QString errorMessage;
const QStringList list = parseLists(m_listReply, &errorMessage);
if (list.isEmpty())