CPaster: Fix fetching from pastebin

Parsing HTML with QXmlStreamReader is a bad idea because the
HTML may be malformed, i.e. not well-formed XML.
Replace the XML parsing with regular expressions.

Change-Id: I230e9d2b8e13d8bd736cb3f05eb6de7f812aab5b
Reviewed-by: Christian Kandeler <christian.kandeler@qt.io>
This commit is contained in:
Christian Stenger
2020-09-16 15:15:01 +02:00
parent 707a3cfaf3
commit 5cdcb872df

View File

@@ -29,9 +29,8 @@
#include <utils/qtcassert.h> #include <utils/qtcassert.h>
#include <QDebug> #include <QDebug>
#include <QRegularExpression>
#include <QStringList> #include <QStringList>
#include <QXmlStreamReader>
#include <QXmlStreamAttributes>
#include <QByteArray> #include <QByteArray>
#include <QNetworkReply> #include <QNetworkReply>
@@ -218,38 +217,77 @@ enum ParseState
WithinTableElement, WithinTableElementAnchor, ParseError WithinTableElement, WithinTableElementAnchor, ParseError
}; };
QDebug operator<<(QDebug d, const QXmlStreamAttributes &al) static QString replaceEntities(const QString &original)
{ {
QDebug nospace = d.nospace(); QString result(original);
foreach (const QXmlStreamAttribute &a, al) static const QRegularExpression regex("&#((x[[:xdigit:]]+)|(\\d+));");
nospace << a.name().toString() << '=' << a.value().toString() << ' ';
return d; QRegularExpressionMatchIterator it = regex.globalMatch(original);
while (it.hasNext()) {
const QRegularExpressionMatch match = it.next();
const QString value = match.captured(1);
if (value.startsWith('x'))
result.replace(match.captured(0), QChar(value.midRef(1).toInt(nullptr, 16)));
else
result.replace(match.captured(0), QChar(value.toInt(nullptr, 10)));
} }
static inline ParseState nextOpeningState(ParseState current, const QXmlStreamReader &reader) return result;
}
// File-local helper type used by the regular-expression based HTML parsing.
namespace {
// One attribute of an HTML tag as a name/value pair. toAttributes() fills
// 'name' with the attribute name and 'value' with the attribute value after
// stripping the surrounding quotes.
struct Attribute {
QString name;
QString value;
};
}
// Extracts the name/value pairs from the attribute section of an HTML tag.
//
// 'attributes' is the raw text that follows the tag name inside the tag,
// e.g. ' class="maintable" href='/abc''. An attribute is only recognized if
// it is preceded by whitespace, has a purely alphabetic name and a value
// enclosed in single or double quotes; any other text is skipped.
//
// Returns the attributes in order of appearance. The surrounding quotes are
// removed from each value; an empty quoted value ('' or "") is returned as a
// null QString.
static QList<Attribute> toAttributes(const QStringView &attributes)
{
    // Compile the pattern only once: this function is called repeatedly while
    // parsing, and replaceEntities() already keeps its pattern static as well.
    static const QRegularExpression att("\\s+([a-zA-Z]+)\\s*=\\s*('.*?'|\".*?\")");

    QList<Attribute> result;
    QRegularExpressionMatchIterator it = att.globalMatch(attributes.toString());
    while (it.hasNext()) {
        const QRegularExpressionMatch match = it.next();
        QString val = match.captured(2); // including quotes
        if (val.size() > 2)
            val = val.mid(1, val.size() - 2); // strip the enclosing quotes
        else
            val = QString(); // nothing between the quotes
        result.append(Attribute{match.captured(1), val});
    }
    return result;
}
static inline ParseState nextOpeningState(ParseState current, const QStringView &tagView,
const QStringView &attributesView)
{ {
const auto element = reader.name();
switch (current) { switch (current) {
case OutSideTable: case OutSideTable:
// Trigger on main table only. // Trigger on main table only.
if (element == QLatin1String("table") if (tagView == QLatin1String("table")) {
&& reader.attributes().value(QLatin1String("class")) == QLatin1String("maintable")) const QList<Attribute> attributes = toAttributes(attributesView);
for (const Attribute &att : attributes) {
if (att.name == "class" && att.value == "maintable")
return WithinTable; return WithinTable;
}
}
return OutSideTable; return OutSideTable;
case WithinTable: case WithinTable:
if (element == QLatin1String("tr")) if (tagView == QLatin1String("tr"))
return WithinTableRow; return WithinTableRow;
break; break;
case WithinTableRow: case WithinTableRow:
if (element == QLatin1String("td")) if (tagView == QLatin1String("td"))
return WithinTableElement; return WithinTableElement;
if (element == QLatin1String("th")) if (tagView == QLatin1String("th"))
return WithinTableHeaderElement; return WithinTableHeaderElement;
break; break;
case WithinTableElement: case WithinTableElement:
if (element == QLatin1String("img")) if (tagView == QLatin1String("img"))
return WithinTableElement; return WithinTableElement;
if (element == QLatin1String("a")) if (tagView == QLatin1String("a"))
return WithinTableElementAnchor; return WithinTableElementAnchor;
break; break;
case WithinTableHeaderElement: case WithinTableHeaderElement:
@@ -257,10 +295,12 @@ static inline ParseState nextOpeningState(ParseState current, const QXmlStreamRe
case ParseError: case ParseError:
break; break;
} }
if (tagView == QString("div") || tagView == QString("span") || tagView == QString("tbody"))
return current; // silently ignore
return ParseError; return ParseError;
} }
static inline ParseState nextClosingState(ParseState current, const Utils::StringView &element) static inline ParseState nextClosingState(ParseState current, const QStringView &element)
{ {
switch (current) { switch (current) {
case OutSideTable: case OutSideTable:
@@ -278,6 +318,8 @@ static inline ParseState nextClosingState(ParseState current, const Utils::Strin
return WithinTableRow; return WithinTableRow;
if (element == QLatin1String("img")) if (element == QLatin1String("img"))
return WithinTableElement; return WithinTableElement;
if (element == QString("tr")) // html file may have wrong XML syntax, but browsers ignore
return WithinTable;
break; break;
case WithinTableHeaderElement: case WithinTableHeaderElement:
if (element == QLatin1String("th")) if (element == QLatin1String("th"))
@@ -290,6 +332,8 @@ static inline ParseState nextClosingState(ParseState current, const Utils::Strin
case ParseError: case ParseError:
break; break;
} }
if (element == QString("div") || element == QString("span") || element == QString("tbody"))
return current; // silently ignore
return ParseError; return ParseError;
} }
@@ -307,20 +351,36 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
return rc; return rc;
} }
data.remove(0, tablePos); data.remove(0, tablePos);
QXmlStreamReader reader(data);
ParseState state = OutSideTable; ParseState state = OutSideTable;
int tableRow = 0; int tableRow = 0;
int tableColumn = 0; int tableColumn = 0;
const QString hrefAttribute = QLatin1String("href");
QString link; QString link;
QString title; QString title;
QString age; QString age;
while (!reader.atEnd()) {
switch (reader.readNext()) { QString dataStr = QString::fromUtf8(data);
case QXmlStreamReader::StartElement: // remove comments if any
state = nextOpeningState(state, reader); const QRegularExpression comment("<!--.*--!>", QRegularExpression::MultilineOption);
for ( ;; ) {
const QRegularExpressionMatch match = comment.match(dataStr);
if (!match.hasMatch())
break;
dataStr.remove(match.capturedStart(), match.capturedLength());
}
const QRegularExpression tag("<(/?)\\s*([a-zA-Z][a-zA-Z0-9]*)(.*?)(/?)\\s*>",
QRegularExpression::MultilineOption);
const QRegularExpression wsOnly("^\\s+$", QRegularExpression::MultilineOption);
QRegularExpressionMatchIterator it = tag.globalMatch(dataStr);
while (it.hasNext()) {
const QRegularExpressionMatch match = it.next();
bool startElement = match.captured(4).length() == 0 && match.captured(1).length() == 0;
if (startElement) {
state = nextOpeningState(state, match.capturedView(2), match.capturedView(3));
switch (state) { switch (state) {
case WithinTableRow: case WithinTableRow:
tableColumn = 0; tableColumn = 0;
@@ -330,19 +390,24 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
case WithinTableHeaderElement: case WithinTableHeaderElement:
case WithinTableElement: case WithinTableElement:
break; break;
case WithinTableElementAnchor: // 'href="/svb5K8wS"' case WithinTableElementAnchor:
if (tableColumn == 0) { if (tableColumn == 0) {
link = reader.attributes().value(hrefAttribute).toString(); const QList<Attribute> attributes = toAttributes(match.capturedView(3));
if (link.startsWith(QLatin1Char('/'))) for (const Attribute &att : attributes) {
if (att.name == "href") {
link = att.value;
if (link.startsWith('/'))
link.remove(0, 1); link.remove(0, 1);
break;
}
}
} }
break; break;
case ParseError: case ParseError:
return rc; return rc;
} // switch startelement state }
break; } else { // not a start element
case QXmlStreamReader::EndElement: state = nextClosingState(state, match.capturedView(2));
state = nextClosingState(state, reader.name());
switch (state) { switch (state) {
case OutSideTable: case OutSideTable:
if (tableRow) // Seen the table, bye. if (tableRow) // Seen the table, bye.
@@ -375,28 +440,33 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
break; break;
case ParseError: case ParseError:
return rc; return rc;
} // switch endelement state }
break; }
case QXmlStreamReader::Characters: // check and handle pure text
if (match.capturedEnd() + 1 < dataStr.size() - 1) {
int nextStartTag = dataStr.indexOf(tag, match.capturedEnd() + 1);
if (nextStartTag != -1) {
const QString text = replaceEntities(
dataStr.mid(match.capturedEnd(), nextStartTag - match.capturedEnd()));
if (!wsOnly.match(text).hasMatch()) {
switch (state) { switch (state) {
case WithinTableElement: case WithinTableElement:
if (tableColumn == 1) if (tableColumn == 1)
age = reader.text().toString(); age = text;
break; break;
case WithinTableElementAnchor: case WithinTableElementAnchor:
if (tableColumn == 0) if (tableColumn == 0)
title = reader.text().toString(); title = text;
break; break;
default: default:
break; break;
} // switch characters read state } // switch characters read state
break;
default:
break;
} // switch reader state
} }
if (reader.hasError()) }
*errorMessage = QString::fromLatin1("Error at line %1:%2").arg(reader.lineNumber()).arg(reader.errorString()); }
}
return rc; return rc;
} }
@@ -407,6 +477,12 @@ void PasteBinDotComProtocol::listFinished()
if (debug) if (debug)
qDebug() << "listFinished: error" << m_listReply->errorString(); qDebug() << "listFinished: error" << m_listReply->errorString();
} else { } else {
if (m_listReply->hasRawHeader("Content-Type")) {
// if the content type changes to xhtml we should switch back to QXmlStreamReader
const QByteArray contentType = m_listReply->rawHeader("Content-Type");
if (!contentType.startsWith("text/html"))
qWarning() << "Content type has changed to" << contentType;
}
QString errorMessage; QString errorMessage;
const QStringList list = parseLists(m_listReply, &errorMessage); const QStringList list = parseLists(m_listReply, &errorMessage);
if (list.isEmpty()) if (list.isEmpty())