forked from qt-creator/qt-creator
CPaster: Fix fetching from pastebin
Parsing HTML with QXmlStreamReader is a bad idea because the HTML may be malformed (that is, not well-formed XML). Replace the XML parsing with regular expressions. Change-Id: I230e9d2b8e13d8bd736cb3f05eb6de7f812aab5b Reviewed-by: Christian Kandeler <christian.kandeler@qt.io>
This commit is contained in:
@@ -29,9 +29,8 @@
|
|||||||
#include <utils/qtcassert.h>
|
#include <utils/qtcassert.h>
|
||||||
|
|
||||||
#include <QDebug>
|
#include <QDebug>
|
||||||
|
#include <QRegularExpression>
|
||||||
#include <QStringList>
|
#include <QStringList>
|
||||||
#include <QXmlStreamReader>
|
|
||||||
#include <QXmlStreamAttributes>
|
|
||||||
#include <QByteArray>
|
#include <QByteArray>
|
||||||
|
|
||||||
#include <QNetworkReply>
|
#include <QNetworkReply>
|
||||||
@@ -218,38 +217,77 @@ enum ParseState
|
|||||||
WithinTableElement, WithinTableElementAnchor, ParseError
|
WithinTableElement, WithinTableElementAnchor, ParseError
|
||||||
};
|
};
|
||||||
|
|
||||||
QDebug operator<<(QDebug d, const QXmlStreamAttributes &al)
|
static QString replaceEntities(const QString &original)
|
||||||
{
|
{
|
||||||
QDebug nospace = d.nospace();
|
QString result(original);
|
||||||
foreach (const QXmlStreamAttribute &a, al)
|
static const QRegularExpression regex("&#((x[[:xdigit:]]+)|(\\d+));");
|
||||||
nospace << a.name().toString() << '=' << a.value().toString() << ' ';
|
|
||||||
return d;
|
QRegularExpressionMatchIterator it = regex.globalMatch(original);
|
||||||
|
while (it.hasNext()) {
|
||||||
|
const QRegularExpressionMatch match = it.next();
|
||||||
|
const QString value = match.captured(1);
|
||||||
|
if (value.startsWith('x'))
|
||||||
|
result.replace(match.captured(0), QChar(value.midRef(1).toInt(nullptr, 16)));
|
||||||
|
else
|
||||||
|
result.replace(match.captured(0), QChar(value.toInt(nullptr, 10)));
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline ParseState nextOpeningState(ParseState current, const QXmlStreamReader &reader)
|
namespace {
|
||||||
|
struct Attribute {
|
||||||
|
QString name;
|
||||||
|
QString value;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
static QList<Attribute> toAttributes(const QStringView &attributes)
|
||||||
|
{
|
||||||
|
QList<Attribute> result;
|
||||||
|
const QRegularExpression att("\\s+([a-zA-Z]+)\\s*=\\s*('.*?'|\".*?\")");
|
||||||
|
QRegularExpressionMatchIterator it = att.globalMatch(attributes.toString());
|
||||||
|
while (it.hasNext()) {
|
||||||
|
const QRegularExpressionMatch match = it.next();
|
||||||
|
QString val = match.captured(2); // including quotes
|
||||||
|
if (val.size() > 2)
|
||||||
|
val = val.mid(1, val.size() - 2);
|
||||||
|
else
|
||||||
|
val= QString();
|
||||||
|
|
||||||
|
result.append(Attribute{match.captured(1), val});
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline ParseState nextOpeningState(ParseState current, const QStringView &tagView,
|
||||||
|
const QStringView &attributesView)
|
||||||
{
|
{
|
||||||
const auto element = reader.name();
|
|
||||||
switch (current) {
|
switch (current) {
|
||||||
case OutSideTable:
|
case OutSideTable:
|
||||||
// Trigger on main table only.
|
// Trigger on main table only.
|
||||||
if (element == QLatin1String("table")
|
if (tagView == QLatin1String("table")) {
|
||||||
&& reader.attributes().value(QLatin1String("class")) == QLatin1String("maintable"))
|
const QList<Attribute> attributes = toAttributes(attributesView);
|
||||||
return WithinTable;
|
for (const Attribute &att : attributes) {
|
||||||
|
if (att.name == "class" && att.value == "maintable")
|
||||||
|
return WithinTable;
|
||||||
|
}
|
||||||
|
}
|
||||||
return OutSideTable;
|
return OutSideTable;
|
||||||
case WithinTable:
|
case WithinTable:
|
||||||
if (element == QLatin1String("tr"))
|
if (tagView == QLatin1String("tr"))
|
||||||
return WithinTableRow;
|
return WithinTableRow;
|
||||||
break;
|
break;
|
||||||
case WithinTableRow:
|
case WithinTableRow:
|
||||||
if (element == QLatin1String("td"))
|
if (tagView == QLatin1String("td"))
|
||||||
return WithinTableElement;
|
return WithinTableElement;
|
||||||
if (element == QLatin1String("th"))
|
if (tagView == QLatin1String("th"))
|
||||||
return WithinTableHeaderElement;
|
return WithinTableHeaderElement;
|
||||||
break;
|
break;
|
||||||
case WithinTableElement:
|
case WithinTableElement:
|
||||||
if (element == QLatin1String("img"))
|
if (tagView == QLatin1String("img"))
|
||||||
return WithinTableElement;
|
return WithinTableElement;
|
||||||
if (element == QLatin1String("a"))
|
if (tagView == QLatin1String("a"))
|
||||||
return WithinTableElementAnchor;
|
return WithinTableElementAnchor;
|
||||||
break;
|
break;
|
||||||
case WithinTableHeaderElement:
|
case WithinTableHeaderElement:
|
||||||
@@ -257,10 +295,12 @@ static inline ParseState nextOpeningState(ParseState current, const QXmlStreamRe
|
|||||||
case ParseError:
|
case ParseError:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (tagView == QString("div") || tagView == QString("span") || tagView == QString("tbody"))
|
||||||
|
return current; // silently ignore
|
||||||
return ParseError;
|
return ParseError;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline ParseState nextClosingState(ParseState current, const Utils::StringView &element)
|
static inline ParseState nextClosingState(ParseState current, const QStringView &element)
|
||||||
{
|
{
|
||||||
switch (current) {
|
switch (current) {
|
||||||
case OutSideTable:
|
case OutSideTable:
|
||||||
@@ -278,6 +318,8 @@ static inline ParseState nextClosingState(ParseState current, const Utils::Strin
|
|||||||
return WithinTableRow;
|
return WithinTableRow;
|
||||||
if (element == QLatin1String("img"))
|
if (element == QLatin1String("img"))
|
||||||
return WithinTableElement;
|
return WithinTableElement;
|
||||||
|
if (element == QString("tr")) // html file may have wrong XML syntax, but browsers ignore
|
||||||
|
return WithinTable;
|
||||||
break;
|
break;
|
||||||
case WithinTableHeaderElement:
|
case WithinTableHeaderElement:
|
||||||
if (element == QLatin1String("th"))
|
if (element == QLatin1String("th"))
|
||||||
@@ -287,9 +329,11 @@ static inline ParseState nextClosingState(ParseState current, const Utils::Strin
|
|||||||
if (element == QLatin1String("a"))
|
if (element == QLatin1String("a"))
|
||||||
return WithinTableElement;
|
return WithinTableElement;
|
||||||
break;
|
break;
|
||||||
case ParseError:
|
case ParseError:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (element == QString("div") || element == QString("span") || element == QString("tbody"))
|
||||||
|
return current; // silently ignore
|
||||||
return ParseError;
|
return ParseError;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -307,20 +351,36 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
data.remove(0, tablePos);
|
data.remove(0, tablePos);
|
||||||
QXmlStreamReader reader(data);
|
|
||||||
ParseState state = OutSideTable;
|
ParseState state = OutSideTable;
|
||||||
int tableRow = 0;
|
int tableRow = 0;
|
||||||
int tableColumn = 0;
|
int tableColumn = 0;
|
||||||
|
|
||||||
const QString hrefAttribute = QLatin1String("href");
|
|
||||||
QString link;
|
QString link;
|
||||||
QString title;
|
QString title;
|
||||||
QString age;
|
QString age;
|
||||||
|
|
||||||
while (!reader.atEnd()) {
|
|
||||||
switch (reader.readNext()) {
|
QString dataStr = QString::fromUtf8(data);
|
||||||
case QXmlStreamReader::StartElement:
|
// remove comments if any
|
||||||
state = nextOpeningState(state, reader);
|
const QRegularExpression comment("<!--.*--!>", QRegularExpression::MultilineOption);
|
||||||
|
for ( ;; ) {
|
||||||
|
const QRegularExpressionMatch match = comment.match(dataStr);
|
||||||
|
if (!match.hasMatch())
|
||||||
|
break;
|
||||||
|
dataStr.remove(match.capturedStart(), match.capturedLength());
|
||||||
|
}
|
||||||
|
|
||||||
|
const QRegularExpression tag("<(/?)\\s*([a-zA-Z][a-zA-Z0-9]*)(.*?)(/?)\\s*>",
|
||||||
|
QRegularExpression::MultilineOption);
|
||||||
|
const QRegularExpression wsOnly("^\\s+$", QRegularExpression::MultilineOption);
|
||||||
|
QRegularExpressionMatchIterator it = tag.globalMatch(dataStr);
|
||||||
|
while (it.hasNext()) {
|
||||||
|
const QRegularExpressionMatch match = it.next();
|
||||||
|
|
||||||
|
bool startElement = match.captured(4).length() == 0 && match.captured(1).length() == 0;
|
||||||
|
if (startElement) {
|
||||||
|
state = nextOpeningState(state, match.capturedView(2), match.capturedView(3));
|
||||||
switch (state) {
|
switch (state) {
|
||||||
case WithinTableRow:
|
case WithinTableRow:
|
||||||
tableColumn = 0;
|
tableColumn = 0;
|
||||||
@@ -330,19 +390,24 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
|
|||||||
case WithinTableHeaderElement:
|
case WithinTableHeaderElement:
|
||||||
case WithinTableElement:
|
case WithinTableElement:
|
||||||
break;
|
break;
|
||||||
case WithinTableElementAnchor: // 'href="/svb5K8wS"'
|
case WithinTableElementAnchor:
|
||||||
if (tableColumn == 0) {
|
if (tableColumn == 0) {
|
||||||
link = reader.attributes().value(hrefAttribute).toString();
|
const QList<Attribute> attributes = toAttributes(match.capturedView(3));
|
||||||
if (link.startsWith(QLatin1Char('/')))
|
for (const Attribute &att : attributes) {
|
||||||
link.remove(0, 1);
|
if (att.name == "href") {
|
||||||
|
link = att.value;
|
||||||
|
if (link.startsWith('/'))
|
||||||
|
link.remove(0, 1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case ParseError:
|
case ParseError:
|
||||||
return rc;
|
return rc;
|
||||||
} // switch startelement state
|
}
|
||||||
break;
|
} else { // not a start element
|
||||||
case QXmlStreamReader::EndElement:
|
state = nextClosingState(state, match.capturedView(2));
|
||||||
state = nextClosingState(state, reader.name());
|
|
||||||
switch (state) {
|
switch (state) {
|
||||||
case OutSideTable:
|
case OutSideTable:
|
||||||
if (tableRow) // Seen the table, bye.
|
if (tableRow) // Seen the table, bye.
|
||||||
@@ -375,28 +440,33 @@ static inline QStringList parseLists(QIODevice *io, QString *errorMessage)
|
|||||||
break;
|
break;
|
||||||
case ParseError:
|
case ParseError:
|
||||||
return rc;
|
return rc;
|
||||||
} // switch endelement state
|
}
|
||||||
break;
|
}
|
||||||
case QXmlStreamReader::Characters:
|
// check and handle pure text
|
||||||
switch (state) {
|
if (match.capturedEnd() + 1 < dataStr.size() - 1) {
|
||||||
case WithinTableElement:
|
int nextStartTag = dataStr.indexOf(tag, match.capturedEnd() + 1);
|
||||||
if (tableColumn == 1)
|
if (nextStartTag != -1) {
|
||||||
age = reader.text().toString();
|
const QString text = replaceEntities(
|
||||||
break;
|
dataStr.mid(match.capturedEnd(), nextStartTag - match.capturedEnd()));
|
||||||
case WithinTableElementAnchor:
|
if (!wsOnly.match(text).hasMatch()) {
|
||||||
if (tableColumn == 0)
|
switch (state) {
|
||||||
title = reader.text().toString();
|
case WithinTableElement:
|
||||||
break;
|
if (tableColumn == 1)
|
||||||
default:
|
age = text;
|
||||||
break;
|
break;
|
||||||
} // switch characters read state
|
case WithinTableElementAnchor:
|
||||||
break;
|
if (tableColumn == 0)
|
||||||
default:
|
title = text;
|
||||||
break;
|
break;
|
||||||
} // switch reader state
|
default:
|
||||||
|
break;
|
||||||
|
} // switch characters read state
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (reader.hasError())
|
|
||||||
*errorMessage = QString::fromLatin1("Error at line %1:%2").arg(reader.lineNumber()).arg(reader.errorString());
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -407,6 +477,12 @@ void PasteBinDotComProtocol::listFinished()
|
|||||||
if (debug)
|
if (debug)
|
||||||
qDebug() << "listFinished: error" << m_listReply->errorString();
|
qDebug() << "listFinished: error" << m_listReply->errorString();
|
||||||
} else {
|
} else {
|
||||||
|
if (m_listReply->hasRawHeader("Content-Type")) {
|
||||||
|
// if the content type changes to xhtml we should switch back to QXmlStreamReader
|
||||||
|
const QByteArray contentType = m_listReply->rawHeader("Content-Type");
|
||||||
|
if (!contentType.startsWith("text/html"))
|
||||||
|
qWarning() << "Content type has changed to" << contentType;
|
||||||
|
}
|
||||||
QString errorMessage;
|
QString errorMessage;
|
||||||
const QStringList list = parseLists(m_listReply, &errorMessage);
|
const QStringList list = parseLists(m_listReply, &errorMessage);
|
||||||
if (list.isEmpty())
|
if (list.isEmpty())
|
||||||
|
|||||||
Reference in New Issue
Block a user