Commit 627b0340 authored by Valério Valério's avatar Valério Valério

[qmf] Use QTextDocument to parse html.

Regular expressions are not an appropriate tool for parsing a non-regular language
like HTML; a proper parser should be used.
This commit introduces a dependency on QtGui, making the messageserver
binary marginally bigger in size. Usage of the HTML parser is optional and
can be enabled via the USE_HTML_PARSER compile flag.
parent d40bb905
......@@ -62,6 +62,9 @@
#include <qtextcodec.h>
#include <QTextCodec>
#include <QtDebug>
#ifdef USE_HTML_PARSER
#include <QTextDocument>
#endif
#include <stdlib.h>
#include <limits.h>
......@@ -8601,51 +8604,66 @@ void QMailMessage::refreshPreview()
{
// Recomputes the message preview from the plain-text part or, failing that,
// from the HTML part, stores at most maxPreviewLength characters of it via
// metaDataImpl()->setPreview(), and finally clears the preview-dirty flag.
// NOTE(review): this listing appears to come from a rendered diff and seems to
// contain both the removed (`plaintext` / `markup`) and the added (`plainText`)
// variants of several statements - confirm against the real file before use.
// Cut-off applied with QString::left() to whatever text ends up as the preview.
const int maxPreviewLength = 280;
// TODO: don't load entire body into memory
// TODO: parse html correctly, e.g. closing brackets in quotes in tags
QMailMessagePartContainer *htmlPart= findHtmlContainer();
QMailMessagePartContainer *plainTextPart= findPlainTextContainer();
// Text the preview is built from; filled in by one of the branches below.
QString plainText;
if (multipartType() == MultipartRelated && htmlPart) // force taking the html in this case
plainTextPart=0;
if ( plainTextPart && plainTextPart->hasBody()) {
QString plaintext(plainTextPart->body().data());
// Drop bracketed placeholders of the form "[image:...]" / "[cid:...]" (case-insensitive).
plaintext.remove(QRegExp("\\[(image|cid):[^\\]]*\\]", Qt::CaseInsensitive));
metaDataImpl()->setPreview(plaintext.left(maxPreviewLength));
plainText = plainTextPart->body().data();
// These are not valid html, so remove them before
plainText.remove(QRegExp("\\[(image|cid):[^\\]]*\\]", Qt::CaseInsensitive));
} else if (htmlPart && ( multipartType() == MultipartRelated || htmlPart->hasBody())) {
QString markup = htmlPart->body().data();
markup.remove(QRegExp("<\\s*(style|head|form|script)[^<]*<\\s*/\\s*\\1\\s*>", Qt::CaseInsensitive));
markup.remove(QRegExp("<(.)[^>]*>"));
markup.replace("&quot;", "\"", Qt::CaseInsensitive);
markup.replace("&nbsp;", " ", Qt::CaseInsensitive);
markup.replace("&amp;", "&", Qt::CaseInsensitive);
markup.replace("&lt;", "<", Qt::CaseInsensitive);
markup.replace("&gt;", ">", Qt::CaseInsensitive);
plainText = htmlPart->body().data();
// Regular-expression based clean-up, compiled only when USE_HTML_PARSER is not defined.
// NOTE(review): no #endif matching this #ifndef is visible in this listing - confirm.
#ifndef USE_HTML_PARSER
// Remove whole style/head/form/script elements; the pattern only matches when
// the element's content contains no '<'.
plainText.remove(QRegExp("<\\s*(style|head|form|script)[^<]*<\\s*/\\s*\\1\\s*>", Qt::CaseInsensitive));
// Remove every remaining "<...>" tag.
plainText.remove(QRegExp("<(.)[^>]*>"));
// Decode the handful of named entities handled here.
plainText.replace("&quot;", "\"", Qt::CaseInsensitive);
plainText.replace("&nbsp;", " ", Qt::CaseInsensitive);
plainText.replace("&amp;", "&", Qt::CaseInsensitive);
plainText.replace("&lt;", "<", Qt::CaseInsensitive);
plainText.replace("&gt;", ">", Qt::CaseInsensitive);
// now replace stuff like "&#1084;"
for (int pos = 0; ; ) {
pos = markup.indexOf("&#", pos);
pos = plainText.indexOf("&#", pos);
// No further "&#" in the text: done.
if (pos < 0)
break;
int semicolon = markup.indexOf(';', pos+2);
int semicolon = plainText.indexOf(';', pos+2);
// "&#" without a terminating ';': step past it and keep searching.
if (semicolon < 0) {
++pos;
continue;
}
// Digits between "&#" and ';', parsed with toInt() (decimal by default).
int code = (markup.mid(pos+2, semicolon-pos-2)).toInt();
int code = (plainText.mid(pos+2, semicolon-pos-2)).toInt();
// A value of 0 is not treated as a usable reference; leave the text as is.
if (code == 0) {
++pos;
continue;
}
// Substitute the whole "&#...;" sequence with the single character QChar(code).
markup.replace(pos, semicolon-pos+1, QChar(code));
plainText.replace(pos, semicolon-pos+1, QChar(code));
}
metaDataImpl()->setPreview(markup.simplified().left(maxPreviewLength));
}
// With USE_HTML_PARSER the collected text is run through htmlToPlainText()
// before truncation; otherwise it is truncated as it stands.
#ifdef USE_HTML_PARSER
metaDataImpl()->setPreview(htmlToPlainText(plainText).left(maxPreviewLength));
#else
metaDataImpl()->setPreview(plainText.left(maxPreviewLength));
#endif
partContainerImpl()->setPreviewDirty(false);
}
#ifdef USE_HTML_PARSER
/*!
    \internal
    Returns the plain-text form of \a html, obtained by loading it into a
    QTextDocument and reading it back with toPlainText(), twice in a row.
*/
QString QMailMessage::htmlToPlainText(const QString &html)
{
    QTextDocument converter;
    converter.setHtml(html);

    // The output of the first pass is parsed a second time to prevent html
    // injection via pre-hidden tags (e.g: &lt; img src="cenas.png" &gt;).
    const QString firstPass = converter.toPlainText();
    converter.setHtml(firstPass);

    return converter.toPlainText();
}
#endif
/*! \internal */
QMailMessage QMailMessage::fromRfc2822(LongString& ls)
{
......
......@@ -814,6 +814,9 @@ private:
static QMailMessage fromRfc2822(LongString& ls);
void refreshPreview();
#ifdef USE_HTML_PARSER
static QString htmlToPlainText(const QString &html);
#endif
public:
virtual QString preview() const;
......
......@@ -17,6 +17,11 @@ win32: {
}
QT = core sql network
contains(DEFINES, USE_HTML_PARSER) {
QT += gui
}
DEPENDPATH += .
INCLUDEPATH += support
......
......@@ -45,13 +45,20 @@
#include <qmaillog.h>
#include <qloggers.h>
#include <signal.h>
#ifdef USE_HTML_PARSER
#include <QtGui>
#endif
#if !defined(NO_SHUTDOWN_SIGNAL_HANDLING) && defined(Q_OS_UNIX)
// Logs the received signal number \a n and requests application exit.
// Presumably installed as a signal handler; the registration is not visible
// in this part of the file.
static void shutdown(int n)
{
    qMailLog(Messaging) << "Received signal" << n << ", shutting down.";

    // Go through the same application class that main() instantiates for the
    // current build configuration.
#ifndef USE_HTML_PARSER
    QCoreApplication::exit();
#else
    QGuiApplication::exit();
#endif
}
#endif
......@@ -66,7 +73,12 @@ static void recreateLoggers(int n)
Q_DECL_EXPORT int main(int argc, char** argv)
{
#ifdef USE_HTML_PARSER
// Needed for HTML parsing by QTextDocument in qmailmessage.cpp
QGuiApplication app(argc, argv);
#else
QCoreApplication app(argc, argv);
#endif
// This is ~/.config/QtProject/Messageserver.conf
qMailLoggersRecreate("QtProject", "Messageserver", "Msgsrv");
......
......@@ -26,6 +26,10 @@ equals(QT_MAJOR_VERSION, 5){
CONFIG += qmfmessageserver qmfclient
QT = core
contains(DEFINES, USE_HTML_PARSER) {
QT += gui
}
!contains(DEFINES,QMF_NO_MESSAGE_SERVICE_EDITOR) {
QT += gui
equals(QT_MAJOR_VERSION, 5): QT += widgets
......
......@@ -134,6 +134,7 @@ This package contains the documentation for Qt Messaging Framework (QMF).
DEFINES+=MESSAGESERVER_PLUGINS \
DEFINES+=QMF_NO_MESSAGE_SERVICE_EDITOR \
DEFINES+=USE_KEEPALIVE \
DEFINES+=USE_HTML_PARSER \
CONFIG+=syslog
make %{?_smp_mflags}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment