akregator/src/librss

tools_p.cpp

00001 /*
00002  * tools_p.cpp
00003  *
00004  * Copyright (c) 2001, 2002, 2003 Frerich Raabe <raabe@kde.org>
00005  *
00006  * This program is distributed in the hope that it will be useful, but WITHOUT
00007  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00008  * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the
00009  * accompanying file 'COPYING'.
00010  */
00011 #include "tools_p.h"
00012 
00013 #include <krfcdate.h>
00014 #include <qdom.h>
00015 #include <kcharsets.h>
00016 #include <qregexp.h>
00017 
00018 namespace RSS {
00019 
00020 time_t parseISO8601Date(const QString &s)
00021 {
00022     // do some sanity check: 26-12-2004T00:00+00:00 is parsed to epoch+1 in the KRFCDate, which is wrong. So let's check if the date begins with YYYY -fo
00023     if (s.stripWhiteSpace().left(4).toInt() < 1000)
00024         return 0; // error
00025 
00026     // FIXME: imho this is done in KRFCDate::parseDateISO8601() automatically, so we could omit it? -fo
00027     if (s.find('T') != -1)
00028         return KRFCDate::parseDateISO8601(s);
00029     else
00030         return KRFCDate::parseDateISO8601(s + "T12:00:00");
00031 }
00032 
00033 QString childNodesAsXML(const QDomNode& parent)
00034 {
00035     QDomNodeList list = parent.childNodes();
00036     QString str;
00037     QTextStream ts( &str, IO_WriteOnly );
00038     for (uint i = 0; i < list.count(); ++i)
00039         ts << list.item(i);
00040     return str.stripWhiteSpace();
00041 }
00042 
00043 static QString plainTextToHtml(const QString& plainText)
00044 {
00045     QString str(plainText);
00046     str.replace("&", "&amp;");
00047     str.replace("\"", "&quot;");
00048     str.replace("<", "&lt;");
00049     //str.replace(">", "&gt;");
00050     str.replace("\n", "<br/>");
00051     return str;
00052 }
00053 
/**
 * Content categories distinguished when extracting Atom content
 * (see mapTypeToFormat() and extractAtomContent()).
 */
enum ContentFormat
{
    Text,   // plain text; converted to HTML by plainTextToHtml()
    HTML,   // escaped HTML; entities get resolved
    XML,    // inline XML; child nodes are serialized as-is
    Binary  // anything else; no textual content is extracted
};
00055         
00056 static ContentFormat mapTypeToFormat(const QString& modep, const QString& typep,  const QString& src)
00057 {
00058     QString mode = modep.isNull() ? "escaped" : modep;
00059     QString type = typep;
00060     
00061     //"If neither the type attribute nor the src attribute is provided,
00062     //Atom Processors MUST behave as though the type attribute were
00063     //present with a value of "text""
00064     if (type.isNull() && src.isEmpty())
00065         type = QString::fromUtf8("text");
00066 
00067     if (type == QString::fromUtf8("html")
00068         || type == QString::fromUtf8("text/html"))
00069         return HTML;
00070     
00071     if (type == QString::fromUtf8("text")
00072         || (type.startsWith(QString::fromUtf8("text/"), false)
00073         && !type.startsWith(QString::fromUtf8("text/xml"), false))
00074        )
00075         return Text;
00076     
00077     QStringList xmltypes;
00078     xmltypes.append(QString::fromUtf8("xhtml"));
00079     // XML media types as defined in RFC3023:
00080     xmltypes.append(QString::fromUtf8("text/xml"));
00081     xmltypes.append(QString::fromUtf8("application/xml"));
00082     xmltypes.append(QString::fromUtf8("text/xml-external-parsed-entity"));
00083     xmltypes.append(QString::fromUtf8("application/xml-external-parsed-entity"));
00084     xmltypes.append(QString::fromUtf8("application/xml-dtd"));
00085     
00086     
00087     if (xmltypes.contains(type)
00088         || type.endsWith(QString::fromUtf8("+xml"), false)
00089         || type.endsWith(QString::fromUtf8("/xml"), false))
00090         return XML;
00091     
00092     return Binary;
00093 }
00094 
00095 static QString extractAtomContent(const QDomElement& e)
00096 {
00097     ContentFormat format = mapTypeToFormat(e.attribute("mode"),
00098                                            e.attribute("type"),
00099                                            e.attribute("src"));
00100     
00101     switch (format)
00102     {
00103         case HTML:
00104             return KCharsets::resolveEntities(e.text().simplifyWhiteSpace());
00105         case Text:
00106             return plainTextToHtml(e.text().stripWhiteSpace());
00107         case XML:
00108             return childNodesAsXML(e).simplifyWhiteSpace();
00109         case Binary:
00110         default:
00111             return QString();
00112     }
00113     
00114     return QString();
00115 }
00116 
00117 QString extractNode(const QDomNode &parent, const QString &elemName, bool isInlined)
00118 {
00119     QDomNode node = parent.namedItem(elemName);
00120     if (node.isNull())
00121         return QString::null;
00122 
00123     QDomElement e = node.toElement();
00124         QString result = e.text().stripWhiteSpace(); // let's assume plain text
00125  
00126         if (elemName == "content") // we have Atom here
00127         {
00128             result = extractAtomContent(e);
00129         }        
00130         else // check for HTML; not necessary for Atom:content
00131         {
00132             bool hasPre = result.contains("<pre>",false);
00133             bool hasHtml = hasPre || result.contains("<");  // FIXME: test if we have html, should be more clever -> regexp
00134             if(!isInlined && !hasHtml)                      // perform nl2br if not a inline elt and it has no html elts
00135                     result = result = result.replace(QChar('\n'), "<br />");
00136             if(!hasPre)                                     // strip white spaces if no <pre>
00137                     result = result.simplifyWhiteSpace();
00138         }
00139         
00140         return result.isEmpty() ? QString::null : result;
00141 }
00142 
00143 QString extractTitle(const QDomNode & parent)
00144 {
00145     QDomNode node = parent.namedItem(QString::fromLatin1("title"));
00146     if (node.isNull())
00147         return QString::null;
00148 
00149     QString result = node.toElement().text();
00150 
00151     result = KCharsets::resolveEntities(KCharsets::resolveEntities(result).replace(QRegExp("<[^>]*>"), "").remove("\\"));
00152     result = result.simplifyWhiteSpace();
00153 
00154     if (result.isEmpty())
00155         return QString::null;
00156 
00157     return result;
00158 }
00159 
00160 static void authorFromString(const QString& strp, QString& name, QString& email)
00161 {
00162     QString str = strp.stripWhiteSpace();
00163     if (str.isEmpty())
00164         return;
00165     
00166     // look for something looking like a mail address ( "foo@bar.com", 
00167     // "<foo@bar.com>") and extract it
00168     
00169     QRegExp remail("<?([^@\\s<]+@[^>\\s]+)>?"); // FIXME: user "proper" regexp,
00170        // search kmail source for it
00171     
00172     int pos = remail.search(str);
00173     if (pos != -1)
00174     {
00175         QString all = remail.cap(0);
00176         email = remail.cap(1);
00177         str.replace(all, ""); // remove mail address
00178     }
00179     
00180     // simplify the rest and use it as name
00181     
00182     name = str.simplifyWhiteSpace();
00183     
00184     // str might have the format "foo@bar.com (Foo M. Bar)".
00185     // We cut off parentheses if there are any
00186     QRegExp rename("\\(([^\\)]*)\\)");
00187     
00188     pos = rename.search(name);
00189     
00190     if (pos != -1)
00191     {
00192         name = rename.cap(1);
00193     }
00194     
00195     name = name.isEmpty() ? QString() : name;
00196     email = email.isEmpty() ? QString() : email;
00197 }
00198 
00199 QString parseItemAuthor(const QDomElement& element, Format format, Version version)
00200 {
00201     QString name;
00202     QString email;
00203 
00204     QDomElement dcCreator = element.namedItem("dc:creator").toElement();
00205     
00206     if (!dcCreator.isNull())
00207          authorFromString(dcCreator.text(), name, email);
00208     else if (format == AtomFeed)
00209     {
00210         QDomElement atomAuthor = element.namedItem("author").toElement();
00211         if (atomAuthor.isNull())
00212             atomAuthor = element.namedItem("atom:author").toElement();
00213         if (!atomAuthor.isNull())
00214         {
00215             QDomElement atomName = atomAuthor.namedItem("name").toElement();
00216             if (atomName.isNull())
00217                 atomName = atomAuthor.namedItem("atom:name").toElement();
00218             name = atomName.text().stripWhiteSpace();
00219             
00220             QDomElement atomEmail = atomAuthor.namedItem("email").toElement();
00221             if (atomEmail.isNull())
00222                 atomEmail = atomAuthor.namedItem("atom:email").toElement();
00223             email = atomEmail.text().stripWhiteSpace();
00224         }
00225     }
00226     else if (format == RSSFeed)
00227     {
00228         authorFromString(element.namedItem("author").toElement().text(), name, email);
00229     }
00230     
00231     if (name.isNull())
00232         name = email;
00233     
00234     if (!email.isNull())
00235         return QString("<a href=\"mailto:%1\">%2</a>").arg(email).arg(name);
00236     else
00237         return name;
00238 }
00239 
00240 } // namespace RSS
00241 
00242 // vim:noet:ts=4
KDE Home | KDE Accessibility Home | Description of Access Keys