akregator/src/librss

document.cpp

00001 /*
00002  * document.cpp
00003  *
00004  * Copyright (c) 2001, 2002, 2003 Frerich Raabe <raabe@kde.org>
00005  *
00006  * This program is distributed in the hope that it will be useful, but WITHOUT
00007  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
00008  * FOR A PARTICULAR PURPOSE. For licensing and distribution details, check the
00009  * accompanying file 'COPYING'.
00010  *
00011  */
00012 #include "document.h"
00013 #include "article.h"
00014 #include "image.h"
00015 #include "textinput.h"
00016 #include "tools_p.h"
00017 
00018 #include <krfcdate.h>
00019 #include <kurl.h>
00020 
00021 #include <qdatetime.h>
00022 #include <qdom.h>
00023 #include <qptrlist.h>
00024 
00025 #include <kdebug.h>
00026 
00027 using namespace RSS;
00028 
00029 struct Document::Private : public Shared
00030 {
00031     Private() : version(v0_90), image(NULL), textInput(NULL), language(en)
00032     {
00033         format=UnknownFormat;
00034         valid=false;
00035         ttl=-1;
00036     }
00037 
00038     ~Private()
00039     {
00040         delete textInput;
00041         delete image;
00042     }
00043 
00044     Version version;
00045     QString title;
00046     QString description;
00047     KURL link;
00048     Image *image;
00049     TextInput *textInput;
00050     Article::List articles;
00051     Language language;
00052     Format format;
00053     QString copyright;
00054     QDateTime pubDate;
00055     QDateTime lastBuildDate;
00056     QString rating;
00057     KURL docs;
00058     int ttl;
00059     QString managingEditor;
00060     QString webMaster;
00061     HourList skipHours;
00062     DayList skipDays;
00063     bool valid;
00064 };
00065 
00066 Document::Document() : d(new Private)
00067 {
00068 }
00069 
00070 Document::Document(const Document &other) : d(0)
00071 {
00072     *this = other;
00073 }
00074 
00075 static QString extractLink(const QDomNode& node, Format format)
00076 {
00077     if (format == AtomFeed)
00078     {
00079         QDomNode n;
00080         for (n = node.firstChild(); !n.isNull(); n = n.nextSibling()) {
00081             const QDomElement e = n.toElement();
00082             if ( (e.tagName() == QString::fromLatin1("link")) 
00083                   && (e.attribute(QString::fromLatin1("rel"), QString::fromLatin1("alternate")) == QString::fromLatin1("alternate")))
00084             {   
00085                 return n.toElement().attribute(QString::fromLatin1("href"));
00086             }
00087         }
00088     }
00089 
00090     return extractNode(node, QString::fromLatin1("link"));
00091     
00092 }
00093 
00094 Document::Document(const QDomDocument &doc) : d(new Private)
00095 {
00096     QString elemText;
00097     QDomNode rootNode = doc.documentElement();
00098 
00099     // Determine the version of the present RSS markup.
00100     QString attr;
00101 
00102     // we should probably check that it ISN'T feed or rss, rather than check if it is xhtml
00103     if (rootNode.toElement().tagName()==QString::fromLatin1("html"))
00104         d->valid=false;
00105     else
00106         d->valid=true;
00107     
00108     attr = rootNode.toElement().attribute(QString::fromLatin1("version"), QString::null);
00109     if (rootNode.toElement().tagName()==QString::fromLatin1("feed"))
00110     {
00111         d->format=AtomFeed;
00112         if (attr == QString::fromLatin1("0.3"))
00113             d->version = vAtom_0_3;
00114         else if (attr == QString::fromLatin1("0.2")) /* smt -> review */
00115             d->version = vAtom_0_2;
00116         else if (attr == QString::fromLatin1("0.1")) /* smt -> review */
00117             d->version = vAtom_0_1;
00118         else
00119             d->version = vAtom_1_0;
00120         }
00121     else
00122     {
00123         d->format=RSSFeed;
00124         if (attr == QString::fromLatin1("0.91"))
00125             d->version = v0_91;
00126         else if (attr == QString::fromLatin1("0.92"))
00127             d->version = v0_92;
00128         else if (attr == QString::fromLatin1("0.93"))
00129             d->version = v0_93;
00130         else if (attr == QString::fromLatin1("0.94"))
00131             d->version = v0_94;
00132         else // otherwise, we just assume a RSS2 compatible feed. As rss2 is generally
00133              // backward-compatible, this should work
00134             d->version = v2_0;
00135     }
00136     
00137     
00138     if (d->format==UnknownFormat)
00139     {
00140         attr = rootNode.toElement().attribute(QString::fromLatin1("xmlns"), QString::null);
00141         if (!attr.isNull()) {
00142         /*
00143          * Hardcoding these URLs is actually a bad idea, since the DTD doesn't
00144          * dictate a specific namespace. Still, most RSS files seem to use
00145          * these two, so I'll go for them now. If it turns out that many
00146          * mirrors of this RSS namespace are in use, I'll probably have to
00147          * distinguish the RSS versions by analyzing the relationship between
00148          * the nodes.
00149          */
00150             if (attr == QString::fromLatin1("http://my.netscape.com/rdf/simple/0.9/")) {
00151                 d->format=RSSFeed;
00152                 d->version = v0_90;
00153              }
00154             else if (attr == QString::fromLatin1("http://purl.org/rss/1.0/")) {
00155                 d->format=RSSFeed;
00156                 d->version = v1_0;
00157             }
00158         }
00159     }
00160     
00161     QDomNode channelNode;
00162 
00163     if (d->format == AtomFeed)
00164         channelNode=rootNode;
00165     else
00166         channelNode=rootNode.namedItem(QString::fromLatin1("channel"));
00167 
00168     if (!(elemText = extractTitle(channelNode)).isNull())
00169         d->title = elemText;
00170     QString descriptionTagName = "description";
00171     
00172     if (d->format == AtomFeed)
00173     {
00174         if (d->version == vAtom_1_0)
00175             descriptionTagName = "subtitle";
00176         else
00177             descriptionTagName = "tagline";
00178     }
00179     
00180     if (!(elemText = extractNode(channelNode, descriptionTagName)).isNull())
00181         d->description = elemText;
00182         
00183     d->link = extractLink(channelNode, d->format);
00184 
00185     
00186     /* This is ugly but necessary since RSS 0.90 and 1.0 have a different parent
00187      * node for <image>, <textinput> and <item> than RSS 0.91-0.94 and RSS 2.0.
00188      */
00189     QDomNode parentNode;
00190     if (d->version == v0_90 || d->version == v1_0 || d->format == AtomFeed)
00191         parentNode = rootNode;
00192     else
00193     {
00194     // following is a HACK for broken 0.91 feeds like xanga.com's
00195     if (!rootNode.namedItem(QString::fromLatin1("item")).isNull())
00196         parentNode = rootNode;
00197     else
00198             parentNode = channelNode;
00199     }
00200     
00201     // image and textinput aren't supported by Atom.. handle in case feed provides
00202     QDomNode n = parentNode.namedItem(QString::fromLatin1("image"));
00203     if (!n.isNull())
00204         d->image = new Image(n);
00205 
00206     n = parentNode.namedItem(QString::fromLatin1("textinput"));
00207     if (!n.isNull())
00208         d->textInput = new TextInput(n);
00209 
00210     // Our (hopefully faster) version of elementsByTagName()
00211     QString tagName;
00212     if (d->format == AtomFeed)
00213         tagName=QString::fromLatin1("entry");
00214     else
00215         tagName=QString::fromLatin1("item");
00216 
00217     for (n = parentNode.firstChild(); !n.isNull(); n = n.nextSibling()) {
00218         const QDomElement e = n.toElement();
00219         if (e.tagName() == tagName)
00220             d->articles.append(Article(e, d->format, d->version));
00221     }
00222 
00223     if (!(elemText = extractNode(channelNode, QString::fromLatin1("copyright"))).isNull())
00224         d->copyright = elemText;
00225 
00226     if (d->format == AtomFeed)
00227         elemText = rootNode.toElement().attribute(QString::fromLatin1("xml:lang"), QString::null);
00228     else
00229         elemText = extractNode(channelNode, QString::fromLatin1("language"));
00230 
00231     if (!elemText.isNull()){
00232         if (elemText == QString::fromLatin1("af"))
00233             d->language = af;
00234         else if (elemText == QString::fromLatin1("sq"))
00235             d->language = sq;
00236         else if (elemText == QString::fromLatin1("eu"))
00237             d->language = eu;
00238         else if (elemText == QString::fromLatin1("be"))
00239             d->language = be;
00240         else if (elemText == QString::fromLatin1("bg"))
00241             d->language = bg;
00242         else if (elemText == QString::fromLatin1("ca"))
00243             d->language = ca;
00244         else if (elemText == QString::fromLatin1("zh-cn"))
00245             d->language = zh_cn;
00246         else if (elemText == QString::fromLatin1("zh-tw"))
00247             d->language = zh_tw;
00248         else if (elemText == QString::fromLatin1("hr"))
00249             d->language = hr;
00250         else if (elemText == QString::fromLatin1("cs"))
00251             d->language = cs;
00252         else if (elemText == QString::fromLatin1("da"))
00253             d->language = da;
00254         else if (elemText == QString::fromLatin1("nl"))
00255             d->language = nl;
00256         else if (elemText == QString::fromLatin1("nl-be"))
00257             d->language = nl_be;
00258         else if (elemText == QString::fromLatin1("nl-nl"))
00259             d->language = nl_nl;
00260         else if (elemText == QString::fromLatin1("en"))
00261             d->language = en;
00262         else if (elemText == QString::fromLatin1("en-au"))
00263             d->language = en_au;
00264         else if (elemText == QString::fromLatin1("en-bz"))
00265             d->language = en_bz;
00266         else if (elemText == QString::fromLatin1("en-ca"))
00267             d->language = en_ca;
00268         else if (elemText == QString::fromLatin1("en-ie"))
00269             d->language = en_ie;
00270         else if (elemText == QString::fromLatin1("en-jm"))
00271             d->language = en_jm;
00272         else if (elemText == QString::fromLatin1("en-nz"))
00273             d->language = en_nz;
00274         else if (elemText == QString::fromLatin1("en-ph"))
00275             d->language = en_ph;
00276         else if (elemText == QString::fromLatin1("en-za"))
00277             d->language = en_za;
00278         else if (elemText == QString::fromLatin1("en-tt"))
00279             d->language = en_tt;
00280         else if (elemText == QString::fromLatin1("en-gb"))
00281             d->language = en_gb;
00282         else if (elemText == QString::fromLatin1("en-us"))
00283             d->language = en_us;
00284         else if (elemText == QString::fromLatin1("en-zw"))
00285             d->language = en_zw;
00286         else if (elemText == QString::fromLatin1("fo"))
00287             d->language = fo;
00288         else if (elemText == QString::fromLatin1("fi"))
00289             d->language = fi;
00290         else if (elemText == QString::fromLatin1("fr"))
00291             d->language = fr;
00292         else if (elemText == QString::fromLatin1("fr-be"))
00293             d->language = fr_be;
00294         else if (elemText == QString::fromLatin1("fr-ca"))
00295             d->language = fr_ca;
00296         else if (elemText == QString::fromLatin1("fr-fr"))
00297             d->language = fr_fr;
00298         else if (elemText == QString::fromLatin1("fr-lu"))
00299             d->language = fr_lu;
00300         else if (elemText == QString::fromLatin1("fr-mc"))
00301             d->language = fr_mc;
00302         else if (elemText == QString::fromLatin1("fr-ch"))
00303             d->language = fr_ch;
00304         else if (elemText == QString::fromLatin1("gl"))
00305             d->language = gl;
00306         else if (elemText == QString::fromLatin1("gd"))
00307             d->language = gd;
00308         else if (elemText == QString::fromLatin1("de"))
00309             d->language = de;
00310         else if (elemText == QString::fromLatin1("de-at"))
00311             d->language = de_at;
00312         else if (elemText == QString::fromLatin1("de-de"))
00313             d->language = de_de;
00314         else if (elemText == QString::fromLatin1("de-li"))
00315             d->language = de_li;
00316         else if (elemText == QString::fromLatin1("de-lu"))
00317             d->language = de_lu;
00318         else if (elemText == QString::fromLatin1("de-ch"))
00319             d->language = de_ch;
00320         else if (elemText == QString::fromLatin1("el"))
00321             d->language = el;
00322         else if (elemText == QString::fromLatin1("hu"))
00323             d->language = hu;
00324         else if (elemText == QString::fromLatin1("is"))
00325             d->language = is;
00326         else if (elemText == QString::fromLatin1("id"))
00327             d->language = id;
00328         else if (elemText == QString::fromLatin1("ga"))
00329             d->language = ga;
00330         else if (elemText == QString::fromLatin1("it"))
00331             d->language = it;
00332         else if (elemText == QString::fromLatin1("it-it"))
00333             d->language = it_it;
00334         else if (elemText == QString::fromLatin1("it-ch"))
00335             d->language = it_ch;
00336         else if (elemText == QString::fromLatin1("ja"))
00337             d->language = ja;
00338         else if (elemText == QString::fromLatin1("ko"))
00339             d->language = ko;
00340         else if (elemText == QString::fromLatin1("mk"))
00341             d->language = mk;
00342         else if (elemText == QString::fromLatin1("no"))
00343             d->language = no;
00344         else if (elemText == QString::fromLatin1("pl"))
00345             d->language = pl;
00346         else if (elemText == QString::fromLatin1("pt"))
00347             d->language = pt;
00348         else if (elemText == QString::fromLatin1("pt-br"))
00349             d->language = pt_br;
00350         else if (elemText == QString::fromLatin1("pt-pt"))
00351             d->language = pt_pt;
00352         else if (elemText == QString::fromLatin1("ro"))
00353             d->language = ro;
00354         else if (elemText == QString::fromLatin1("ro-mo"))
00355             d->language = ro_mo;
00356         else if (elemText == QString::fromLatin1("ro-ro"))
00357             d->language = ro_ro;
00358         else if (elemText == QString::fromLatin1("ru"))
00359             d->language = ru;
00360         else if (elemText == QString::fromLatin1("ru-mo"))
00361             d->language = ru_mo;
00362         else if (elemText == QString::fromLatin1("ru-ru"))
00363             d->language = ru_ru;
00364         else if (elemText == QString::fromLatin1("sr"))
00365             d->language = sr;
00366         else if (elemText == QString::fromLatin1("sk"))
00367             d->language = sk;
00368         else if (elemText == QString::fromLatin1("sl"))
00369             d->language = sl;
00370         else if (elemText == QString::fromLatin1("es"))
00371             d->language = es;
00372         else if (elemText == QString::fromLatin1("es-ar"))
00373             d->language = es_ar;
00374         else if (elemText == QString::fromLatin1("es-bo"))
00375             d->language = es_bo;
00376         else if (elemText == QString::fromLatin1("es-cl"))
00377             d->language = es_cl;
00378         else if (elemText == QString::fromLatin1("es-co"))
00379             d->language = es_co;
00380         else if (elemText == QString::fromLatin1("es-cr"))
00381             d->language = es_cr;
00382         else if (elemText == QString::fromLatin1("es-do"))
00383             d->language = es_do;
00384         else if (elemText == QString::fromLatin1("es-ec"))
00385             d->language = es_ec;
00386         else if (elemText == QString::fromLatin1("es-sv"))
00387             d->language = es_sv;
00388         else if (elemText == QString::fromLatin1("es-gt"))
00389             d->language = es_gt;
00390         else if (elemText == QString::fromLatin1("es-hn"))
00391             d->language = es_hn;
00392         else if (elemText == QString::fromLatin1("es-mx"))
00393             d->language = es_mx;
00394         else if (elemText == QString::fromLatin1("es-ni"))
00395             d->language = es_ni;
00396         else if (elemText == QString::fromLatin1("es-pa"))
00397             d->language = es_pa;
00398         else if (elemText == QString::fromLatin1("es-py"))
00399             d->language = es_py;
00400         else if (elemText == QString::fromLatin1("es-pe"))
00401             d->language = es_pe;
00402         else if (elemText == QString::fromLatin1("es-pr"))
00403             d->language = es_pr;
00404         else if (elemText == QString::fromLatin1("es-es"))
00405             d->language = es_es;
00406         else if (elemText == QString::fromLatin1("es-uy"))
00407             d->language = es_uy;
00408         else if (elemText == QString::fromLatin1("es-ve"))
00409             d->language = es_ve;
00410         else if (elemText == QString::fromLatin1("sv"))
00411             d->language = sv;
00412         else if (elemText == QString::fromLatin1("sv-fi"))
00413             d->language = sv_fi;
00414         else if (elemText == QString::fromLatin1("sv-se"))
00415             d->language = sv_se;
00416         else if (elemText == QString::fromLatin1("tr"))
00417             d->language = tr;
00418         else if (elemText == QString::fromLatin1("uk"))
00419             d->language = uk;
00420         else
00421             d->language = UndefinedLanguage;
00422     }
00423 
00424     if (d->format == AtomFeed)
00425         tagName=QString::fromLatin1("issued"); // atom doesn't specify this for feeds
00426                                                // but some broken feeds do this
00427     else
00428         tagName=QString::fromLatin1("pubDate");
00429 
00430     if (!(elemText = extractNode(channelNode, tagName)).isNull()) {
00431         time_t _time;
00432 
00433         if (d->format == AtomFeed)
00434             _time=parseISO8601Date(elemText);
00435         else
00436             _time=KRFCDate::parseDate(elemText);
00437         /* \bug This isn't really the right way since it will set the date to
00438          * Jan 1 1970, 1:00:00 if the passed date was invalid; this means that
00439          * we cannot distinguish between that date, and invalid values. :-/
00440          */
00441         d->pubDate.setTime_t(_time);
00442     }
00443 
00444     if (!(elemText = extractNode(channelNode, QString::fromLatin1("dc:date"))).isNull()) {
00445         time_t _time = parseISO8601Date(elemText);
00446         /* \bug This isn't really the right way since it will set the date to
00447          * Jan 1 1970, 1:00:00 if the passed date was invalid; this means that
00448          * we cannot distinguish between that date, and invalid values. :-/
00449          */
00450         d->pubDate.setTime_t(_time);
00451     }
00452 
00453     if (d->format == AtomFeed)
00454         tagName=QString::fromLatin1("modified");
00455     else
00456         tagName=QString::fromLatin1("lastBuildDate");
00457     if (!(elemText = extractNode(channelNode, tagName)).isNull()) {
00458         time_t _time;
00459         if (d->format == AtomFeed)
00460             _time = parseISO8601Date(elemText);
00461         else
00462             _time = KRFCDate::parseDate(elemText);
00463         d->lastBuildDate.setTime_t(_time);
00464     }
00465 
00466     if (!(elemText = extractNode(channelNode, QString::fromLatin1("rating"))).isNull())
00467         d->rating = elemText;
00468     if (!(elemText = extractNode(channelNode, QString::fromLatin1("docs"))).isNull())
00469         d->docs = elemText;
00470     if (!(elemText = extractNode(channelNode, QString::fromLatin1((d->format == AtomFeed) ? "author" : "managingEditor"))).isNull())
00471         d->managingEditor = elemText;
00472     if (!(elemText = extractNode(channelNode, QString::fromLatin1("webMaster"))).isNull())
00473         d->webMaster = elemText;
00474 
00475     if (!(elemText = extractNode(channelNode, QString::fromLatin1("ttl"))).isNull())
00476         d->ttl = elemText.toUInt();
00477 
00478     n = channelNode.namedItem(QString::fromLatin1("skipHours"));
00479     if (!n.isNull())
00480         for (QDomElement e = n.firstChild().toElement(); !e.isNull(); e = e.nextSibling().toElement())
00481             if (e.tagName() == QString::fromLatin1("hour"))
00482                 d->skipHours.append(e.text().toUInt());
00483 
00484     n = channelNode.namedItem(QString::fromLatin1("skipDays"));
00485     if (!n.isNull()) {
00486         Day day;
00487         QString elemText;
00488         for (QDomElement e = n.firstChild().toElement(); !e.isNull(); e = e.nextSibling().toElement())
00489             if (e.tagName() == QString::fromLatin1("day")) {
00490                 elemText = e.text().lower();
00491                 if (elemText == QString::fromLatin1("monday"))
00492                     day = Monday;
00493                 else if (elemText == QString::fromLatin1("tuesday"))
00494                     day = Tuesday;
00495                 else if (elemText == QString::fromLatin1("wednesday"))
00496                     day = Wednesday;
00497                 else if (elemText == QString::fromLatin1("thursday"))
00498                     day = Thursday;
00499                 else if (elemText == QString::fromLatin1("friday"))
00500                     day = Friday;
00501                 else if (elemText == QString::fromLatin1("saturday"))
00502                     day = Saturday;
00503                 else if (elemText == QString::fromLatin1("sunday"))
00504                     day = Sunday;
00505                 else
00506                     day = UndefinedDay;
00507                 if (day != UndefinedDay)
00508                     d->skipDays.append(day);
00509             }
00510     }
00511 }
00512 
00513 Document::~Document()
00514 {
00515     if (d->deref())
00516         delete d;
00517 }
00518 
00519 bool Document::isValid() const
00520 {
00521     return d->valid;
00522 }
00523 
00524 Version Document::version() const
00525 {
00526     return d->version;
00527 }
00528 
00529 QString Document::verbVersion() const
00530 {
00531     switch (d->version) {
00532         case v0_90: return QString::fromLatin1("0.90");
00533         case v0_91: return QString::fromLatin1("0.91");
00534         case v0_92: return QString::fromLatin1("0.92");
00535         case v0_93: return QString::fromLatin1("0.93");
00536         case v0_94: return QString::fromLatin1("0.94");
00537         case v1_0: return QString::fromLatin1("1.0");
00538         case v2_0: return QString::fromLatin1("2.0");
00539         case vAtom_0_3: return QString::fromLatin1("0.3");
00540         case vAtom_0_2: return QString::fromLatin1("0.2");
00541         case vAtom_0_1: return QString::fromLatin1("0.1");
00542         case vAtom_1_0: return QString::fromLatin1("1.0");
00543     }
00544     return QString::null;
00545 }
00546 
00547 QString Document::title() const
00548 {
00549     return d->title;
00550 }
00551 
00552 QString Document::description() const
00553 {
00554     return d->description;
00555 }
00556 
00557 const KURL &Document::link() const
00558 {
00559     return d->link;
00560 }
00561 
00562 Image *Document::image()
00563 {
00564     return d->image;
00565 }
00566 
00567 const Image *Document::image() const
00568 {
00569     return d->image;
00570 }
00571 
00572 TextInput *Document::textInput()
00573 {
00574     return d->textInput;
00575 }
00576 
00577 const TextInput *Document::textInput() const
00578 {
00579     return d->textInput;
00580 }
00581 
00582 const Article::List &Document::articles() const
00583 {
00584     return d->articles;
00585 }
00586 
00587 Language Document::language() const
00588 {
00589     return d->language;
00590 }
00591 
00592 QString Document::copyright() const
00593 {
00594     return d->copyright;
00595 }
00596 
00597 const QDateTime &Document::pubDate() const
00598 {
00599     return d->pubDate;
00600 }
00601 
00602 const QDateTime &Document::lastBuildDate() const
00603 {
00604     return d->lastBuildDate;
00605 }
00606 
00607 QString Document::rating() const
00608 {
00609     return d->rating;
00610 }
00611 
00612 const KURL &Document::docs() const
00613 {
00614     return d->docs;
00615 }
00616 
00617 QString Document::managingEditor() const
00618 {
00619     return d->managingEditor;
00620 }
00621 
00622 QString Document::webMaster() const
00623 {
00624     return d->webMaster;
00625 }
00626 
00627 const HourList &Document::skipHours() const
00628 {
00629     return d->skipHours;
00630 }
00631 
00632 const DayList &Document::skipDays() const
00633 {
00634     return d->skipDays;
00635 }
00636 
00637 int Document::ttl() const
00638 {
00639     return d->ttl;
00640 }
00641 
00642 Document &Document::operator=(const Document &other)
00643 {
00644     if (this != &other) {
00645         other.d->ref();
00646         if (d && d->deref())
00647             delete d;
00648         d = other.d;
00649     }
00650     return *this;
00651 }
00652 
00653 // vim:noet:ts=4
KDE Home | KDE Accessibility Home | Description of Access Keys