akregator/src/librss

feeddetector.cpp

00001 /*
00002     This file is part of Akregator.
00003 
00004     Copyright (C) 2004 Teemu Rytilahti <tpr@d5k.net>
00005 
00006     This program is free software; you can redistribute it and/or modify
00007     it under the terms of the GNU General Public License as published by
00008     the Free Software Foundation; either version 2 of the License, or
00009     (at your option) any later version.
00010 
00011     This program is distributed in the hope that it will be useful,
00012     but WITHOUT ANY WARRANTY; without even the implied warranty of
00013     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
00014     GNU General Public License for more details.
00015 
00016     You should have received a copy of the GNU General Public License
00017     along with this program; if not, write to the Free Software
00018     Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
00019 
00020     As a special exception, permission is given to link this program
00021     with any edition of Qt, and distribute the resulting executable,
00022     without including the source code for Qt in the source distribution.
00023 */
00024  
00025 #include <qregexp.h>
00026 #include <qstring.h>
00027 #include <qstringlist.h>
00028 #include <qvaluelist.h>
00029 #include <kcharsets.h>
00030 #include <kurl.h>
00031 
00032 #include "feeddetector.h"
00033 
00034 
00035 using namespace RSS;
00036 
00037 FeedDetectorEntryList FeedDetector::extractFromLinkTags(const QString& s)   
00038 {
00039     //reduce all sequences of spaces, newlines etc. to one space:
00040     QString str = s.simplifyWhiteSpace();
00041 
00042     // extracts <link> tags
00043     QRegExp reLinkTag("<[\\s]?LINK[^>]*REL[\\s]?=[\\s]?\\\"[\\s]?(ALTERNATE|SERVICE\\.FEED)[\\s]?\\\"[^>]*>", false);
00044 
00045     // extracts the URL (href="url")
00046     QRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
00047     // extracts type attribute
00048     QRegExp reType("TYPE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
00049     // extracts the title (title="title")
00050     QRegExp reTitle("TITLE[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
00051 
00052     int pos = 0;
00053     int matchpos = 0;
00054 
00055     // get all <link> tags
00056     QStringList linkTags;
00057     //int strlength = str.length();
00058     while ( matchpos != -1 )
00059     {
00060         matchpos = reLinkTag.search(str, pos);
00061         if (matchpos != -1)
00062         {
00063             linkTags.append( str.mid(matchpos, reLinkTag.matchedLength()) );
00064             pos = matchpos + reLinkTag.matchedLength();
00065         }
00066     }
00067 
00068     FeedDetectorEntryList list;
00069 
00070     for ( QStringList::Iterator it = linkTags.begin(); it != linkTags.end(); ++it )
00071     {
00072         QString type;
00073         int pos = reType.search(*it, 0);
00074         if (pos != -1)
00075             type = reType.cap(1).lower();
00076 
00077         // we accept only type attributes indicating a feed
00078         if ( type != "application/rss+xml" && type != "application/rdf+xml"
00079           && type != "application/atom+xml" && type != "text/xml" )
00080             continue;
00081                 
00082         QString title;
00083         pos = reTitle.search(*it, 0);
00084         if (pos != -1)
00085         title = reTitle.cap(1);
00086 
00087         title = KCharsets::resolveEntities(title);
00088 
00089         QString url;
00090         pos = reHref.search(*it, 0);
00091         if (pos != -1)
00092             url = reHref.cap(1);
00093 
00094         url = KCharsets::resolveEntities(url);
00095 
00096         // if feed has no title, use the url as preliminary title (until feed is parsed)
00097         if ( title.isEmpty() )
00098             title = url;
00099 
00100         if ( !url.isEmpty() )
00101             list.append(FeedDetectorEntry(url, title) );        
00102     }
00103 
00104 
00105     return list;
00106 }
00107 
00108 QStringList FeedDetector::extractBruteForce(const QString& s)
00109 {
00110     QString str = s.simplifyWhiteSpace();
00111     
00112     QRegExp reAhrefTag("<[\\s]?A[^>]?HREF=[\\s]?\\\"[^\\\"]*\\\"[^>]*>", false);
00113     
00114     // extracts the URL (href="url")
00115     QRegExp reHref("HREF[\\s]?=[\\s]?\\\"([^\\\"]*)\\\"", false);
00116 
00117     QRegExp rssrdfxml(".*(RSS|RDF|XML)", false);
00118 
00119     int pos = 0;
00120     int matchpos = 0;
00121     
00122     // get all <a href> tags and capture url
00123     QStringList list;
00124     //int strlength = str.length();
00125     while ( matchpos != -1 )
00126     {
00127         matchpos = reAhrefTag.search(str, pos);
00128         if ( matchpos != -1 )
00129         {
00130             QString ahref = str.mid(matchpos, reAhrefTag.matchedLength());
00131             int hrefpos = reHref.search(ahref, 0);
00132             if ( hrefpos != -1 )
00133             {
00134                 QString url = reHref.cap(1);
00135 
00136                 url = KCharsets::resolveEntities(url);
00137 
00138                 if ( rssrdfxml.exactMatch(url) )
00139                     list.append(url);
00140             }
00141 
00142             pos = matchpos + reAhrefTag.matchedLength();
00143         }
00144     }
00145     
00146     return list;
00147 }
00148 
00149 QString FeedDetector::fixRelativeURL(const QString &s, const KURL &baseurl)
00150 {
00151     QString s2=s;
00152     KURL u;
00153     if (KURL::isRelativeURL(s2))
00154     {
00155         if (s2.startsWith("//"))
00156         {
00157             s2=s2.prepend(baseurl.protocol()+":");
00158             u=s2;
00159         }
00160         else if (s2.startsWith("/"))
00161         {
00162             KURL b2(baseurl);
00163             b2.setPath(QString()); // delete path and query, so that only protocol://host remains
00164             b2.setQuery(QString());
00165             u = KURL(b2, s2.remove(0,1)); // remove leading "/" 
00166         }
00167         else
00168         {
00169             u = KURL(baseurl, s2);
00170         }
00171     }
00172     else
00173         u=s2;
00174 
00175     u.cleanPath();
00176     //kdDebug() << "AKREGATOR_PLUGIN_FIXURL: " << "url=" << s << " baseurl=" << baseurl.url() << " fixed=" << u.url() << 
00177     //endl;
00178     return u.url();
00179 }
KDE Home | KDE Accessibility Home | Description of Access Keys