# /home/phil/devel/fmtorrent/fmtorrent/master/aggregator/feedfinder.py

"""Ultra-liberal feed finder

http://diveintomark.org/projects/feed_finder/

Usage:
getFeeds(uri) - returns list of feeds associated with this address

Example:
>>> import feedfinder
>>> feedfinder.getFeeds('http://diveintomark.org/')
['http://diveintomark.org/xml/atom.xml']
>>> feedfinder.getFeeds('macnn.com')
['http://www.macnn.com/macnn.rdf']

Can also use from the command line.  Feeds are returned one per line:
$ python feedfinder.py diveintomark.org
http://diveintomark.org/xml/atom.xml

How it works:
0. At every step, feeds are minimally verified to make sure they are really feeds.
1. If the URI points to a feed, it is simply returned; otherwise
   the page is downloaded and the real fun begins.
2. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
3. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or ".atom"
4. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
5. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or ".atom"
6. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
7. As a last ditch effort, and only when querySyndic8 is passed to getFeeds,
   we search Syndic8 for feeds matching the URI

"""

0031

# Module metadata.  __version__ is also embedded in the User-agent string
# built by URLGatekeeper.
__version__ = "1.2"
__date__ = "2004-01-09"
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
__history__ = """
1.1 - MAP - 2003/02/20 - added support for Robot Exclusion Standard.  Will
fetch /robots.txt once per domain and verify that URLs are allowed to be
downloaded.  Identifies itself as
  rssfinder/<version> Python-urllib/<version> +http://diveintomark.org/projects/rss_finder/
1.2 - MAP - 2004-01-09 - added Atom support, changed name, relicensed,
  don't query Syndic8 by default (pass querySyndic8=1 to getFeeds to do it anyway)
"""

# Set to a true value to make _debuglog() print its messages.
_debug = 0

0049

0050# ---------- required modules (should come with any Python distribution) ----------

0051import sgmllib, urllib, urlparse, re, sys, robotparser

0052

# ---------- optional modules (feedfinder will work without these, but with reduced functionality) ----------

# timeoutsocket allows feedfinder to time out rather than hang forever on ultra-slow servers.
# Python 2.3 now has this functionality available in the standard socket library, so under
# 2.3 you don't need to install anything.
# NOTE: this runs at import time and sets a 10 second default timeout for
# every socket created afterwards in this process, not only feedfinder's.
import socket
if hasattr(socket, 'setdefaulttimeout'):
    socket.setdefaulttimeout(10)
else:
    try:
        import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
        timeoutsocket.setDefaultSocketTimeout(10)
    except ImportError:
        # no timeout support at all: requests may block indefinitely
        pass

0067

# XML-RPC support allows feedfinder to query Syndic8 for possible matches.
# Python 2.3 now comes with this module by default, otherwise you can download it
try:
    import xmlrpclib # http://www.pythonware.com/products/xmlrpc/
except ImportError:
    # None marks XML-RPC support as unavailable
    xmlrpclib = None

0074

# Fallback for interpreters that have no dict() builtin.  The name has to be
# probed with try/except: evaluating a missing name raises NameError, and on
# interpreters that do have dict() the builtin is always true, so a plain
# truth test could never select the fallback.
try:
    dict
except NameError:
    def dict(aList):
        """Build a dictionary from a sequence of (key, value) pairs."""
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

0081

def _debuglog(message):
    """Print message when the module-level _debug flag is true; otherwise do nothing."""
    if not _debug:
        return
    # a single parenthesised argument prints the same under the print
    # statement and the print function
    print(message)

0084

class RobotFileParserFixed(robotparser.RobotFileParser):
    """patched version of RobotFileParser, integrating fixes from Python 2.3a2 and bug 690214"""

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url

        Returns 0 when fetching is disallowed and 1 when it is allowed.
        Every entry that applies to useragent is checked and any one of them
        may veto the URL.  When no entry names the agent, the parser's
        catch-all entry (default_entry, where the base class provides one)
        decides; without it, access is granted.
        """
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # rules are matched against the re-quoted path component only
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        matched = 0
        for entry in self.entries:
            if entry.applies_to(useragent):
                matched = 1
                if not entry.allowance(url):
                    return 0
        if not matched:
            # Some versions of the base class keep the "User-agent: *" entry
            # in default_entry instead of self.entries; without this check
            # those wildcard rules would never be honoured.
            default_entry = getattr(self, 'default_entry', None)
            if default_entry is not None and not default_entry.allowance(url):
                return 0
        # agent not found, or nothing vetoed the URL ==> access granted
        return 1

0103

class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""

    def __init__(self):
        """Create the URL opener and install feedfinder's User-agent string."""
        self.rpcache = {} # a dictionary of RobotFileParserFixed objects, by domain
        self.urlopener = urllib.FancyURLopener()
        self.urlopener.version = "feedfinder/" + __version__ + " " + self.urlopener.version + " +http://diveintomark.org/projects/feed_finder/"
        _debuglog(self.urlopener.version)
        self.urlopener.addheaders = [('User-agent', self.urlopener.version)]
        # robotparser fetches robots.txt through its own opener class; give it
        # the same identity (this changes class attributes on that module)
        robotparser.URLopener.version = self.urlopener.version
        robotparser.URLopener.addheaders = self.urlopener.addheaders

    def _getrp(self, url):
        """Return the robots.txt parser for url's domain, fetching it on first use."""
        protocol, domain = urlparse.urlparse(url)[:2]
        if domain in self.rpcache:
            return self.rpcache[domain]
        baseurl = '%s://%s' % (protocol, domain)
        robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
        _debuglog('fetching %s' % robotsurl)
        rp = RobotFileParserFixed(robotsurl)
        rp.read()
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        """Return a true value when robots.txt lets this user agent fetch url."""
        rp = self._getrp(url)
        allow = rp.can_fetch(self.urlopener.version, url)
        _debuglog("Gatekeeper examined %s and said %s" % (url, allow))
        return allow

    def get(self, url):
        """Return the body of url, or '' when robots.txt forbids fetching it."""
        if not self.can_fetch(url):
            return ''
        response = self.urlopener.open(url)
        try:
            return response.read()
        finally:
            # release the connection right away instead of waiting for
            # garbage collection
            response.close()

0136

# Module-wide gatekeeper instance used by the fetching helpers in this file,
# so each domain's robots.txt is fetched only once.
_gatekeeper = URLGatekeeper()

0138

0139class BaseParser(sgmllib.SGMLParser):

0140    def __init__(self, baseuri):

0141        sgmllib.SGMLParser.__init__(self)

0142        self.links = []

0143        self.baseuri = baseuri

0144

0145    def normalize_attrs(self, attrs):

0146        attrs = [(k.lower(), sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v).strip()) for k, v in attrs]

0147        attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs]

0148        return attrs

0149

0150    def do_base(self, attrs):

0151        attrsD = dict(self.normalize_attrs(attrs))

0152        if not attrsD.has_key('href'): return

0153        self.baseuri = attrsD['href']

0154

class LinkParser(BaseParser):
    """Collects feed autodiscovery links (<link rel="alternate" ...>)."""

    # MIME types accepted in the type attribute of an autodiscovery link
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')

    def do_link(self, attrs):
        """Handle <link>: record its href when it advertises a feed.

        The tag must carry rel containing "alternate", a type listed in
        FEED_TYPES and an href; the href is resolved against self.baseuri.
        """
        attrsD = dict(self.normalize_attrs(attrs))
        if 'alternate' not in attrsD.get('rel', '').split():
            return
        if attrsD.get('type') not in self.FEED_TYPES:
            return
        if 'href' not in attrsD:
            return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

0169

class ALinkParser(BaseParser):
    """Collects the target of every <a href> on the page."""

    def start_a(self, attrs):
        """Handle <a>: record its href, resolved against self.baseuri."""
        attrsD = dict(self.normalize_attrs(attrs))
        if 'href' in attrsD:
            self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

0175

def makeFullURI(uri):
    """Return uri with 'http://' prepended unless it already has an http(s) scheme.

    The scheme is compared case-insensitively, so 'HTTP://example.com' is
    returned unchanged rather than turned into 'http://HTTP://example.com'.
    """
    lowered = uri.lower()
    if lowered.startswith('http://') or lowered.startswith('https://'):
        return uri
    return 'http://%s' % uri

0180

def getLinks(data, baseuri):
    """Return the feed autodiscovery links found by LinkParser in data.

    baseuri is the URI relative links are resolved against.
    """
    parser = LinkParser(baseuri)
    parser.feed(data)
    return parser.links

0185

def getALinks(data, baseuri):
    """Return the <a href> targets found by ALinkParser in data.

    baseuri is the URI relative links are resolved against.
    """
    parser = ALinkParser(baseuri)
    parser.feed(data)
    return parser.links

0190

def getLocalLinks(links, baseuri):
    """Return the links that start with baseuri, compared case-insensitively.

    This is a plain string-prefix test on the whole of baseuri, not a
    comparison of host names.  The returned links keep their original case.
    """
    prefix = baseuri.lower()
    return [link for link in links if link.lower().startswith(prefix)]

0195

def isFeedLink(link):
    """Return true when link ends in '.rss', '.rdf', '.xml' or '.atom' (any case).

    Each suffix is tested at its own length; comparing a fixed four-character
    tail could never match the five-character '.atom'.
    """
    lowered = link.lower()
    for suffix in ('.rss', '.rdf', '.xml', '.atom'):
        if lowered.endswith(suffix):
            return True
    return False

0198

def isXMLRelatedLink(link):
    """Return how often 'rss', 'rdf', 'xml' or 'atom' occur in link (any case).

    The result is 0, and therefore false, when none of them appears.
    """
    lowered = link.lower()
    hits = 0
    for marker in ('rss', 'rdf', 'xml', 'atom'):
        hits = hits + lowered.count(marker)
    return hits

0202

def couldBeFeedData(data):
    """Return a true value when data looks like a feed rather than a web page.

    Anything containing '<html' is rejected with 0.  Otherwise the result is
    the number of '<rss', '<rdf' and '<feed' occurrences, compared
    case-insensitively, which is 0 when none is present.
    """
    lowered = data.lower()
    if lowered.count('<html'):
        return 0
    hits = 0
    for marker in ('<rss', '<rdf', '<feed'):
        hits = hits + lowered.count(marker)
    return hits

0207

def isFeed(uri):
    """Download uri and return a true value when its content looks like a feed.

    Only http and https URIs are fetched; any other scheme yields 0.  A
    candidate that cannot be downloaded also yields 0, so a single dead link
    does not abort the search over the remaining candidates.
    """
    _debuglog('verifying that %s is a feed' % uri)
    scheme = urlparse.urlparse(uri)[0]
    if scheme not in ('http', 'https'):
        return 0
    try:
        data = _gatekeeper.get(uri)
    except (IOError, socket.error):
        # unreachable server, timeout, ...: treat it as "not a feed"
        _debuglog('could not fetch %s' % uri)
        return 0
    return couldBeFeedData(data)

0214

def sortFeeds(feed1Info, feed2Info):
    """Comparison function ordering feed info dicts by descending 'headlines_rank'.

    Returns a negative, zero or positive number in the manner of cmp(), with
    the arguments swapped so that the larger rank sorts first.
    """
    rank1 = feed1Info['headlines_rank']
    rank2 = feed2Info['headlines_rank']
    # same result as cmp(rank2, rank1)
    return (rank2 > rank1) - (rank2 < rank1)

0217

def getFeedsFromSyndic8(uri):
    """Ask the Syndic8 XML-RPC service for feeds matching uri.

    Returns the 'dataurl' of every feed whose status is 'Syndicated', sorted
    with sortFeeds.  The lookup is best effort: when xmlrpclib is unavailable
    or the query fails, an empty list is returned and the reason goes to the
    debug log.
    """
    if xmlrpclib is None:
        _debuglog('xmlrpclib is not available, cannot query Syndic8')
        return []
    feeds = []
    try:
        server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
        infolist.sort(sortFeeds)
        feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
        _debuglog('found %s feeds through Syndic8' % len(feeds))
    except Exception:
        # still best effort, but no longer silent
        _debuglog('Syndic8 lookup failed: %s' % sys.exc_info()[1])
    return feeds

0230

def getFeeds(uri, querySyndic8=0):
    """Return the list of feed URIs associated with uri.

    uri may omit the 'http://' prefix.  When uri itself points to a feed it
    is the only result.  Otherwise the page is searched in this order,
    stopping at the first step that yields a verified feed:

    1. LINK autodiscovery tags
    2. <A> links under uri that end in a feed extension
    3. <A> links under uri that mention rss, rdf, xml or atom
    4. any <A> links that end in a feed extension
    5. any <A> links that mention rss, rdf, xml or atom
    6. Syndic8, only when querySyndic8 is true

    Every candidate is downloaded and checked with isFeed before it is
    returned.
    """
    fulluri = makeFullURI(uri)
    data = _gatekeeper.get(fulluri)
    # is this already a feed?
    if couldBeFeedData(data):
        return [fulluri]
    # nope, it's a page, try LINK tags first
    _debuglog('looking for LINK tags')
    candidates = getLinks(data, fulluri)
    _debuglog('found %s feeds through LINK tags' % len(candidates))
    feeds = [link for link in candidates if isFeed(link)]
    if not feeds:
        # no LINK tags, look for regular <A> links that point to feeds
        _debuglog('no LINK tags, looking at A tags')
        links = getALinks(data, fulluri)
        locallinks = getLocalLinks(links, fulluri)
        # (pool of links, cheap pre-filter), from most to least promising
        searches = (
            (locallinks, isFeedLink),        # obvious feed links on the same server
            (locallinks, isXMLRelatedLink),  # look harder on the same server
            (links, isFeedLink),             # obvious feed links on another server
            (links, isXMLRelatedLink),       # look harder on another server
        )
        for pool, looksPromising in searches:
            # the cheap test runs first, so only promising links are downloaded
            feeds = [link for link in pool if looksPromising(link) and isFeed(link)]
            if feeds:
                break
    if not feeds and querySyndic8:
        # still no luck, search Syndic8 for feeds (requires xmlrpclib)
        _debuglog('still no luck, searching Syndic8')
        feeds = getFeedsFromSyndic8(uri)
    return feeds

0263

##### test harness ######

def test():
    """Walk the online Atom autodiscovery test pages and report failures.

    Starts at a fixed test page and follows each page's rel="next" link.
    For every page exactly one autodiscovery link must be found, and the
    feed it points to must link back to the page.  Prints a dot per page, a
    line per failure and a final summary.  Needs network access.
    """
    uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
    failed = []
    count = 0
    while 1:
        data = urllib.urlopen(uri).read()
        if data.find('Atom autodiscovery test') == -1:
            break
        sys.stdout.write('.')
        count += 1
        links = getLinks(data, uri)
        problem = None
        if not links:
            problem = 'could not find link'
        elif len(links) > 1:
            problem = 'found too many links'
        else:
            atomdata = urllib.urlopen(links[0]).read()
            if atomdata.find('<link rel="alternate"') == -1:
                problem = 'retrieved something that is not a feed'
            else:
                # the value of the last href="..." in the feed is compared
                # with the page being tested
                backlink = atomdata.split('href="')[-1].split('"')[0]
                if backlink != uri:
                    problem = 'retrieved wrong feed'
        if problem is not None:
            sys.stdout.write('\n*** FAILED *** %s %s\n' % (uri, problem))
            failed.append(uri)
        if data.find('<link rel="next" href="') == -1:
            break
        nexthref = data.split('<link rel="next" href="')[-1].split('"')[0]
        uri = urlparse.urljoin(uri, nexthref)
    sys.stdout.write('\n')
    sys.stdout.write('%s tests executed, %s failed\n' % (count, len(failed)))

0296

if __name__ == '__main__':
    # Command line use: the first argument is the URI to search, defaulting
    # to diveintomark.org; the special value 'test' runs the test harness.
    uri = 'http://diveintomark.org/'
    if sys.argv[1:]:
        uri = sys.argv[1]
    if uri == 'test':
        test()
    else:
        # feeds are printed one per line
        print("\n".join(getFeeds(uri)))