0001"""Ultra-liberal feed finder
0002
0003http://diveintomark.org/projects/feed_finder/
0004
0005Usage:
0006getFeeds(uri) - returns list of feeds associated with this address
0007
0008Example:
0009>>> import feedfinder
0010>>> feedfinder.getFeeds('http://diveintomark.org/')
0011['http://diveintomark.org/xml/atom.xml']
0012>>> feedfinder.getFeeds('macnn.com')
0013['http://www.macnn.com/macnn.rdf']
0014
0015Can also use from the command line. Feeds are returned one per line:
0016$ python feedfinder.py diveintomark.org
0017http://diveintomark.org/xml/atom.xml
0018
0019How it works:
00200. At every step, feeds are minimally verified to make sure they are really feeds.
00211. If the URI points to a feed, it is simply returned; otherwise
0022 the page is downloaded and the real fun begins.
00232. Feeds pointed to by LINK tags in the header of the page (autodiscovery)
00243. <A> links to feeds on the same server ending in ".rss", ".rdf", ".xml", or ".atom"
00254. <A> links to feeds on the same server containing "rss", "rdf", "xml", or "atom"
00265. <A> links to feeds on external servers ending in ".rss", ".rdf", ".xml", or ".atom"
00276. <A> links to feeds on external servers containing "rss", "rdf", "xml", or "atom"
00287. As a last ditch effort, we search Syndic8 for feeds matching the URI
0029
0030"""

__version__ = "1.2"
__date__ = "2004-01-09"
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__copyright__ = "Copyright 2002-4, Mark Pilgrim"
__license__ = "Python"
__credits__ = """Abe Fettig for a patch to sort Syndic8 feeds by popularity
Also Jason Diamond, Brian Lalor for bug reporting and patches"""
__history__ = """
1.1 - MAP - 2003/02/20 - added support for Robot Exclusion Standard. Will
fetch /robots.txt once per domain and verify that URLs are allowed to be
downloaded. Identifies itself as
  rssfinder/<version> Python-urllib/<version> +http://diveintomark.org/projects/rss_finder/
1.2 - MAP - 2004-01-09 - added Atom support, changed name, relicensed,
  don't query Syndic8 by default (pass querySyndic8=1 to getFeeds to do it anyway)
"""

_debug = 0

import sgmllib, urllib, urlparse, re, sys, robotparser

# set a socket timeout so a hung server does not stall feed discovery;
# fall back to the third-party timeoutsocket module on older Pythons
import socket
if hasattr(socket, 'setdefaulttimeout'):
    socket.setdefaulttimeout(10)
else:
    try:
        import timeoutsocket
        timeoutsocket.setDefaultSocketTimeout(10)
    except ImportError:
        pass

# xmlrpclib is only needed for the optional Syndic8 fallback (step 7)
try:
    import xmlrpclib
except ImportError:
    xmlrpclib = None

# provide a dict() constructor on pre-2.2 Pythons that lack the builtin
try:
    dict
except NameError:
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

def _debuglog(message):
    if _debug: print message

class RobotFileParserFixed(robotparser.RobotFileParser):
    """patched version of RobotFileParser, integrating fixes from Python 2.3a2 and bug 690214"""

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return 0
        if self.allow_all:
            return 1
        # find the first entry that applies to this user agent;
        # that entry's rules decide whether the URL may be fetched
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                if not entry.allowance(url):
                    return 0
        # no matching entry ==> access granted
        return 1

class URLGatekeeper:
    """a class to track robots.txt rules across multiple servers"""
    def __init__(self):
        self.rpcache = {} # cache of RobotFileParserFixed objects, keyed by domain
        self.urlopener = urllib.FancyURLopener()
        self.urlopener.version = "feedfinder/" + __version__ + " " + self.urlopener.version + " +http://diveintomark.org/projects/feed_finder/"
        _debuglog(self.urlopener.version)
        self.urlopener.addheaders = [('User-agent', self.urlopener.version)]
        robotparser.URLopener.version = self.urlopener.version
        robotparser.URLopener.addheaders = self.urlopener.addheaders

    def _getrp(self, url):
        protocol, domain = urlparse.urlparse(url)[:2]
        if self.rpcache.has_key(domain):
            return self.rpcache[domain]
        baseurl = '%s://%s' % (protocol, domain)
        robotsurl = urlparse.urljoin(baseurl, 'robots.txt')
        _debuglog('fetching %s' % robotsurl)
        rp = RobotFileParserFixed(robotsurl)
        rp.read()
        self.rpcache[domain] = rp
        return rp

    def can_fetch(self, url):
        rp = self._getrp(url)
        allow = rp.can_fetch(self.urlopener.version, url)
        _debuglog("Gatekeeper examined %s and said %s" % (url, allow))
        return allow

    def get(self, url):
        if not self.can_fetch(url): return ''
        return self.urlopener.open(url).read()

_gatekeeper = URLGatekeeper()
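
# A small illustration (hypothetical URL): every fetch in this module goes
# through this single gatekeeper, so robots.txt is downloaded at most once per
# domain and disallowed URLs simply come back as empty strings:
#
#   html = _gatekeeper.get('http://example.org/')  # '' if robots.txt forbids it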

class BaseParser(sgmllib.SGMLParser):
    def __init__(self, baseuri):
        sgmllib.SGMLParser.__init__(self)
        self.links = []
        self.baseuri = baseuri

    def normalize_attrs(self, attrs):
        # resolve numeric character references, lowercase attribute names, and
        # lowercase the values of 'rel' and 'type' so later comparisons are
        # case-insensitive
        attrs = [(k.lower(), sgmllib.charref.sub(lambda m: chr(int(m.groups()[0])), v).strip()) for k, v in attrs]
        attrs = [(k, k in ('rel','type') and v.lower() or v) for k, v in attrs]
        return attrs

    def do_base(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.baseuri = attrsD['href']

class LinkParser(BaseParser):
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')
    def do_link(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('rel'): return
        rels = attrsD['rel'].split()
        if 'alternate' not in rels: return
        if attrsD.get('type') not in self.FEED_TYPES: return
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

class ALinkParser(BaseParser):
    def start_a(self, attrs):
        attrsD = dict(self.normalize_attrs(attrs))
        if not attrsD.has_key('href'): return
        self.links.append(urlparse.urljoin(self.baseuri, attrsD['href']))

def makeFullURI(uri):
    if (not uri.startswith('http://')) and (not uri.startswith('https://')):
        uri = 'http://%s' % uri
    return uri
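
# For example (hypothetical inputs):
#
#   makeFullURI('macnn.com')          # -> 'http://macnn.com'
#   makeFullURI('https://macnn.com')  # -> unchanged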

def getLinks(data, baseuri):
    p = LinkParser(baseuri)
    p.feed(data)
    return p.links

def getALinks(data, baseuri):
    p = ALinkParser(baseuri)
    p.feed(data)
    return p.links

def getLocalLinks(links, baseuri):
    baseuri = baseuri.lower()
    return [l for l in links if l.lower().startswith(baseuri)]

def isFeedLink(link):
    # note: '.atom' is five characters, so it needs its own check
    link = link.lower()
    return link[-4:] in ('.rss', '.rdf', '.xml') or link[-5:] == '.atom'

def isXMLRelatedLink(link):
    link = link.lower()
    return link.count('rss') + link.count('rdf') + link.count('xml') + link.count('atom')

def couldBeFeedData(data):
    data = data.lower()
    if data.count('<html'): return 0
    return data.count('<rss') + data.count('<rdf') + data.count('<feed')
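
# A quick illustration of the heuristic above (string literals are hypothetical):
#
#   couldBeFeedData('<?xml version="1.0"?><rss version="2.0"></rss>')   # nonzero, looks like a feed
#   couldBeFeedData('<html><head><title>Home</title></head></html>')    # 0, HTML page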

def isFeed(uri):
    _debuglog('verifying that %s is a feed' % uri)
    protocol = urlparse.urlparse(uri)
    if protocol[0] not in ('http', 'https'): return 0
    data = _gatekeeper.get(uri)
    return couldBeFeedData(data)

def sortFeeds(feed1Info, feed2Info):
    # sort Syndic8 results by headlines_rank, most popular first
    return cmp(feed2Info['headlines_rank'], feed1Info['headlines_rank'])

def getFeedsFromSyndic8(uri):
    feeds = []
    try:
        server = xmlrpclib.Server('http://www.syndic8.com/xmlrpc.php')
        feedids = server.syndic8.FindFeeds(uri)
        infolist = server.syndic8.GetFeedInfo(feedids, ['headlines_rank','status','dataurl'])
        infolist.sort(sortFeeds)
        feeds = [f['dataurl'] for f in infolist if f['status']=='Syndicated']
        _debuglog('found %s feeds through Syndic8' % len(feeds))
    except:
        pass
    return feeds

def getFeeds(uri, querySyndic8=0):
    fulluri = makeFullURI(uri)
    data = _gatekeeper.get(fulluri)
    # is this already a feed?
    if couldBeFeedData(data):
        return [fulluri]
    # nope, it's a page; try LINK tags first (autodiscovery)
    _debuglog('looking for LINK tags')
    feeds = getLinks(data, fulluri)
    _debuglog('found %s feeds through LINK tags' % len(feeds))
    feeds = filter(isFeed, feeds)
    if not feeds:
        # no LINK tags, look at <A> links instead
        _debuglog('no LINK tags, looking at A tags')
        links = getALinks(data, fulluri)
        locallinks = getLocalLinks(links, fulluri)
        # look for obvious feed links on the same server
        feeds = filter(isFeed, filter(isFeedLink, locallinks))
        if not feeds:
            # look harder for feed links on the same server
            feeds = filter(isFeed, filter(isXMLRelatedLink, locallinks))
        if not feeds:
            # look for obvious feed links on external servers
            feeds = filter(isFeed, filter(isFeedLink, links))
        if not feeds:
            # look harder for feed links on external servers
            feeds = filter(isFeed, filter(isXMLRelatedLink, links))
    if not feeds and querySyndic8:
        # still no luck, search Syndic8 for feeds matching the URI
        _debuglog('still no luck, searching Syndic8')
        feeds = getFeedsFromSyndic8(uri)
    return feeds


def test():
    uri = 'http://diveintomark.org/tests/client/autodiscovery/html4-001.html'
    failed = []
    count = 0
    while 1:
        data = urllib.urlopen(uri).read()
        if data.find('Atom autodiscovery test') == -1: break
        sys.stdout.write('.')
        count += 1
        links = getLinks(data, uri)
        if not links:
            print '\n*** FAILED ***', uri, 'could not find link'
            failed.append(uri)
        elif len(links) > 1:
            print '\n*** FAILED ***', uri, 'found too many links'
            failed.append(uri)
        else:
            atomdata = urllib.urlopen(links[0]).read()
            if atomdata.find('<link rel="alternate"') == -1:
                print '\n*** FAILED ***', uri, 'retrieved something that is not a feed'
                failed.append(uri)
            else:
                backlink = atomdata.split('href="').pop().split('"')[0]
                if backlink != uri:
                    print '\n*** FAILED ***', uri, 'retrieved wrong feed'
                    failed.append(uri)
        if data.find('<link rel="next" href="') == -1: break
        uri = urlparse.urljoin(uri, data.split('<link rel="next" href="').pop().split('"')[0])
    print
    print count, 'tests executed,', len(failed), 'failed'

if __name__ == '__main__':
    if sys.argv[1:]:
        uri = sys.argv[1]
    else:
        uri = 'http://diveintomark.org/'
    if uri == 'test':
        test()
    else:
        print "\n".join(getFeeds(uri))