0001
0002"""Universal feed parser
0003
0004Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
0005
0006Visit http://feedparser.org/ for the latest version
0007Visit http://feedparser.org/docs/ for the latest documentation
0008
0009Required: Python 2.1 or later
0010Recommended: Python 2.3 or later
0011Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
0012"""
0013
0014
0015__version__ = "3.3"
0016__license__ = "Python"
0017__copyright__ = "Copyright 2002-4, Mark Pilgrim"
0018__author__ = "Mark Pilgrim <http://diveintomark.org/>"
0019__contributors__ = ["Jason Diamond <http://injektilo.org/>",
0020 "John Beimler <http://john.beimler.org/>",
0021 "Fazal Majid <http://www.majid.info/mylos/weblog/>",
0022 "Aaron Swartz <http://aaronsw.com>"]
0023_debug = 0
0024
0025
0026
0027
# User-agent string sent on all HTTP requests unless the caller overrides it.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP Accept header advertising the feed formats we understand,
# in decreasing order of preference.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# SAX parser drivers to try first, in order, before falling back to the
# platform default (see the xml.sax.make_parser call below).
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If non-zero (and mx.Tidy is installed), embedded markup may be run
# through mx.Tidy before further processing.
TIDY_MARKUP = 0
0045
0046
0047import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi
0048try:
0049 from cStringIO import StringIO as _StringIO
0050except:
0051 from StringIO import StringIO as _StringIO
0052
0053
0054
0055
0056try:
0057 import gzip
0058except:
0059 gzip = None
0060try:
0061 import zlib
0062except:
0063 zlib = None
0064
0065
0066
0067
0068
0069try:
0070 import timeoutsocket
0071 timeoutsocket.setDefaultSocketTimeout(20)
0072except ImportError:
0073 import socket
0074 if hasattr(socket, 'setdefaulttimeout'):
0075 socket.setdefaulttimeout(20)
0076import urllib, urllib2
0077
# Optional mx.Tidy support: _mxtidy stays None when TIDY_MARKUP is off
# or the mx.Tidy package is not installed.
_mxtidy = None
if TIDY_MARKUP:
    try:
        from mx.Tidy import Tidy as _mxtidy
    except:
        pass
0084
0085
0086
0087
0088
0089try:
0090 import xml.sax
0091 xml.sax.make_parser(PREFERRED_XML_PARSERS)
0092 from xml.sax.saxutils import escape as _xmlescape
0093 _XML_AVAILABLE = 1
0094except:
0095 _XML_AVAILABLE = 0
0096 def _xmlescape(data):
0097 data = data.replace("&", "&")
0098 data = data.replace(">", ">")
0099 data = data.replace("<", "<")
0100 return data
0101
0102
0103try:
0104 import base64, binascii
0105except:
0106 base64 = binascii = None
0107
0108
0109
0110try:
0111 import cjkcodecs.aliases
0112except:
0113 pass
0114try:
0115 import iconv_codec
0116except:
0117 pass
0118
0119
class CharacterEncodingOverride(Exception):
    """Declared character encoding was overridden during parsing."""

class CharacterEncodingUnknown(Exception):
    """Character encoding of the document could not be determined."""

class NonXMLContentType(Exception):
    """Server returned a Content-Type that is not an XML type."""
0123
# Adjust sgmllib's lexing patterns:
# - tagfind: allow dots, dashes, and colons (namespaced tags) in tag names
# - special: treat only '<!' as a markup-declaration opener
# - charref: recognize hexadecimal character references (&#x..;)
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
0127
# Map of internal version identifiers to human-readable format names;
# the parser's detected version is always one of these keys.
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
0145
try:
    # Python 2.2+: the builtin dict type can serve as our base class
    UserDict = dict
except NameError:
    # Python 2.1: no dict builtin; use UserDict and supply a minimal
    # dict() constructor that builds a mapping from a list of (k, v) pairs
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc
0156
class FeedParserDict(UserDict):
    """Dictionary that maps RSS terminology onto Atom-flavored keys and
    supports attribute-style access (d.title is d['title']).

    Clients may use either vocabulary -- e.g. 'channel'/'feed',
    'items'/'entries', 'guid'/'id' -- interchangeably.
    """
    def __getitem__(self, key):
        # alias map from RSS-style names to the canonical stored keys;
        # 'description' may be stored as either 'tagline' or 'summary'
        keymap = {'channel': 'feed',
                  'items': 'entries',
                  'guid': 'id',
                  'date': 'modified',
                  'date_parsed': 'modified_parsed',
                  'description': ['tagline', 'summary']}
        realkey = keymap.get(key, key)
        if type(realkey) == types.ListType:
            # several candidate keys; return the first one actually present
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def has_key(self, key):
        # true if the name resolves either as an attribute or as a key
        return hasattr(self, key) or UserDict.has_key(self, key)

    def __getattr__(self, key):
        # fall back from attribute access to item access
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __contains__(self, key):
        return self.has_key(key)
0188
def zopeCompatibilityHack():
    """Replace FeedParserDict with a factory returning plain dicts.

    NOTE(review): presumably works around Zope's restrictions on dict
    subclasses with custom attribute access -- confirm against callers.
    The plain dict loses the RSS/Atom key aliases and attribute access.
    """
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc
0197
# Lazily-built 256-byte EBCDIC-to-ASCII translation table.
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    """Translate an EBCDIC byte string to its ASCII equivalent.

    The translation table is built on first use and cached at module
    level for subsequent calls.
    """
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        # ASCII codepoint for each EBCDIC byte value 0-255, in order
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( "".join(map(chr, range(256))), "".join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)
0224
0225class _FeedParserMixin:
    # Canonical prefix for each namespace URI we recognize.  An empty
    # string means elements from that namespace are treated as core feed
    # elements (the unprefixed URIs cover the assorted RSS/Echo/Atom
    # predecessor formats).
    namespaces = {"": "",
                  "http://backend.userland.com/rss": "",
                  "http://blogs.law.harvard.edu/tech/rss": "",
                  "http://purl.org/rss/1.0/": "",
                  "http://my.netscape.com/rdf/simple/0.9/": "",
                  "http://example.com/newformat#": "",
                  "http://example.com/necho": "",
                  "http://purl.org/echo/": "",
                  "uri/of/echo/namespace#": "",
                  "http://purl.org/pie/": "",
                  "http://purl.org/atom/ns#": "",
                  "http://purl.org/rss/1.0/modules/rss091#": "",
                  # extension modules, keyed to the prefix used for their
                  # element handler method names (_start_<prefix>_<tag>)
                  "http://webns.net/mvcb/": "admin",
                  "http://purl.org/rss/1.0/modules/aggregation/": "ag",
                  "http://purl.org/rss/1.0/modules/annotate/": "annotate",
                  "http://media.tangent.org/rss/1.0/": "audio",
                  "http://backend.userland.com/blogChannelModule": "blogChannel",
                  "http://web.resource.org/cc/": "cc",
                  "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
                  "http://purl.org/rss/1.0/modules/company": "co",
                  "http://purl.org/rss/1.0/modules/content/": "content",
                  "http://my.theinfo.org/changed/1.0/rss/": "cp",
                  "http://purl.org/dc/elements/1.1/": "dc",
                  "http://purl.org/dc/terms/": "dcterms",
                  "http://purl.org/rss/1.0/modules/email/": "email",
                  "http://purl.org/rss/1.0/modules/event/": "ev",
                  "http://postneo.com/icbm/": "icbm",
                  "http://purl.org/rss/1.0/modules/image/": "image",
                  "http://xmlns.com/foaf/0.1/": "foaf",
                  "http://freshmeat.net/rss/fm/": "fm",
                  "http://purl.org/rss/1.0/modules/link/": "l",
                  "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
                  "http://prismstandard.org/namespaces/1.2/basic/": "prism",
                  "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
                  "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
                  "http://purl.org/rss/1.0/modules/reference/": "ref",
                  "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
                  "http://purl.org/rss/1.0/modules/search/": "search",
                  "http://purl.org/rss/1.0/modules/slash/": "slash",
                  "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
                  "http://hacks.benhammersley.com/rss/streaming/": "str",
                  "http://purl.org/rss/1.0/modules/subscription/": "sub",
                  "http://purl.org/rss/1.0/modules/syndication/": "sy",
                  "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
                  "http://purl.org/rss/1.0/modules/threading/": "thr",
                  "http://purl.org/rss/1.0/modules/textinput/": "ti",
                  "http://madskills.com/public/xml/rss/module/trackback/":"trackback",
                  "http://wellformedweb.org/CommentAPI/": "wfw",
                  "http://purl.org/rss/1.0/modules/wiki/": "wiki",
                  "http://schemas.xmlsoap.org/soap/envelope/": "soap",
                  "http://www.w3.org/1999/xhtml": "xhtml",
                  "http://www.w3.org/XML/1998/namespace": "xml"
}

    # elements whose text value may be a relative URI needing resolution
    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments', 'license']
    # elements whose embedded markup may contain relative URIs
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description']
    # elements whose embedded markup must be sanitized before output
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description']
    # content types treated as embedded HTML
    html_types = ['text/html', 'application/xhtml+xml']
0285
    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        """Initialize parser state.

        baseuri  -- base URI for resolving relative links
        baselang -- default language (e.g. from Content-Language header)
        encoding -- character encoding of the feed data
        """
        if _debug: sys.stderr.write("initializing FeedParser\n")
        self.feeddata = FeedParserDict() # feed-level metadata
        self.encoding = encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS

        # state flags: what kind of element we are currently inside
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.contentparams = FeedParserDict() # mode/type/language/base of current content
        self.namespacemap = {} # document prefix -> our canonical prefix
        self.elementstack = [] # stack of [element, expectingText, pieces]
        self.basestack = [] # stack of xml:base values in scope
        self.langstack = [] # stack of xml:lang values in scope
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang
0312
    def unknown_starttag(self, tag, attrs):
        """Dispatch an opening tag to its _start_* handler, or push a
        generic element frame when no handler is defined."""
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs: lowercase names; lowercase values of rel= and type=
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = baseuri
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang="" explicitly clears the language
            lang = None
        elif lang is None:
            # no xml:lang attribute: inherit the enclosing language
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(baseuri)
        self.langstack.append(lang)

        # track namespace declarations made on this element
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.get('mode') == 'escaped':
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['mode'] = 'xml'
        if self.incontent and self.contentparams.get('mode') == 'xml':
            # inside content being reconstructed as inline XML: re-emit the
            # start tag as literal text instead of dispatching it
            tag = tag.split(':')[-1]
            return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)

        # split the tag into (namespace prefix, local name) and map the
        # prefix onto our canonical prefix
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # leave textinput/image tracking as soon as we see an element that
        # cannot belong inside them (helps with ill-formed feeds)
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'width', 'height'):
            self.inimage = 0

        # call the specific handler if one is defined, else the default
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)
0382
    def unknown_endtag(self, tag):
        """Dispatch a closing tag to its _end_* handler, or pop the
        generic element frame when no handler is defined."""
        if _debug: sys.stderr.write('end %s\n' % tag)
        # split the tag and map the namespace prefix, as in unknown_starttag
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call the specific handler if one is defined, else the default
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.get('mode') == 'escaped':
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['mode'] = 'xml'
        if self.incontent and self.contentparams.get('mode') == 'xml':
            # re-emit the end tag as literal text inside reconstructed XML
            tag = tag.split(':')[-1]
            self.handle_data("</%s>" % tag, escape=0)

        # restore the xml:base and xml:lang that were in scope before
        # this element opened
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack:
                self.lang = self.langstack[-1]
0419
    def handle_charref(self, ref):
        """Buffer a numeric character reference (&#nnn; or &#xhh;)."""
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            # references to markup-significant characters stay encoded
            text = "&#%s;" % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        """Buffer a named entity reference (&name;)."""
        if not self.elementstack: return
        if _debug: sys.stderr.write("entering handle_entityref with %s\n" % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            # entities for markup-significant characters stay encoded
            text = '&%s;' % ref
        else:
            # resolve any other named entity through htmlentitydefs
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, "name2codepoint"): # Python 2.3+
                    return htmlentitydefs.name2codepoint[k]
                # older Pythons: decode the entity definition ourselves
                k = htmlentitydefs.entitydefs[k]
                if k.startswith("&#") and k.endswith(";"):
                    return int(k[2:-1]) # not a latin-1 character
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = "&%s;" % ref # unknown entity: pass through
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)
0454
0455 def handle_data(self, text, escape=1):
0456
0457
0458 if not self.elementstack: return
0459 if escape and self.contentparams.get('mode') == 'xml':
0460 text = _xmlescape(text)
0461 self.elementstack[-1][2].append(text)
0462
    def handle_comment(self, text):
        # comments are discarded
        pass

    def handle_pi(self, text):
        # processing instructions are discarded
        pass

    def handle_decl(self, text):
        # markup declarations are discarded
        pass

    def parse_declaration(self, i):
        """Override sgmllib's declaration handling to support CDATA
        sections; other declarations are skipped wholesale."""
        if _debug: sys.stderr.write("entering parse_declaration\n")
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata) # unterminated CDATA: take the rest
            # CDATA content is literal text; escape it before buffering
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1
0485
    def trackNamespace(self, prefix, uri):
        """Record a namespace declaration, mapping the document's prefix
        onto our canonical prefix for that namespace (see self.namespaces).
        Also uses default-namespace declarations to detect RSS 0.90/1.0."""
        if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if uri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if not prefix: return
        if uri.find('backend.userland.com/rss') <> -1:
            # normalize all Userland RSS namespace variants to one URI
            uri = 'http://backend.userland.com/rss'
        if self.namespaces.has_key(uri):
            self.namespacemap[prefix] = self.namespaces[uri]

    def resolveURI(self, uri):
        # resolve a possibly-relative URI against the current xml:base
        return urlparse.urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        # hook for subclasses; the strict parser already receives decoded
        # entities from the SAX layer, so the default is a no-op
        return data
0503
    def push(self, element, expectingText):
        # open an element frame: [name, whether text is expected, text pieces]
        self.elementstack.append([element, expectingText, []])

    def pop(self, element):
        """Close the current element frame, post-process its accumulated
        text (base64, relative URIs, sanitizing, unicode), and store it
        on the current entry or the feed.  Returns the processed text."""
        if not self.elementstack: return
        # ignore a close that doesn't match the innermost open element
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = "".join(pieces)
        output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if self.contentparams.get('mode') == 'base64' and base64:
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass # leave undecodable content as-is
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        output = self.decodeEntities(element, output)

        # resolve relative URIs within embedded markup
        if self.contentparams.get('type', 'text/html') in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.contentparams.get('type', 'text/html') in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        # promote byte strings to unicode using the detected encoding
        if self.encoding and (type(output) == types.StringType):
            try:
                output = unicode(output, self.encoding)
            except:
                pass # keep the byte string if it won't decode

        # store output in the appropriate place(s)
        if self.inentry:
            if element == 'content':
                # an entry may have multiple content elements
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'category':
                self.entries[-1][element] = output
                # fill in the value of the (domain, value) pair created
                # by _start_category
                domain = self.entries[-1]['categories'][-1][0]
                self.entries[-1]['categories'][-1] = (domain, output)
            elif element == 'source':
                self.entries[-1]['source']['value'] = output
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif self.infeed and (not self.intextinput) and (not self.inimage):
            if element == 'description':
                element = 'tagline'
            self.feeddata[element] = output
            if element == 'category':
                domain = self.feeddata['categories'][-1][0]
                self.feeddata['categories'][-1] = (domain, output)
            elif element == 'link':
                self.feeddata['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.feeddata[element + '_detail'] = contentparams
        return output
0587
    def _mapToStandardPrefix(self, name):
        # rewrite 'prefix:suffix' using our canonical namespace prefix
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        # attribute lookup that honors namespace prefix remapping
        return attrsD.get(self._mapToStandardPrefix(name))

    def _save(self, key, value):
        # store a value on the current entry (or, failing that, the feed)
        # without overwriting an existing value
        if self.inentry:
            self.entries[-1].setdefault(key, value)
        elif self.feeddata:
            self.feeddata.setdefault(key, value)
0605
0606 def _start_rss(self, attrsD):
0607 versionmap = {'0.91': 'rss091u',
0608 '0.92': 'rss092',
0609 '0.93': 'rss093',
0610 '0.94': 'rss094'}
0611 if not self.version:
0612 attr_version = attrsD.get('version', '')
0613 version = versionmap.get(attr_version)
0614 if version:
0615 self.version = version
0616 elif attr_version.startswith('2.'):
0617 self.version = 'rss20'
0618 else:
0619 self.version = 'rss'
0620
    def _start_dlhottitles(self, attrsD):
        # <dlhottitles> marks a Hot RSS feed
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        """CDF carries metadata in attributes; simulate start/data/end
        events for the ones we care about (lastmod, href)."""
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        # <feed> is the Atom root element; its version attribute tells us
        # which Atom draft we are parsing
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel
0655
    # --- image / textinput / author / contributor containers ---

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        # reconcile the plain author string with author_detail
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0
0703
    def _start_name(self, attrsD):
        self.push('name', 0)

    def _end_name(self):
        # <name> means different things depending on the enclosing element
        value = self.pop('name')
        if self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0 # non-numeric width
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0 # non-numeric height
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('url', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        # <url> likewise depends on its enclosing element
        value = self.pop('url')
        if self.inauthor:
            self._save_author('url', value)
        elif self.incontributor:
            self._save_contributor('url', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['url'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url
0762
    def _start_email(self, attrsD):
        self.push('email', 0)

    def _end_email(self):
        value = self.pop('email')
        if self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
        pass

    def _getContext(self):
        # the current target for metadata: the open entry if there is
        # one, otherwise the feed itself
        if self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context
0780
    def _save_author(self, key, value):
        # store one field of the structured author information, then
        # rebuild the plain author string from it
        context = self._getContext()
        context.setdefault('author_detail', FeedParserDict())
        context['author_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        # store one field on the most recently opened contributor
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        """Keep the plain '<key>' string and the structured '<key>_detail'
        dict consistent, deriving whichever one is missing."""
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            # build "Name (email)" from the structured detail
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = "%s (%s)" % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            # try to split an email address out of the plain author string
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # remove the email and any leftover punctuation from the name
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email
0822
    def _start_tagline(self, attrsD):
        # tagline is a content construct: record its mode/type/language/base
        self.incontent += 1
        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                             'type': attrsD.get('type', 'text/plain'),
                             'language': self.lang,
                             'base': self.baseuri})
        self.push('tagline', 1)
    _start_subtitle = _start_tagline

    def _end_tagline(self):
        value = self.pop('tagline')
        self.incontent -= 1
        self.contentparams.clear()
        if self.infeed:
            # mirror the tagline as an RSS-style description
            self.feeddata['description'] = value
    _end_subtitle = _end_tagline

    def _start_copyright(self, attrsD):
        # copyright is a content construct as well
        self.incontent += 1
        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                             'type': attrsD.get('type', 'text/plain'),
                             'language': self.lang,
                             'base': self.baseuri})
        self.push('copyright', 1)
    _start_dc_rights = _start_copyright

    def _end_copyright(self):
        self.pop('copyright')
        self.incontent -= 1
        self.contentparams.clear()
    _end_dc_rights = _end_copyright
0854
    def _start_item(self, attrsD):
        # open a new entry
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        # RSS 1.0 items carry their identifier in rdf:about
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        # an explicit language element also becomes the current language
        self.lang = self.pop('language')
    _end_language = _end_dc_language
0880
    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        # derive publisher_detail from the plain publisher string
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_dcterms_issued(self, attrsD):
        self.push('issued', 1)
    _start_issued = _start_dcterms_issued

    def _end_dcterms_issued(self):
        # store both the raw date string and its parsed form
        value = self.pop('issued')
        self._save('issued_parsed', _parse_date(value))
    _end_issued = _end_dcterms_issued

    def _start_dcterms_created(self, attrsD):
        self.push('created', 1)
    _start_created = _start_dcterms_created

    def _end_dcterms_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_created = _end_dcterms_created
0907
    def _start_dcterms_modified(self, attrsD):
        self.push('modified', 1)
    _start_modified = _start_dcterms_modified
    _start_dc_date = _start_dcterms_modified
    _start_pubdate = _start_dcterms_modified

    def _end_dcterms_modified(self):
        value = self.pop('modified')
        parsed_value = _parse_date(value)
        self._save('modified_parsed', parsed_value)
    _end_modified = _end_dcterms_modified
    _end_dc_date = _end_dcterms_modified
    _end_pubdate = _end_dcterms_modified

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')))

    def _start_cc_license(self, attrsD):
        # the license URI lives in rdf:resource; the element has no
        # character data, so push and pop immediately
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('license')
0934
    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)

    def _end_creativecommons_license(self):
        self.pop('license')

    def _start_category(self, attrsD):
        self.push('category', 1)
        domain = self._getAttribute(attrsD, 'domain')
        cats = []
        if self.inentry:
            cats = self.entries[-1].setdefault('categories', [])
        elif self.infeed:
            cats = self.feeddata.setdefault('categories', [])
        # record (domain, None); pop() fills in the value later
        cats.append((domain, None))
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_category(self):
        self.pop('category')
    _end_dc_subject = _end_category
    _end_keywords = _end_category

    def _start_cloud(self, attrsD):
        # <cloud> carries all its data in attributes
        self.feeddata['cloud'] = FeedParserDict(attrsD)
0960
    def _start_link(self, attrsD):
        """Handle RSS <link> and Atom <link rel= type= href=> elements."""
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry
        # every link is appended to the 'links' list of the entry or feed
        if self.inentry:
            self.entries[-1].setdefault('links', [])
            self.entries[-1]['links'].append(FeedParserDict(attrsD))
        elif self.infeed:
            self.feeddata.setdefault('links', [])
            self.feeddata['links'].append(FeedParserDict(attrsD))
        if attrsD.has_key('href'):
            # Atom-style link: the URI is in the href attribute, so the
            # element carries no text; an HTML alternate also becomes the
            # primary 'link'
            expectingText = 0
            if attrsD.get('type', '') in self.html_types:
                if self.inentry:
                    self.entries[-1]['link'] = attrsD['href']
                elif self.infeed:
                    self.feeddata['link'] = attrsD['href']
        else:
            # RSS-style link: the URI is the element's text content
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        if self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
        if self.inimage:
            context = self._getContext()
            context['image']['link'] = value
    _end_producturl = _end_link
0993
    def _start_guid(self, attrsD):
        # RSS <guid isPermaLink="true"> (the default) doubles as the link
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as the link, but only if the item doesn't already
            # have a link element (_save won't overwrite an existing one)
            self._save('link', value)

    def _start_id(self, attrsD):
        self.push('id', 1)

    def _end_id(self):
        value = self.pop('id')
1011
    def _start_title(self, attrsD):
        # title is a content construct: record its mode/type/language/base
        self.incontent += 1
        if _debug: sys.stderr.write('attrsD.xml:lang = %s\n' % attrsD.get('xml:lang'))
        if _debug: sys.stderr.write('self.lang = %s\n' % self.lang)
        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                             'type': attrsD.get('type', 'text/plain'),
                             'language': self.lang,
                             'base': self.baseuri})
        # only expect text at feed/entry level, not inside image/textinput
        self.push('title', self.infeed or self.inentry)
    _start_dc_title = _start_title

    def _end_title(self):
        value = self.pop('title')
        self.incontent -= 1
        self.contentparams.clear()
        if self.intextinput:
            context = self._getContext()
            context['textinput']['title'] = value
        elif self.inimage:
            context = self._getContext()
            context['image']['title'] = value
    _end_dc_title = _end_title
1034
    def _start_description(self, attrsD, default_content_type='text/html'):
        # description defaults to HTML content (unlike most constructs)
        self.incontent += 1
        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                             'type': attrsD.get('type', default_content_type),
                             'language': self.lang,
                             'base': self.baseuri})
        self.push('description', self.infeed or self.inentry)

    def _start_abstract(self, attrsD):
        # <abstract> is treated as a plain-text description
        return self._start_description(attrsD, 'text/plain')

    def _end_description(self):
        value = self.pop('description')
        self.incontent -= 1
        self.contentparams.clear()
        context = self._getContext()
        if self.intextinput:
            context['textinput']['description'] = value
        elif self.inimage:
            context['image']['description'] = value
    _end_abstract = _end_description
1060
1061 def _start_info(self, attrsD):
1062 self.incontent += 1
1063 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1064 'type': attrsD.get('type', 'text/plain'),
1065 'language': self.lang,
1066 'base': self.baseuri})
1067 self.push('info', 1)
1068
    def _end_info(self):
        # Close <info> and reset content-tracking state set by _start_info.
        self.pop('info')
        self.incontent -= 1
        self.contentparams.clear()
1073
    def _start_generator(self, attrsD):
        if attrsD:
            # Resolve a relative generator URL against the document base
            # before storing the attribute dict as generator_detail.
            if attrsD.has_key('url'):
                attrsD['url'] = self.resolveURI(attrsD['url'])
            self.feeddata['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)
1080
    def _end_generator(self):
        value = self.pop('generator')
        # Merge the element text into the detail dict from _start_generator.
        if self.feeddata.has_key('generator_detail'):
            self.feeddata['generator_detail']['name'] = value
1085
    def _start_admin_generatoragent(self, attrsD):
        # admin:generatorAgent carries its value in rdf:resource rather than
        # in element text, so push, inject the value, and pop immediately.
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self.feeddata['generator_detail'] = FeedParserDict({"url": value})
1093
    def _start_admin_errorreportsto(self, attrsD):
        # Like admin:generatorAgent, the value lives in rdf:resource.
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
1100
1101 def _start_summary(self, attrsD):
1102 self.incontent += 1
1103 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1104 'type': attrsD.get('type', 'text/plain'),
1105 'language': self.lang,
1106 'base': self.baseuri})
1107 self.push('summary', 1)
1108
1109 def _end_summary(self):
1110 value = self.pop('summary')
1111 if self.entries:
1112 self.entries[-1]['description'] = value
1113 self.incontent -= 1
1114 self.contentparams.clear()
1115
1116 def _start_enclosure(self, attrsD):
1117 if self.inentry:
1118 self.entries[-1].setdefault('enclosures', [])
1119 self.entries[-1]['enclosures'].append(FeedParserDict(attrsD))
1120
1121 def _start_source(self, attrsD):
1122 if self.inentry:
1123 self.entries[-1]['source'] = FeedParserDict(attrsD)
1124 self.push('source', 1)
1125
    def _end_source(self):
        # Discard collected text; _start_source already stored the attributes.
        self.pop('source')
1128
1129 def _start_content(self, attrsD):
1130 self.incontent += 1
1131 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
1132 'type': attrsD.get('type', 'text/plain'),
1133 'language': self.lang,
1134 'base': self.baseuri})
1135 self.push('content', 1)
1136
1137 def _start_prodlink(self, attrsD):
1138 self.incontent += 1
1139 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
1140 'type': attrsD.get('type', 'text/html'),
1141 'language': self.lang,
1142 'base': self.baseuri})
1143 self.push('content', 1)
1144
1145 def _start_body(self, attrsD):
1146 self.incontent += 1
1147 self.contentparams = FeedParserDict({'mode': 'xml',
1148 'type': 'application/xhtml+xml',
1149 'language': self.lang,
1150 'base': self.baseuri})
1151 self.push('content', 1)
1152 _start_xhtml_body = _start_body
1153
1154 def _start_content_encoded(self, attrsD):
1155 self.incontent += 1
1156 self.contentparams = FeedParserDict({'mode': 'escaped',
1157 'type': 'text/html',
1158 'language': self.lang,
1159 'base': self.baseuri})
1160 self.push('content', 1)
1161 _start_fullitem = _start_content_encoded
1162
1163 def _end_content(self):
1164 value = self.pop('content')
1165 if self.contentparams.get('type') in (['text/plain'] + self.html_types):
1166 self._save('description', value)
1167 self.incontent -= 1
1168 self.contentparams.clear()
1169 _end_body = _end_content
1170 _end_xhtml_body = _end_content
1171 _end_content_encoded = _end_content
1172 _end_fullitem = _end_content
1173 _end_prodlink = _end_content
1174
# The strict parser is only defined when Python's SAX machinery (and a
# usable driver) imported successfully at module load time.
if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        """SAX-based feed parser, tried first before the loose SGML fallback."""

        def __init__(self, baseuri, baselang, encoding):
            if _debug: sys.stderr.write('trying StrictFeedParser\n')
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            # bozo/exc record whether the document was well-formed, and why not.
            self.bozo = 0
            self.exc = None

        def startPrefixMapping(self, prefix, uri):
            self.trackNamespace(prefix, uri)

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            namespace = str(namespace or '')
            # Normalize any variant of the Userland RSS namespace to one URI.
            if namespace.find('backend.userland.com/rss') <> -1:
                namespace = 'http://backend.userland.com/rss'
            prefix = self.namespaces.get(namespace, 'unknown')
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()

            # Match attribute namespaces against the known-prefix table first,
            # then overlay whatever qnames the SAX driver reports (some drivers
            # report none at all).
            # NOTE(review): relies on the non-public attrs._attrs mapping --
            # confirm this against the SAX driver actually in use.
            attrsD = {}
            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
                prefix = self.namespaces.get(namespace, '')
                if prefix:
                    attrlocalname = prefix + ":" + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            namespace = str(namespace)
            prefix = self.namespaces.get(namespace, '')
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            # Non-fatal parse error: flag the feed as bozo but keep going.
            self.bozo = 1
            self.exc = exc

        def fatalError(self, exc):
            self.error(exc)
            raise exc
1237
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    """SGML-based pass-through HTML processor.

    Re-emits everything it parses into self.pieces; subclasses override the
    handle_*/unknown_* methods to rewrite or filter markup on the way through.
    """

    # HTML elements with no end tag; re-emitted in XHTML style ("<br />").
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        # Accumulated output fragments; joined together by output().
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def feed(self, data):
        # Neutralize bogus SGML declarations (anything other than DOCTYPE,
        # comments, or marked sections) so sgmllib doesn't choke on them.
        # BUGFIX: the replacement must be the *escaped* '&lt;!' -- the entity
        # had been decoded to a literal '<!', turning this into a no-op.
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        # Expand XML-style empty tags ("<br/>") into start/end pairs.
        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
        # Normalize numeric quote references to literal quote characters.
        # BUGFIX: restore the '&#39;'/'&#34;' source strings, which had been
        # decoded to the quote characters themselves (no-op replaces).
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and (type(data) == types.UnicodeType):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # Lowercase attribute names; lowercase the values of rel/type too.
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # attrs is a list of (attr, value) tuples; reconstruct the original
        # start tag, using XHTML style for empty elements.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        if tag in self.elements_no_end_tag:
            self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
        else:
            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    def unknown_endtag(self, tag):
        # Reconstruct the end tag, except for empty elements.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # Re-emit a numeric character reference unchanged, e.g. "&#160;".
        self.pieces.append("&#%(ref)s;" % locals())

    def handle_entityref(self, ref):
        # Re-emit a named entity reference unchanged, e.g. "&copy;".
        self.pieces.append("&%(ref)s;" % locals())

    def handle_data(self, text):
        # Pass text content through verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # Re-emit an HTML comment unchanged.
        self.pieces.append("<!--%(text)s-->" % locals())

    def handle_pi(self, text):
        # Re-emit a processing instruction unchanged.
        self.pieces.append("<?%(text)s>" % locals())

    def handle_decl(self, text):
        # Re-emit a DOCTYPE or other declaration unchanged.
        self.pieces.append("<!%(text)s>" % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        # More permissive replacement for sgmllib's declaration-name scanner;
        # also accepts dots and colons in declaration names.
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # name runs to end of buffer; need more data
            return name.lower(), m.end()
        else:
            # Unparseable declaration: emit the raw data and give up on it.
            self.handle_data(rawdata)
            return None, -1

    def output(self):
        """Return processed HTML as a single string"""
        return "".join([str(p) for p in self.pieces])
1341
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    """SGML-based fallback parser used when strict XML parsing fails."""

    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        """Normalize entity references in element text.

        Numeric references for the five XML special characters are folded to
        their named forms; in 'escaped' content mode the named forms are then
        decoded to literal characters.

        BUGFIX: the entity source/target strings below had been decoded to
        literal characters during an earlier transformation, turning every
        replace into a no-op; restored to the intended entity strings.
        """
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.get('mode') == 'escaped':
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
1365
class _RelativeURIResolver(_BaseHTMLProcessor):
    """Rewrites relative URI attribute values in HTML to absolute URIs."""

    # (tag, attribute) pairs whose values are URIs that need resolving.
    relative_uris = [
        ('a', 'href'), ('applet', 'codebase'), ('area', 'href'),
        ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'),
        ('form', 'action'), ('frame', 'longdesc'), ('frame', 'src'),
        ('iframe', 'longdesc'), ('iframe', 'src'), ('head', 'profile'),
        ('img', 'longdesc'), ('img', 'src'), ('img', 'usemap'),
        ('input', 'src'), ('input', 'usemap'), ('ins', 'cite'),
        ('link', 'href'), ('object', 'classid'), ('object', 'codebase'),
        ('object', 'data'), ('object', 'usemap'), ('q', 'cite'),
        ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # Resolution is always relative to the document base URI.
        return urlparse.urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        resolved = []
        for key, value in attrs:
            if (tag, key) in self.relative_uris:
                value = self.resolveURI(value)
            resolved.append((key, value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, resolved)
1404
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    """Run htmlSource through a _RelativeURIResolver and return the result."""
    if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
    resolver = _RelativeURIResolver(baseURI, encoding)
    resolver.feed(htmlSource)
    return resolver.output()
1410
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Strip potentially dangerous markup, keeping only whitelisted elements
    and attributes; content inside blocked script/applet elements is dropped
    entirely."""

    # Whitelist of elements allowed through the sanitizer.
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    # Whitelist of attributes allowed on the acceptable elements.
    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    # Elements whose entire content must be suppressed, not just their tags.
    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        # Nesting depth of currently-open unacceptable elements.
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        # Drop non-whitelisted tags; for script/applet, also start
        # suppressing their text content.
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # Processing instructions are dropped outright.
        pass

    def handle_decl(self, text):
        # Declarations are dropped outright.
        pass

    def handle_data(self, text):
        # Text is only emitted when not inside a blocked element.
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
1463
def _sanitizeHTML(htmlSource, encoding):
    """Strip unacceptable tags and attributes from HTML; optionally run the
    result through mx.Tidy when TIDY_MARKUP is enabled."""
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if _mxtidy and TIDY_MARKUP:
        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
        # mx.Tidy returns a complete XHTML document; keep only the contents
        # between <body ...> and </body>.
        if data.count('<body'):
            data = data.split('<body', 1)[1]
            if data.count('>'):
                data = data.split('>', 1)[1]
        if data.count('</body'):
            data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
1478
class _FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 handler that records the HTTP status code on the returned
    file-like object and follows 3xx redirects (except 304 Not Modified)."""

    def http_error_default(self, req, fp, code, msg, headers):
        # Treat any 3xx other than 304 as a redirect; for everything else,
        # return the response with its status attached instead of raising.
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        # Only follow the redirect when a Location header is present.
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        # Same as 302, but urllib2 caches 301 redirects internally.
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    # All other redirect flavors are handled like 302.
    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302
1508
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, "read"):
        # Already a file-like object; hand it back untouched.
        return url_file_stream_or_string

    if url_file_stream_or_string == "-":
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # try to open with urllib2 (to use optional headers)
        auth = None
        if base64:
            # Strip an embedded user:password from the URL and convert it
            # into an HTTP Basic auth header instead.
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = "%s://%s%s" % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header("User-Agent", agent)
        if etag:
            request.add_header("If-None-Match", etag)
        if modified:
            # Format an RFC 1123-compliant timestamp by hand; time.strftime()
            # can't be used because %a/%b are locale-dependent, and RFC 2616
            # requires English day/month names.
            short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
            months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
            request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header("Referer", referrer)
        # Advertise whichever decompression schemes are actually available.
        if gzip and zlib:
            request.add_header("Accept-encoding", "gzip, deflate")
        elif gzip:
            request.add_header("Accept-encoding", "gzip")
        elif zlib:
            request.add_header("Accept-encoding", "deflate")
        else:
            request.add_header("Accept-encoding", "")
        if auth:
            request.add_header("Authorization", "Basic %s" % auth)
        if ACCEPT_HEADER:
            request.add_header("Accept", ACCEPT_HEADER)
        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
        opener.addheaders = []  # cleared so only our custom headers are sent
        try:
            return opener.open(request)
        finally:
            opener.close()

    # try to open with native open function (if the argument is a filename)
    try:
        return open(url_file_stream_or_string)
    except:
        pass

    # treat url_file_stream_or_string as the feed data itself
    return _StringIO(str(url_file_stream_or_string))
1597
# Registered date-parsing handlers, most-recently-registered first.
_date_handlers = []
def registerDateHandler(func):
    """Register a date handler function (takes string, returns 9-tuple date in GMT)"""
    # Newly registered handlers take priority over the built-in ones.
    _date_handlers.insert(0, func)
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
# ISO 8601 date templates, most specific first.  YYYY/YY = year digits,
# MM = month, DD = day, OOO = ordinal day-of-year, CC = century; '-?'
# marks an optional separator.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
# Expand each template into a regex with named groups, followed by an
# optional time-of-day and timezone designator.
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
del tmpl  # Python 2 list comprehensions leak their loop variable
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
del regex
def _parse_date_iso8601(dateString):
    """Parse a variety of ISO-8601-compatible formats like 20040105"""
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get("ordinal", 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get("year", "--")
    if not year or year == "--":
        year = time.gmtime()[0]
    elif len(year) == 2:
        # two-digit year: assume it belongs to the current century
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get("month", "-")
    if not month or month == "-":
        # ordinal (day-of-year) dates imply January here; the ordinal is
        # normalized into a real month/day by mktime() below
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get("day", 0)
    if not day:
        # missing day: use the ordinal if given, the 1st when any other
        # date component was given, otherwise today
        if ordinal:
            day = ordinal
        elif params.get("century", 0) or params.get("year", 0) or params.get("month", 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # a century like "19" means the first year of that century (1801 etc.)
    if "century" in params.keys():
        year = (int(params["century"]) - 1) * 100 + 1
    # normalize any missing time fields to zero
    for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get("hour", 0))
    minute = int(params.get("minute", 0))
    second = int(params.get("second", 0))
    # weekday is unknown here; mktime() will normalize it
    weekday = 0
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # shift hours/minutes by the timezone offset to convert toward GMT;
    # mktime() normalizes any resulting out-of-range values
    tz = params.get("tz")
    if tz and tz != "Z":
        if tz[0] == "-":
            tm[3] += int(params.get("tzhour", 0))
            tm[4] += int(params.get("tzmin", 0))
        elif tz[0] == "+":
            tm[3] -= int(params.get("tzhour", 0))
            tm[4] -= int(params.get("tzmin", 0))
        else:
            return None
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
1712
1713
# Hangul tokens used by Korean blogging services in their date strings.
_korean_year = u'\ub144'   # "year"
_korean_month = u'\uc6d4'  # "month"
_korean_day = u'\uc77c'    # "day"
_korean_am = u'\uc624\uc804'  # "AM"
_korean_pm = u'\uc624\ud6c4'  # "PM"

# OnBlog: "YYYY[year] MM[month] DD[day] HH:MM:SS"
_korean_onblog_date_re = re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % (_korean_year, _korean_month, _korean_day))
# Nate: "YYYY-MM-DD [AM|PM] H:M:S" (time fields may be 1 or 2 digits)
_korean_nate_date_re = re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    """Parse a string according to the OnBlog 8-bit date format"""
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    # Rebuild as W3DTF with the KST (UTC+9) offset and delegate.
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % {'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6), 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("OnBlog date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
1737
def _parse_date_nate(dateString):
    """Parse a string according to the Nate 8-bit date format"""
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    # Convert the 12-hour AM/PM clock to 24-hour, zero-padded.
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    # Rebuild as W3DTF with the KST (UTC+9) offset and delegate.
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % {'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': hour, 'minute': m.group(6), 'second': m.group(7), 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("Nate date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
1756
# MS SQL style: "YYYY-MM-DD HH:MM:SS.fraction" (fraction is discarded).
_mssql_date_re = re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})\.\d+')
def _parse_date_mssql(dateString):
    """Parse a string according to the MS SQL date format"""
    m = _mssql_date_re.match(dateString)
    if not m: return
    # NOTE(review): assumes a fixed +09:00 offset, same as the Korean
    # handlers -- confirm this is intentional for MS SQL dates.
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % {'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6), 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("MS SQL date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
1770
1771
# Greek month-name abbreviations (with accent variants) mapped to the
# English abbreviations understood by the RFC 822 parser.
_greek_months = { u'\u0399\u03b1\u03bd': u'Jan',
  u'\u03a6\u03b5\u03b2': u'Feb',
  u'\u039c\u03ac\u03ce': u'Mar',
  u'\u039c\u03b1\u03ce': u'Mar',
  u'\u0391\u03c0\u03c1': u'Apr',
  u'\u039c\u03ac\u03b9': u'May',
  u'\u039c\u03b1\u03ca': u'May',
  u'\u039c\u03b1\u03b9': u'May',
  u'\u0399\u03bf\u03cd\u03bd': u'Jun',
  u'\u0399\u03bf\u03bd': u'Jun',
  u'\u0399\u03bf\u03cd\u03bb': u'Jul',
  u'\u0399\u03bf\u03bb': u'Jul',
  u'\u0391\u03cd\u03b3': u'Aug',
  u'\u0391\u03c5\u03b3': u'Aug',
  u'\u03a3\u03b5\u03c0': u'Sep',
  u'\u039f\u03ba\u03c4': u'Oct',
  u'\u039d\u03bf\u03ad': u'Nov',
  u'\u039d\u03bf\u03b5': u'Nov',
  u'\u0394\u03b5\u03ba': u'Dec',
  }
1794
# Greek weekday abbreviations mapped to English ones for the RFC 822 parser.
_greek_wdays = { u'\u039a\u03c5\u03c1': u'Sun',
  u'\u0394\u03b5\u03c5': u'Mon',
  u'\u03a4\u03c1\u03b9': u'Tue',
  u'\u03a4\u03b5\u03c4': u'Wed',
  u'\u03a0\u03b5\u03bc': u'Thu',
  u'\u03a0\u03b1\u03c1': u'Fri',
  u'\u03a3\u03b1\u03b2': u'Sat',
  }
1805
# "Weekday, DD Month YYYY HH:MM:SS zone", with Greek weekday/month names.
_greek_date_format_re = re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    """Parse a string according to a Greek 8-bit date format."""
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        # Translate Greek weekday/month names into their English equivalents.
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    # Rebuild as an RFC 822 date and delegate to that handler.
    rfc822date = "%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s" % {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4), 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7), 'zonediff': m.group(8)}
    if _debug: sys.stderr.write("Greek date parsed as: %s\n" % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
1825
1826
# Hungarian month names mapped to two-digit month numbers.
# NOTE(review): u'febru\u00e1ri' and u'm\u00e1ujus' look misspelled
# (expected u'febru\u00e1r' / u'm\u00e1jus') -- confirm against real-world
# feed data before changing, since keys must match the source text exactly.
_hungarian_months = { u'janu\u00e1r': u'01',
  u'febru\u00e1ri': u'02',
  u'm\u00e1rcius': u'03',
  u'\u00e1prilis': u'04',
  u'm\u00e1ujus': u'05',
  u'j\u00fanius': u'06',
  u'j\u00falius': u'07',
  u'augusztus': u'08',
  u'szeptember': u'09',
  u'okt\u00f3ber': u'10',
  u'november': u'11',
  u'december': u'12',
  }
1842
# "YYYY-monthname-D(T)H:MM+ZZ:ZZ", with a Hungarian month name.
_hungarian_date_format_re = re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    """Parse a string according to a Hungarian 8-bit date format."""
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        # Translate the month name and zero-pad one-digit day/hour fields.
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    # Rebuild as W3DTF (no seconds in this format) and delegate.
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s" % {'year': m.group(1), 'month': month, 'day': day, 'hour': hour, 'minute': m.group(5), 'zonediff': m.group(6)}
    if _debug: sys.stderr.write("Hungarian date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
1867
1868
1869
1870
1871
def _parse_date_w3dtf(dateString):
    """Parse a W3C Date and Time Format (profile of ISO 8601) string into a
    9-tuple in GMT, or None if it doesn't match."""

    def __extract_date(m):
        # Returns (year, month, day); (0, 0, 0) signals an unusable date.
        year = int(m.group("year"))
        if year < 100:
            # two-digit year: assume the current century
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group("julian")
        if julian:
            # Day-of-year form: start from a rough month/day estimate, then
            # iterate with mktime/gmtime until the day-of-year matches.
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group("month")
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group("day")
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        # Returns (hours, minutes, seconds), defaulting to midnight.
        if not m:
            return 0, 0, 0
        hours = m.group("hours")
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group("minutes"))
        seconds = m.group("seconds")
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        """Return the Time Zone Designator as an offset in seconds from UTC."""
        if not m:
            return 0
        tzd = m.group("tzd")
        if not tzd:
            return 0
        if tzd == "Z":
            return 0
        hours = int(m.group("tzdhours"))
        minutes = m.group("tzdminutes")
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        # Offset is returned with the sign needed to convert local to UTC.
        if tzd[0] == "+":
            return -offset
        return offset

    # date: YYYY[-]DDD (day-of-year) or YYYY[-]MM[[-]DD]
    __date_re = ("(?P<year>\d\d\d\d)"
                 "(?:(?P<dsep>-|)"
                 "(?:(?P<julian>\d\d\d)"
                 "|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?")
    __tzd_re = "(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)"
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ("(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)"
                 "(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?"
                 + __tzd_re)
    __datetime_re = "%s(?:T%s)?" % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    # Require a full match of the entire input string.
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
1966
def _parse_date_rfc822(dateString):
    """Parse an RFC822, RFC1123, RFC2822, or asctime-style date"""
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))

# rfc822.py's built-in timezone table lacks some common North American
# abbreviations; teach it these before registering the handler.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
1977
def _parse_date(dateString):
    """Parses a variety of date formats into a 9-tuple in GMT"""
    # Try each registered handler in order; the first one that returns a
    # valid 9-tuple of integers wins.
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write("date handler function must return 9-tuple\n")
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            # A failing handler just means "not my format"; try the next one.
            if _debug: sys.stderr.write("%s raised %s\n" % (handler.__name__, repr(e)))
            pass
    return None
1993
def _getCharacterEncoding(http_headers, xml_data):
    """Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ("XML Media Types"), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to "utf-8" if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to "us-ascii" if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    "iso-8859-1" as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    """

    def _parseHTTPContentType(content_type):
        """takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        """
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", "")

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type"))
    # Sniff non-ASCII-compatible encodings (per section F of the XML spec)
    # and transcode to UTF-8 so that the regex below can read the XML
    # declaration.  Any decoding error just means "no declaration found".
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE, no BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM (the byte 2-3 check excludes a UTF-32 BOM)
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE, no BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM (the byte 2-3 check excludes a UTF-32LE BOM)
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE, no BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE, no BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        # If the declared encoding is merely a byte-order-agnostic alias
        # of what we sniffed (e.g. "utf-16" when we detected a specific
        # byte order), trust the more specific sniffed encoding.
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        # application/xml family: HTTP charset wins, then XML declaration
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        # text/xml family: the XML declaration is ignored per RFC 3023
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        # any other text/* type: us-ascii default, and NOT acceptable
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        # HTTP response with no Content-Type at all
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        # no HTTP headers (local file) or unrecognized Content-Type
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
2129
def _toUTF8(data, encoding):
    """Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    """
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # Detect a Unicode byte order mark.  A BOM both overrides the
    # caller-supplied encoding and is stripped before decoding.  The two
    # UTF-16 checks require bytes 2-3 to be non-NUL so that a 4-byte
    # UTF-32 BOM is not mistaken for a 2-byte UTF-16 one.
    bom_encoding = ''
    bom_length = 0
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        bom_encoding = 'utf-16be'
        bom_length = 2
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        bom_encoding = 'utf-16le'
        bom_length = 2
    elif data[:3] == '\xef\xbb\xbf':
        bom_encoding = 'utf-8'
        bom_length = 3
    elif data[:4] == '\x00\x00\xfe\xff':
        bom_encoding = 'utf-32be'
        bom_length = 4
    elif data[:4] == '\xff\xfe\x00\x00':
        bom_encoding = 'utf-32le'
        bom_length = 4
    if bom_encoding:
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != bom_encoding:
                sys.stderr.write('trying %s instead\n' % bom_encoding)
        encoding = bom_encoding
        data = data[bom_length:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    # Rewrite (or insert) the XML declaration so that downstream parsers
    # see the encoding the data will actually be in after re-encoding.
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = """<?xml version='1.0' encoding='utf-8'?>"""
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode("utf-8")
2182
2183def _stripDoctype(data):
2184 """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2185
2186 rss_version may be "rss091n" or None
2187 stripped_data is the same XML document, minus the DOCTYPE
2188 """
2189 entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2190 data = entity_pattern.sub('', data)
2191 doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2192 doctype_results = doctype_pattern.findall(data)
2193 doctype = doctype_results and doctype_results[0] or ''
2194 if doctype.lower().count('netscape'):
2195 version = 'rss091n'
2196 else:
2197 version = None
2198 data = doctype_pattern.sub('', data)
2199 return version, data
2200
2201def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2202 """Parse a feed from a URL, file, stream, or string"""
2203 result = FeedParserDict()
2204 result['feed'] = FeedParserDict()
2205 result['entries'] = []
2206 if _XML_AVAILABLE:
2207 result['bozo'] = 0
2208 if type(handlers) == types.InstanceType:
2209 handlers = [handlers]
2210 try:
2211 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2212 data = f.read()
2213 except Exception, e:
2214 result['bozo'] = 1
2215 result['bozo_exception'] = e
2216 data = ''
2217 f = None
2218
2219
2220 if f and data and hasattr(f, "headers"):
2221 if gzip and f.headers.get('content-encoding', '') == 'gzip':
2222 try:
2223 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2224 except Exception, e:
2225
2226
2227
2228
2229 result['bozo'] = 1
2230 result['bozo_exception'] = e
2231 data = ''
2232 elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2233 try:
2234 data = zlib.decompress(data, -zlib.MAX_WBITS)
2235 except Exception, e:
2236 result['bozo'] = 1
2237 result['bozo_exception'] = e
2238 data = ''
2239
2240
2241 if hasattr(f, "info"):
2242 info = f.info()
2243 result["etag"] = info.getheader("ETag")
2244 last_modified = info.getheader("Last-Modified")
2245 if last_modified:
2246 result["modified"] = _parse_date(last_modified)
2247 if hasattr(f, "url"):
2248 result["url"] = f.url
2249 result["status"] = 200
2250 if hasattr(f, "status"):
2251 result["status"] = f.status
2252 if hasattr(f, "headers"):
2253 result["headers"] = f.headers.dict
2254 if hasattr(f, "close"):
2255 f.close()
2256
2257
2258
2259
2260
2261
2262 http_headers = result.get("headers", {})
2263 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = _getCharacterEncoding(http_headers, data)
2265 if http_headers and (not acceptable_content_type):
2266 if http_headers.has_key('content-type'):
2267 bozo_message = '%s is not an XML media type' % http_headers['content-type']
2268 else:
2269 bozo_message = 'no Content-type specified'
2270 result['bozo'] = 1
2271 result['bozo_exception'] = NonXMLContentType(bozo_message)
2272
2273 result['version'], data = _stripDoctype(data)
2274
2275 baseuri = http_headers.get('content-location', result.get('url'))
2276 baselang = http_headers.get('content-language', None)
2277
2278
2279 if result.get("status", 0) == 304:
2280 result['version'] = ''
2281 result['debug_message'] = "The feed has not changed since you last checked, " + "so the server sent no data. This is a feature, not a bug!"
2283 return result
2284
2285
2286 if not data:
2287 return result
2288
2289
2290 use_strict_parser = 0
2291 known_encoding = 0
2292 tried_encodings = []
2293 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding, 'utf-8', 'windows-1252'):
2294 if proposed_encoding in tried_encodings: continue
2295 if not proposed_encoding: continue
2296 try:
2297 data = _toUTF8(data, proposed_encoding)
2298 known_encoding = 1
2299 use_strict_parser = 1
2300 break
2301 except:
2302 pass
2303 tried_encodings.append(proposed_encoding)
2304 if not known_encoding:
2305 result['bozo'] = 1
2306 result['bozo_exception'] = CharacterEncodingUnknown( "document encoding unknown, I tried " + "%s, %s, utf-8, and windows-1252 but nothing worked" % (result['encoding'], xml_encoding))
2310 result['encoding'] = ''
2311 elif proposed_encoding != result['encoding']:
2312 result['bozo'] = 1
2313 result['bozo_exception'] = CharacterEncodingOverride( "documented declared as %s, but parsed as %s" % (result['encoding'], proposed_encoding))
2316 result['encoding'] = proposed_encoding
2317
2318 if not _XML_AVAILABLE:
2319 use_strict_parser = 0
2320 if use_strict_parser:
2321
2322 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2323 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2324 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2325 saxparser.setContentHandler(feedparser)
2326 saxparser.setErrorHandler(feedparser)
2327 source = xml.sax.xmlreader.InputSource()
2328 source.setByteStream(_StringIO(data))
2329 if hasattr(saxparser, '_ns_stack'):
2330
2331
2332 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2333 try:
2334 saxparser.parse(source)
2335 except Exception, e:
2336 if _debug:
2337 import traceback
2338 traceback.print_stack()
2339 traceback.print_exc()
2340 sys.stderr.write('xml parsing failed\n')
2341 result['bozo'] = 1
2342 result['bozo_exception'] = feedparser.exc or e
2343 use_strict_parser = 0
2344 if not use_strict_parser:
2345 feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2346 feedparser.feed(data)
2347 result['feed'] = feedparser.feeddata
2348 result['entries'] = feedparser.entries
2349 result['version'] = result['version'] or feedparser.version
2350 return result
2351
if __name__ == '__main__':
    # Command-line driver: with no arguments, print the module docstring
    # and exit; otherwise treat each argument as a feed URL.
    if not sys.argv[1:]:
        print __doc__
        sys.exit(0)
    else:
        urls = sys.argv[1:]
    # zopeCompatibilityHack() is defined elsewhere in this file;
    # presumably it adjusts parse() for Zope-style invocation — confirm
    # against its definition before relying on this.
    zopeCompatibilityHack()
    from pprint import pprint
    # parse each URL and pretty-print the full result dictionary
    for url in urls:
        print url
        print
        result = parse(url)
        pprint(result)
        print
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572