0001
0002"""Universal feed parser
0003
0004Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
0005
0006Visit http://feedparser.org/ for the latest version
0007Visit http://feedparser.org/docs/ for the latest documentation
0008
0009Required: Python 2.1 or later
0010Recommended: Python 2.3 or later
0011Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
0012"""
0013
0014
0015__version__ = "3.3"
0016__license__ = "Python"
0017__copyright__ = "Copyright 2002-4, Mark Pilgrim"
0018__author__ = "Mark Pilgrim <http://diveintomark.org/>"
0019__contributors__ = ["Jason Diamond <http://injektilo.org/>",
0020 "John Beimler <http://john.beimler.org/>",
0021 "Fazal Majid <http://www.majid.info/mylos/weblog/>",
0022 "Aaron Swartz <http://aaronsw.com>"]
0023_debug = 0
0024
0025
0026
0027
# User-agent string sent on all HTTP requests unless the caller overrides it.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP Accept header advertising the feed formats we understand,
# in decreasing order of preference.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# SAX parser drivers to try first, in order, before falling back to the
# platform default (see the xml.sax.make_parser call below).
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If non-zero (and mx.Tidy is installed), embedded markup may be run
# through mx.Tidy before further processing.
TIDY_MARKUP = 0
0045
0046
0047import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi
0048try:
0049 from cStringIO import StringIO as _StringIO
0050except:
0051 from StringIO import StringIO as _StringIO
0052
0053
0054
0055
0056try:
0057 import gzip
0058except:
0059 gzip = None
0060try:
0061 import zlib
0062except:
0063 zlib = None
0064
0065
0066
0067
0068
0069try:
0070 import timeoutsocket
0071 timeoutsocket.setDefaultSocketTimeout(20)
0072except ImportError:
0073 import socket
0074 if hasattr(socket, 'setdefaulttimeout'):
0075 socket.setdefaulttimeout(20)
0076import urllib, urllib2
0077
# Optional mx.Tidy support: _mxtidy stays None when TIDY_MARKUP is off
# or the mx.Tidy package is not installed.
_mxtidy = None
if TIDY_MARKUP:
    try:
        from mx.Tidy import Tidy as _mxtidy
    except:
        pass
0084
0085
0086
0087
0088
0089try:
0090 import xml.sax
0091 xml.sax.make_parser(PREFERRED_XML_PARSERS)
0092 from xml.sax.saxutils import escape as _xmlescape
0093 _XML_AVAILABLE = 1
0094except:
0095 _XML_AVAILABLE = 0
0096 def _xmlescape(data):
0097 data = data.replace("&", "&")
0098 data = data.replace(">", ">")
0099 data = data.replace("<", "<")
0100 return data
0101
0102
0103try:
0104 import base64, binascii
0105except:
0106 base64 = binascii = None
0107
0108
0109
0110try:
0111 import cjkcodecs.aliases
0112except:
0113 pass
0114try:
0115 import iconv_codec
0116except:
0117 pass
0118
0119
class CharacterEncodingOverride(Exception):
    """Declared character encoding was overridden during parsing."""

class CharacterEncodingUnknown(Exception):
    """Character encoding of the document could not be determined."""

class NonXMLContentType(Exception):
    """Server returned a Content-Type that is not an XML type."""
0123
# Adjust sgmllib's lexing patterns:
# - tagfind: allow dots, dashes, and colons (namespaced tags) in tag names
# - special: treat only '<!' as a markup-declaration opener
# - charref: recognize hexadecimal character references (&#x..;)
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
0127
# Map of internal version identifiers to human-readable format names;
# the parser's detected version is always one of these keys.
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
0145
try:
    # Python 2.2+: the builtin dict type can serve as our base class
    UserDict = dict
except NameError:
    # Python 2.1: no dict builtin; use UserDict and supply a minimal
    # dict() constructor that builds a mapping from a list of (k, v) pairs
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc
0156
class FeedParserDict(UserDict):
    """Dictionary that maps RSS terminology onto Atom-flavored keys and
    supports attribute-style access (d.title is d['title']).

    Clients may use either vocabulary -- e.g. 'channel'/'feed',
    'items'/'entries', 'guid'/'id' -- interchangeably.
    """
    def __getitem__(self, key):
        # alias map from RSS-style names to the canonical stored keys;
        # 'description' may be stored as either 'tagline' or 'summary'
        keymap = {'channel': 'feed',
                  'items': 'entries',
                  'guid': 'id',
                  'date': 'modified',
                  'date_parsed': 'modified_parsed',
                  'description': ['tagline', 'summary']}
        realkey = keymap.get(key, key)
        if type(realkey) == types.ListType:
            # several candidate keys; return the first one actually present
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def has_key(self, key):
        # true if the name resolves either as an attribute or as a key
        return hasattr(self, key) or UserDict.has_key(self, key)

    def __getattr__(self, key):
        # fall back from attribute access to item access
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __contains__(self, key):
        return self.has_key(key)
0188
def zopeCompatibilityHack():
    """Replace FeedParserDict with a factory returning plain dicts.

    NOTE(review): presumably works around Zope's restrictions on dict
    subclasses with custom attribute access -- confirm against callers.
    The plain dict loses the RSS/Atom key aliases and attribute access.
    """
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc
0197
# Lazily-built 256-byte EBCDIC-to-ASCII translation table.
_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    """Translate an EBCDIC byte string to its ASCII equivalent.

    The translation table is built on first use and cached at module
    level for subsequent calls.
    """
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        # ASCII codepoint for each EBCDIC byte value 0-255, in order
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( "".join(map(chr, range(256))), "".join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)
0224
0225class _FeedParserMixin:
    # Canonical prefix for each namespace URI we recognize.  An empty
    # string means elements from that namespace are treated as core feed
    # elements (the unprefixed URIs cover the assorted RSS/Echo/Atom
    # predecessor formats).
    namespaces = {"": "",
                  "http://backend.userland.com/rss": "",
                  "http://blogs.law.harvard.edu/tech/rss": "",
                  "http://purl.org/rss/1.0/": "",
                  "http://my.netscape.com/rdf/simple/0.9/": "",
                  "http://example.com/newformat#": "",
                  "http://example.com/necho": "",
                  "http://purl.org/echo/": "",
                  "uri/of/echo/namespace#": "",
                  "http://purl.org/pie/": "",
                  "http://purl.org/atom/ns#": "",
                  "http://purl.org/rss/1.0/modules/rss091#": "",
                  # extension modules, keyed to the prefix used for their
                  # element handler method names (_start_<prefix>_<tag>)
                  "http://webns.net/mvcb/": "admin",
                  "http://purl.org/rss/1.0/modules/aggregation/": "ag",
                  "http://purl.org/rss/1.0/modules/annotate/": "annotate",
                  "http://media.tangent.org/rss/1.0/": "audio",
                  "http://backend.userland.com/blogChannelModule": "blogChannel",
                  "http://web.resource.org/cc/": "cc",
                  "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
                  "http://purl.org/rss/1.0/modules/company": "co",
                  "http://purl.org/rss/1.0/modules/content/": "content",
                  "http://my.theinfo.org/changed/1.0/rss/": "cp",
                  "http://purl.org/dc/elements/1.1/": "dc",
                  "http://purl.org/dc/terms/": "dcterms",
                  "http://purl.org/rss/1.0/modules/email/": "email",
                  "http://purl.org/rss/1.0/modules/event/": "ev",
                  "http://postneo.com/icbm/": "icbm",
                  "http://purl.org/rss/1.0/modules/image/": "image",
                  "http://xmlns.com/foaf/0.1/": "foaf",
                  "http://freshmeat.net/rss/fm/": "fm",
                  "http://purl.org/rss/1.0/modules/link/": "l",
                  "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
                  "http://prismstandard.org/namespaces/1.2/basic/": "prism",
                  "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
                  "http://www.w3.org/2000/01/rdf-schema#": "rdfs",
                  "http://purl.org/rss/1.0/modules/reference/": "ref",
                  "http://purl.org/rss/1.0/modules/richequiv/": "reqv",
                  "http://purl.org/rss/1.0/modules/search/": "search",
                  "http://purl.org/rss/1.0/modules/slash/": "slash",
                  "http://purl.org/rss/1.0/modules/servicestatus/": "ss",
                  "http://hacks.benhammersley.com/rss/streaming/": "str",
                  "http://purl.org/rss/1.0/modules/subscription/": "sub",
                  "http://purl.org/rss/1.0/modules/syndication/": "sy",
                  "http://purl.org/rss/1.0/modules/taxonomy/": "taxo",
                  "http://purl.org/rss/1.0/modules/threading/": "thr",
                  "http://purl.org/rss/1.0/modules/textinput/": "ti",
                  "http://madskills.com/public/xml/rss/module/trackback/":"trackback",
                  "http://wellformedweb.org/CommentAPI/": "wfw",
                  "http://purl.org/rss/1.0/modules/wiki/": "wiki",
                  "http://schemas.xmlsoap.org/soap/envelope/": "soap",
                  "http://www.w3.org/1999/xhtml": "xhtml",
                  "http://www.w3.org/XML/1998/namespace": "xml"
}

    # elements whose text value may be a relative URI needing resolution
    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments', 'license']
    # elements whose embedded markup may contain relative URIs
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description']
    # elements whose embedded markup must be sanitized before output
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description']
    # content types treated as embedded HTML
    html_types = ['text/html', 'application/xhtml+xml']
0285
    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        """Initialize parser state.

        baseuri  -- base URI for resolving relative links
        baselang -- default language (e.g. from Content-Language header)
        encoding -- character encoding of the feed data
        """
        if _debug: sys.stderr.write("initializing FeedParser\n")
        self.feeddata = FeedParserDict() # feed-level metadata
        self.encoding = encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS

        # state flags: what kind of element we are currently inside
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.contentparams = FeedParserDict() # mode/type/language/base of current content
        self.namespacemap = {} # document prefix -> our canonical prefix
        self.elementstack = [] # stack of [element, expectingText, pieces]
        self.basestack = [] # stack of xml:base values in scope
        self.langstack = [] # stack of xml:lang values in scope
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang
0312
    def unknown_starttag(self, tag, attrs):
        """Dispatch an opening tag to its _start_* handler, or push a
        generic element frame when no handler is defined."""
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs: lowercase names; lowercase values of rel= and type=
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = baseuri
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang="" explicitly clears the language
            lang = None
        elif lang is None:
            # no xml:lang attribute: inherit the enclosing language
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(baseuri)
        self.langstack.append(lang)

        # track namespace declarations made on this element
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.get('mode') == 'escaped':
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['mode'] = 'xml'
        if self.incontent and self.contentparams.get('mode') == 'xml':
            # inside content being reconstructed as inline XML: re-emit the
            # start tag as literal text instead of dispatching it
            tag = tag.split(':')[-1]
            return self.handle_data("<%s%s>" % (tag, "".join([' %s="%s"' % t for t in attrs])), escape=0)

        # split the tag into (namespace prefix, local name) and map the
        # prefix onto our canonical prefix
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # leave textinput/image tracking as soon as we see an element that
        # cannot belong inside them (helps with ill-formed feeds)
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'width', 'height'):
            self.inimage = 0

        # call the specific handler if one is defined, else the default
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)
0382
    def unknown_endtag(self, tag):
        """Dispatch a closing tag to its _end_* handler, or pop the
        generic element frame when no handler is defined."""
        if _debug: sys.stderr.write('end %s\n' % tag)
        # split the tag and map the namespace prefix, as in unknown_starttag
        if tag.find(':') <> -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call the specific handler if one is defined, else the default
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.get('mode') == 'escaped':
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['mode'] = 'xml'
        if self.incontent and self.contentparams.get('mode') == 'xml':
            # re-emit the end tag as literal text inside reconstructed XML
            tag = tag.split(':')[-1]
            self.handle_data("</%s>" % tag, escape=0)

        # restore the xml:base and xml:lang that were in scope before
        # this element opened
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack:
                self.lang = self.langstack[-1]
0419
    def handle_charref(self, ref):
        """Buffer a numeric character reference (&#nnn; or &#xhh;)."""
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            # references to markup-significant characters stay encoded
            text = "&#%s;" % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        """Buffer a named entity reference (&name;)."""
        if not self.elementstack: return
        if _debug: sys.stderr.write("entering handle_entityref with %s\n" % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            # entities for markup-significant characters stay encoded
            text = '&%s;' % ref
        else:
            # resolve any other named entity through htmlentitydefs
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, "name2codepoint"): # Python 2.3+
                    return htmlentitydefs.name2codepoint[k]
                # older Pythons: decode the entity definition ourselves
                k = htmlentitydefs.entitydefs[k]
                if k.startswith("&#") and k.endswith(";"):
                    return int(k[2:-1]) # not a latin-1 character
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = "&%s;" % ref # unknown entity: pass through
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)
0454
0455 def handle_data(self, text, escape=1):
0456
0457
0458 if not self.elementstack: return
0459 if escape and self.contentparams.get('mode') == 'xml':
0460 text = _xmlescape(text)
0461 self.elementstack[-1][2].append(text)
0462
    def handle_comment(self, text):
        # comments are discarded
        pass

    def handle_pi(self, text):
        # processing instructions are discarded
        pass

    def handle_decl(self, text):
        # markup declarations are discarded
        pass

    def parse_declaration(self, i):
        """Override sgmllib's declaration handling to support CDATA
        sections; other declarations are skipped wholesale."""
        if _debug: sys.stderr.write("entering parse_declaration\n")
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata) # unterminated CDATA: take the rest
            # CDATA content is literal text; escape it before buffering
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1
0485
    def trackNamespace(self, prefix, uri):
        """Record a namespace declaration, mapping the document's prefix
        onto our canonical prefix for that namespace (see self.namespaces).
        Also uses default-namespace declarations to detect RSS 0.90/1.0."""
        if (prefix, uri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if uri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if not prefix: return
        if uri.find('backend.userland.com/rss') <> -1:
            # normalize all Userland RSS namespace variants to one URI
            uri = 'http://backend.userland.com/rss'
        if self.namespaces.has_key(uri):
            self.namespacemap[prefix] = self.namespaces[uri]

    def resolveURI(self, uri):
        # resolve a possibly-relative URI against the current xml:base
        return urlparse.urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        # hook for subclasses; the strict parser already receives decoded
        # entities from the SAX layer, so the default is a no-op
        return data
0503
    def push(self, element, expectingText):
        # open an element frame: [name, whether text is expected, text pieces]
        self.elementstack.append([element, expectingText, []])

    def pop(self, element):
        """Close the current element frame, post-process its accumulated
        text (base64, relative URIs, sanitizing, unicode), and store it
        on the current entry or the feed.  Returns the processed text."""
        if not self.elementstack: return
        # ignore a close that doesn't match the innermost open element
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = "".join(pieces)
        output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if self.contentparams.get('mode') == 'base64' and base64:
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass # leave undecodable content as-is
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        output = self.decodeEntities(element, output)

        # resolve relative URIs within embedded markup
        if self.contentparams.get('type', 'text/html') in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.contentparams.get('type', 'text/html') in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        # promote byte strings to unicode using the detected encoding
        if self.encoding and (type(output) == types.StringType):
            try:
                output = unicode(output, self.encoding)
            except:
                pass # keep the byte string if it won't decode

        # store output in the appropriate place(s)
        if self.inentry:
            if element == 'content':
                # an entry may have multiple content elements
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'category':
                self.entries[-1][element] = output
                # fill in the value of the (domain, value) pair created
                # by _start_category
                domain = self.entries[-1]['categories'][-1][0]
                self.entries[-1]['categories'][-1] = (domain, output)
            elif element == 'source':
                self.entries[-1]['source']['value'] = output
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif self.infeed and (not self.intextinput) and (not self.inimage):
            if element == 'description':
                element = 'tagline'
            self.feeddata[element] = output
            if element == 'category':
                domain = self.feeddata['categories'][-1][0]
                self.feeddata['categories'][-1] = (domain, output)
            elif element == 'link':
                self.feeddata['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.feeddata[element + '_detail'] = contentparams
        return output
0587
    def _mapToStandardPrefix(self, name):
        # rewrite 'prefix:suffix' using our canonical namespace prefix
        colonpos = name.find(':')
        if colonpos <> -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        # attribute lookup that honors namespace prefix remapping
        return attrsD.get(self._mapToStandardPrefix(name))

    def _save(self, key, value):
        # store a value on the current entry (or, failing that, the feed)
        # without overwriting an existing value
        if self.inentry:
            self.entries[-1].setdefault(key, value)
        elif self.feeddata:
            self.feeddata.setdefault(key, value)
0605
0606 def _start_rss(self, attrsD):
0607 versionmap = {'0.91': 'rss091u',
0608 '0.92': 'rss092',
0609 '0.93': 'rss093',
0610 '0.94': 'rss094'}
0611 if not self.version:
0612 attr_version = attrsD.get('version', '')
0613 version = versionmap.get(attr_version)
0614 if version:
0615 self.version = version
0616 elif attr_version.startswith('2.'):
0617 self.version = 'rss20'
0618 else:
0619 self.version = 'rss'
0620
    def _start_dlhottitles(self, attrsD):
        # <dlhottitles> marks a Hot RSS feed
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        """CDF carries metadata in attributes; simulate start/data/end
        events for the ones we care about (lastmod, href)."""
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        # <feed> is the Atom root element; its version attribute tells us
        # which Atom draft we are parsing
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel
0655
    # --- image / textinput / author / contributor containers ---

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        # reconcile the plain author string with author_detail
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0
0703
    def _start_name(self, attrsD):
        self.push('name', 0)

    def _end_name(self):
        # <name> means different things depending on the enclosing element
        value = self.pop('name')
        if self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0 # non-numeric width
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0 # non-numeric height
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('url', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        # <url> likewise depends on its enclosing element
        value = self.pop('url')
        if self.inauthor:
            self._save_author('url', value)
        elif self.incontributor:
            self._save_contributor('url', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['url'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url
0762
    def _start_email(self, attrsD):
        self.push('email', 0)

    def _end_email(self):
        value = self.pop('email')
        if self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
        pass

    def _getContext(self):
        # the current target for metadata: the open entry if there is
        # one, otherwise the feed itself
        if self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context
0780
    def _save_author(self, key, value):
        # store one field of the structured author information, then
        # rebuild the plain author string from it
        context = self._getContext()
        context.setdefault('author_detail', FeedParserDict())
        context['author_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        # store one field on the most recently opened contributor
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        """Keep the plain '<key>' string and the structured '<key>_detail'
        dict consistent, deriving whichever one is missing."""
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            # build "Name (email)" from the structured detail
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = "%s (%s)" % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            # try to split an email address out of the plain author string
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # remove the email and any leftover punctuation from the name
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email
0822
    def _start_tagline(self, attrsD):
        # tagline is a content construct: record its mode/type/language/base
        self.incontent += 1
        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                             'type': attrsD.get('type', 'text/plain'),
                             'language': self.lang,
                             'base': self.baseuri})
        self.push('tagline', 1)
    _start_subtitle = _start_tagline

    def _end_tagline(self):
        value = self.pop('tagline')
        self.incontent -= 1
        self.contentparams.clear()
        if self.infeed:
            # mirror the tagline as an RSS-style description
            self.feeddata['description'] = value
    _end_subtitle = _end_tagline

    def _start_copyright(self, attrsD):
        # copyright is a content construct as well
        self.incontent += 1
        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                             'type': attrsD.get('type', 'text/plain'),
                             'language': self.lang,
                             'base': self.baseuri})
        self.push('copyright', 1)
    _start_dc_rights = _start_copyright

    def _end_copyright(self):
        self.pop('copyright')
        self.incontent -= 1
        self.contentparams.clear()
    _end_dc_rights = _end_copyright
0854
    def _start_item(self, attrsD):
        # open a new entry
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        # RSS 1.0 items carry their identifier in rdf:about
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        # an explicit language element also becomes the current language
        self.lang = self.pop('language')
    _end_language = _end_dc_language
0880
    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        # derive publisher_detail from the plain publisher string
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_dcterms_issued(self, attrsD):
        self.push('issued', 1)
    _start_issued = _start_dcterms_issued

    def _end_dcterms_issued(self):
        # store both the raw date string and its parsed form
        value = self.pop('issued')
        self._save('issued_parsed', _parse_date(value))
    _end_issued = _end_dcterms_issued

    def _start_dcterms_created(self, attrsD):
        self.push('created', 1)
    _start_created = _start_dcterms_created

    def _end_dcterms_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_created = _end_dcterms_created
0907
    def _start_dcterms_modified(self, attrsD):
        self.push('modified', 1)
    _start_modified = _start_dcterms_modified
    _start_dc_date = _start_dcterms_modified
    _start_pubdate = _start_dcterms_modified

    def _end_dcterms_modified(self):
        value = self.pop('modified')
        parsed_value = _parse_date(value)
        self._save('modified_parsed', parsed_value)
    _end_modified = _end_dcterms_modified
    _end_dc_date = _end_dcterms_modified
    _end_pubdate = _end_dcterms_modified

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')))

    def _start_cc_license(self, attrsD):
        # the license URI lives in rdf:resource; the element has no
        # character data, so push and pop immediately
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('license')
0934
    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)

    def _end_creativecommons_license(self):
        self.pop('license')

    def _start_category(self, attrsD):
        self.push('category', 1)
        domain = self._getAttribute(attrsD, 'domain')
        cats = []
        if self.inentry:
            cats = self.entries[-1].setdefault('categories', [])
        elif self.infeed:
            cats = self.feeddata.setdefault('categories', [])
        # record (domain, None); pop() fills in the value later
        cats.append((domain, None))
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_category(self):
        self.pop('category')
    _end_dc_subject = _end_category
    _end_keywords = _end_category

    def _start_cloud(self, attrsD):
        # <cloud> carries all its data in attributes
        self.feeddata['cloud'] = FeedParserDict(attrsD)
0960
    def _start_link(self, attrsD):
        """Handle RSS <link> and Atom <link rel= type= href=> elements."""
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry
        # every link is appended to the 'links' list of the entry or feed
        if self.inentry:
            self.entries[-1].setdefault('links', [])
            self.entries[-1]['links'].append(FeedParserDict(attrsD))
        elif self.infeed:
            self.feeddata.setdefault('links', [])
            self.feeddata['links'].append(FeedParserDict(attrsD))
        if attrsD.has_key('href'):
            # Atom-style link: the URI is in the href attribute, so the
            # element carries no text; an HTML alternate also becomes the
            # primary 'link'
            expectingText = 0
            if attrsD.get('type', '') in self.html_types:
                if self.inentry:
                    self.entries[-1]['link'] = attrsD['href']
                elif self.infeed:
                    self.feeddata['link'] = attrsD['href']
        else:
            # RSS-style link: the URI is the element's text content
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        if self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
        if self.inimage:
            context = self._getContext()
            context['image']['link'] = value
    _end_producturl = _end_link
0993
    def _start_guid(self, attrsD):
        # RSS <guid isPermaLink="true"> (the default) doubles as the link
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        if self.guidislink:
            # guid acts as the link, but only if the item doesn't already
            # have a link element (_save won't overwrite an existing one)
            self._save('link', value)

    def _start_id(self, attrsD):
        self.push('id', 1)

    def _end_id(self):
        value = self.pop('id')
1011
    def _start_title(self, attrsD):
        # title is a content construct: record its mode/type/language/base
        self.incontent += 1
        if _debug: sys.stderr.write('attrsD.xml:lang = %s\n' % attrsD.get('xml:lang'))
        if _debug: sys.stderr.write('self.lang = %s\n' % self.lang)
        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                             'type': attrsD.get('type', 'text/plain'),
                             'language': self.lang,
                             'base': self.baseuri})
        # only expect text at feed/entry level, not inside image/textinput
        self.push('title', self.infeed or self.inentry)
    _start_dc_title = _start_title

    def _end_title(self):
        value = self.pop('title')
        self.incontent -= 1
        self.contentparams.clear()
        if self.intextinput:
            context = self._getContext()
            context['textinput']['title'] = value
        elif self.inimage:
            context = self._getContext()
            context['image']['title'] = value
    _end_dc_title = _end_title
1034
    def _start_description(self, attrsD, default_content_type='text/html'):
        # description defaults to HTML content (unlike most constructs)
        self.incontent += 1
        self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
                             'type': attrsD.get('type', default_content_type),
                             'language': self.lang,
                             'base': self.baseuri})
        self.push('description', self.infeed or self.inentry)

    def _start_abstract(self, attrsD):
        # <abstract> is treated as a plain-text description
        return self._start_description(attrsD, 'text/plain')

    def _end_description(self):
        value = self.pop('description')
        self.incontent -= 1
        self.contentparams.clear()
        context = self._getContext()
        if self.intextinput:
            context['textinput']['description'] = value
        elif self.inimage:
            context['image']['description'] = value
    _end_abstract = _end_description
1060
1061 def _start_info(self, attrsD):
1062 self.incontent += 1
1063 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1064 'type': attrsD.get('type', 'text/plain'),
1065 'language': self.lang,
1066 'base': self.baseuri})
1067 self.push('info', 1)
1068
    def _end_info(self):
        # Close <info> and reset content-tracking state set by _start_info.
        self.pop('info')
        self.incontent -= 1
        self.contentparams.clear()
1073
    def _start_generator(self, attrsD):
        if attrsD:
            # Resolve a relative generator URL against the document base
            # before storing the attribute dict as generator_detail.
            if attrsD.has_key('url'):
                attrsD['url'] = self.resolveURI(attrsD['url'])
            self.feeddata['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)
1080
    def _end_generator(self):
        value = self.pop('generator')
        # Merge the element text into the detail dict from _start_generator.
        if self.feeddata.has_key('generator_detail'):
            self.feeddata['generator_detail']['name'] = value
1085
    def _start_admin_generatoragent(self, attrsD):
        # admin:generatorAgent carries its value in rdf:resource rather than
        # in element text, so push, inject the value, and pop immediately.
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self.feeddata['generator_detail'] = FeedParserDict({"url": value})
1093
    def _start_admin_errorreportsto(self, attrsD):
        # Like admin:generatorAgent, the value lives in rdf:resource.
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')
1100
1101 def _start_summary(self, attrsD):
1102 self.incontent += 1
1103 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'escaped'),
1104 'type': attrsD.get('type', 'text/plain'),
1105 'language': self.lang,
1106 'base': self.baseuri})
1107 self.push('summary', 1)
1108
1109 def _end_summary(self):
1110 value = self.pop('summary')
1111 if self.entries:
1112 self.entries[-1]['description'] = value
1113 self.incontent -= 1
1114 self.contentparams.clear()
1115
1116 def _start_enclosure(self, attrsD):
1117 if self.inentry:
1118 self.entries[-1].setdefault('enclosures', [])
1119 self.entries[-1]['enclosures'].append(FeedParserDict(attrsD))
1120
1121 def _start_source(self, attrsD):
1122 if self.inentry:
1123 self.entries[-1]['source'] = FeedParserDict(attrsD)
1124 self.push('source', 1)
1125
    def _end_source(self):
        # Discard collected text; _start_source already stored the attributes.
        self.pop('source')
1128
1129 def _start_content(self, attrsD):
1130 self.incontent += 1
1131 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
1132 'type': attrsD.get('type', 'text/plain'),
1133 'language': self.lang,
1134 'base': self.baseuri})
1135 self.push('content', 1)
1136
1137 def _start_prodlink(self, attrsD):
1138 self.incontent += 1
1139 self.contentparams = FeedParserDict({'mode': attrsD.get('mode', 'xml'),
1140 'type': attrsD.get('type', 'text/html'),
1141 'language': self.lang,
1142 'base': self.baseuri})
1143 self.push('content', 1)
1144
1145 def _start_body(self, attrsD):
1146 self.incontent += 1
1147 self.contentparams = FeedParserDict({'mode': 'xml',
1148 'type': 'application/xhtml+xml',
1149 'language': self.lang,
1150 'base': self.baseuri})
1151 self.push('content', 1)
1152 _start_xhtml_body = _start_body
1153
1154 def _start_content_encoded(self, attrsD):
1155 self.incontent += 1
1156 self.contentparams = FeedParserDict({'mode': 'escaped',
1157 'type': 'text/html',
1158 'language': self.lang,
1159 'base': self.baseuri})
1160 self.push('content', 1)
1161 _start_fullitem = _start_content_encoded
1162
1163 def _end_content(self):
1164 value = self.pop('content')
1165 if self.contentparams.get('type') in (['text/plain'] + self.html_types):
1166 self._save('description', value)
1167 self.incontent -= 1
1168 self.contentparams.clear()
1169 _end_body = _end_content
1170 _end_xhtml_body = _end_content
1171 _end_content_encoded = _end_content
1172 _end_fullitem = _end_content
1173 _end_prodlink = _end_content
1174
# The strict parser is only defined when Python's SAX machinery (and a
# usable driver) imported successfully at module load time.
if _XML_AVAILABLE:
    class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
        """SAX-based feed parser, tried first before the loose SGML fallback."""

        def __init__(self, baseuri, baselang, encoding):
            if _debug: sys.stderr.write('trying StrictFeedParser\n')
            xml.sax.handler.ContentHandler.__init__(self)
            _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
            # bozo/exc record whether the document was well-formed, and why not.
            self.bozo = 0
            self.exc = None

        def startPrefixMapping(self, prefix, uri):
            self.trackNamespace(prefix, uri)

        def startElementNS(self, name, qname, attrs):
            namespace, localname = name
            namespace = str(namespace or '')
            # Normalize any variant of the Userland RSS namespace to one URI.
            if namespace.find('backend.userland.com/rss') <> -1:
                namespace = 'http://backend.userland.com/rss'
            prefix = self.namespaces.get(namespace, 'unknown')
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()

            # Match attribute namespaces against the known-prefix table first,
            # then overlay whatever qnames the SAX driver reports (some drivers
            # report none at all).
            # NOTE(review): relies on the non-public attrs._attrs mapping --
            # confirm this against the SAX driver actually in use.
            attrsD = {}
            for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
                prefix = self.namespaces.get(namespace, '')
                if prefix:
                    attrlocalname = prefix + ":" + attrlocalname
                attrsD[str(attrlocalname).lower()] = attrvalue
            for qname in attrs.getQNames():
                attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
            self.unknown_starttag(localname, attrsD.items())

        def characters(self, text):
            self.handle_data(text)

        def endElementNS(self, name, qname):
            namespace, localname = name
            namespace = str(namespace)
            prefix = self.namespaces.get(namespace, '')
            if prefix:
                localname = prefix + ':' + localname
            localname = str(localname).lower()
            self.unknown_endtag(localname)

        def error(self, exc):
            # Non-fatal parse error: flag the feed as bozo but keep going.
            self.bozo = 1
            self.exc = exc

        def fatalError(self, exc):
            self.error(exc)
            raise exc
1237
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    """SGML-based pass-through HTML processor.

    Re-emits everything it parses into self.pieces; subclasses override the
    handle_*/unknown_* methods to rewrite or filter markup on the way through.
    """

    # HTML elements with no end tag; re-emitted in XHTML style ("<br />").
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        # Accumulated output fragments; joined together by output().
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def feed(self, data):
        # Neutralize bogus SGML declarations (anything other than DOCTYPE,
        # comments, or marked sections) so sgmllib doesn't choke on them.
        # BUGFIX: the replacement must be the *escaped* '&lt;!' -- the entity
        # had been decoded to a literal '<!', turning this into a no-op.
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        # Expand XML-style empty tags ("<br/>") into start/end pairs.
        data = re.sub(r'<(\S+)/>', r'<\1></\1>', data)
        # Normalize numeric quote references to literal quote characters.
        # BUGFIX: restore the '&#39;'/'&#34;' source strings, which had been
        # decoded to the quote characters themselves (no-op replaces).
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and (type(data) == types.UnicodeType):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # Lowercase attribute names; lowercase the values of rel/type too.
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # attrs is a list of (attr, value) tuples; reconstruct the original
        # start tag, using XHTML style for empty elements.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
        if tag in self.elements_no_end_tag:
            self.pieces.append("<%(tag)s%(strattrs)s />" % locals())
        else:
            self.pieces.append("<%(tag)s%(strattrs)s>" % locals())

    def unknown_endtag(self, tag):
        # Reconstruct the end tag, except for empty elements.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # Re-emit a numeric character reference unchanged, e.g. "&#160;".
        self.pieces.append("&#%(ref)s;" % locals())

    def handle_entityref(self, ref):
        # Re-emit a named entity reference unchanged, e.g. "&copy;".
        self.pieces.append("&%(ref)s;" % locals())

    def handle_data(self, text):
        # Pass text content through verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # Re-emit an HTML comment unchanged.
        self.pieces.append("<!--%(text)s-->" % locals())

    def handle_pi(self, text):
        # Re-emit a processing instruction unchanged.
        self.pieces.append("<?%(text)s>" % locals())

    def handle_decl(self, text):
        # Re-emit a DOCTYPE or other declaration unchanged.
        self.pieces.append("<!%(text)s>" % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        # More permissive replacement for sgmllib's declaration-name scanner;
        # also accepts dots and colons in declaration names.
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # name runs to end of buffer; need more data
            return name.lower(), m.end()
        else:
            # Unparseable declaration: emit the raw data and give up on it.
            self.handle_data(rawdata)
            return None, -1

    def output(self):
        """Return processed HTML as a single string"""
        return "".join([str(p) for p in self.pieces])
1341
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    """SGML-based fallback parser used when strict XML parsing fails."""

    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        """Normalize entity references in element text.

        Numeric references for the five XML special characters are folded to
        their named forms; in 'escaped' content mode the named forms are then
        decoded to literal characters.

        BUGFIX: the entity source/target strings below had been decoded to
        literal characters during an earlier transformation, turning every
        replace into a no-op; restored to the intended entity strings.
        """
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.get('mode') == 'escaped':
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
1365
class _RelativeURIResolver(_BaseHTMLProcessor):
    """Rewrites relative URI attribute values in HTML to absolute URIs."""

    # (tag, attribute) pairs whose values are URIs that need resolving.
    relative_uris = [
        ('a', 'href'), ('applet', 'codebase'), ('area', 'href'),
        ('blockquote', 'cite'), ('body', 'background'), ('del', 'cite'),
        ('form', 'action'), ('frame', 'longdesc'), ('frame', 'src'),
        ('iframe', 'longdesc'), ('iframe', 'src'), ('head', 'profile'),
        ('img', 'longdesc'), ('img', 'src'), ('img', 'usemap'),
        ('input', 'src'), ('input', 'usemap'), ('ins', 'cite'),
        ('link', 'href'), ('object', 'classid'), ('object', 'codebase'),
        ('object', 'data'), ('object', 'usemap'), ('q', 'cite'),
        ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        # Resolution is always relative to the document base URI.
        return urlparse.urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        resolved = []
        for key, value in attrs:
            if (tag, key) in self.relative_uris:
                value = self.resolveURI(value)
            resolved.append((key, value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, resolved)
1404
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    """Run htmlSource through a _RelativeURIResolver and return the result."""
    if _debug: sys.stderr.write("entering _resolveRelativeURIs\n")
    resolver = _RelativeURIResolver(baseURI, encoding)
    resolver.feed(htmlSource)
    return resolver.output()
1410
class _HTMLSanitizer(_BaseHTMLProcessor):
    """Strip potentially dangerous markup, keeping only whitelisted elements
    and attributes; content inside blocked script/applet elements is dropped
    entirely."""

    # Whitelist of elements allowed through the sanitizer.
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    # Whitelist of attributes allowed on the acceptable elements.
    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    # Elements whose entire content must be suppressed, not just their tags.
    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        # Nesting depth of currently-open unacceptable elements.
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        # Drop non-whitelisted tags; for script/applet, also start
        # suppressing their text content.
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # Processing instructions are dropped outright.
        pass

    def handle_decl(self, text):
        # Declarations are dropped outright.
        pass

    def handle_data(self, text):
        # Text is only emitted when not inside a blocked element.
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
1463
def _sanitizeHTML(htmlSource, encoding):
    """Strip unacceptable tags and attributes from HTML; optionally run the
    result through mx.Tidy when TIDY_MARKUP is enabled."""
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if _mxtidy and TIDY_MARKUP:
        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, output_xhtml=1, numeric_entities=1, wrap=0)
        # mx.Tidy returns a complete XHTML document; keep only the contents
        # between <body ...> and </body>.
        if data.count('<body'):
            data = data.split('<body', 1)[1]
            if data.count('>'):
                data = data.split('>', 1)[1]
        if data.count('</body'):
            data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
1478
class _FeedURLHandler(urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    """urllib2 handler that records the HTTP status code on the returned
    file-like object and follows 3xx redirects (except 304 Not Modified)."""

    def http_error_default(self, req, fp, code, msg, headers):
        # Treat any 3xx other than 304 as a redirect; for everything else,
        # return the response with its status attached instead of raising.
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        # Only follow the redirect when a Location header is present.
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        # Same as 302, but urllib2 caches 301 redirects internally.
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    # All other redirect flavors are handled like 302.
    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302
1508
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, "read"):
        # Already a file-like object; hand it back untouched.
        return url_file_stream_or_string

    if url_file_stream_or_string == "-":
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # try to open with urllib2 (to use optional headers)
        auth = None
        if base64:
            # Strip an embedded user:password from the URL and convert it
            # into an HTTP Basic auth header instead.
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = "%s://%s%s" % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header("User-Agent", agent)
        if etag:
            request.add_header("If-None-Match", etag)
        if modified:
            # Format an RFC 1123-compliant timestamp by hand; time.strftime()
            # can't be used because %a/%b are locale-dependent, and RFC 2616
            # requires English day/month names.
            short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
            months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
            request.add_header("If-Modified-Since", "%s, %02d %s %04d %02d:%02d:%02d GMT" % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header("Referer", referrer)
        # Advertise whichever decompression schemes are actually available.
        if gzip and zlib:
            request.add_header("Accept-encoding", "gzip, deflate")
        elif gzip:
            request.add_header("Accept-encoding", "gzip")
        elif zlib:
            request.add_header("Accept-encoding", "deflate")
        else:
            request.add_header("Accept-encoding", "")
        if auth:
            request.add_header("Authorization", "Basic %s" % auth)
        if ACCEPT_HEADER:
            request.add_header("Accept", ACCEPT_HEADER)
        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
        opener.addheaders = []  # cleared so only our custom headers are sent
        try:
            return opener.open(request)
        finally:
            opener.close()

    # try to open with native open function (if the argument is a filename)
    try:
        return open(url_file_stream_or_string)
    except:
        pass

    # treat url_file_stream_or_string as the feed data itself
    return _StringIO(str(url_file_stream_or_string))
1597
# Registered date-parsing handlers, most-recently-registered first.
_date_handlers = []
def registerDateHandler(func):
    """Register a date handler function (takes string, returns 9-tuple date in GMT)"""
    # Newly registered handlers take priority over the built-in ones.
    _date_handlers.insert(0, func)
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
# ISO 8601 date templates, most specific first.  YYYY/YY = year digits,
# MM = month, DD = day, OOO = ordinal day-of-year, CC = century; '-?'
# marks an optional separator.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
# Expand each template into a regex with named groups, followed by an
# optional time-of-day and timezone designator.
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
del tmpl  # Python 2 list comprehensions leak their loop variable
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
del regex
def _parse_date_iso8601(dateString):
    """Parse a variety of ISO-8601-compatible formats like 20040105"""
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get("ordinal", 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get("year", "--")
    if not year or year == "--":
        year = time.gmtime()[0]
    elif len(year) == 2:
        # two-digit year: assume it belongs to the current century
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get("month", "-")
    if not month or month == "-":
        # ordinal (day-of-year) dates imply January here; the ordinal is
        # normalized into a real month/day by mktime() below
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get("day", 0)
    if not day:
        # missing day: use the ordinal if given, the 1st when any other
        # date component was given, otherwise today
        if ordinal:
            day = ordinal
        elif params.get("century", 0) or params.get("year", 0) or params.get("month", 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # a century like "19" means the first year of that century (1801 etc.)
    if "century" in params.keys():
        year = (int(params["century"]) - 1) * 100 + 1
    # normalize any missing time fields to zero
    for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get("hour", 0))
    minute = int(params.get("minute", 0))
    second = int(params.get("second", 0))
    # weekday is unknown here; mktime() will normalize it
    weekday = 0
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # shift hours/minutes by the timezone offset to convert toward GMT;
    # mktime() normalizes any resulting out-of-range values
    tz = params.get("tz")
    if tz and tz != "Z":
        if tz[0] == "-":
            tm[3] += int(params.get("tzhour", 0))
            tm[4] += int(params.get("tzmin", 0))
        elif tz[0] == "+":
            tm[3] -= int(params.get("tzhour", 0))
            tm[4] -= int(params.get("tzmin", 0))
        else:
            return None
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
1712
1713
# Hangul tokens used by Korean blogging services in their date strings.
_korean_year = u'\ub144'   # "year"
_korean_month = u'\uc6d4'  # "month"
_korean_day = u'\uc77c'    # "day"
_korean_am = u'\uc624\uc804'  # "AM"
_korean_pm = u'\uc624\ud6c4'  # "PM"

# OnBlog: "YYYY[year] MM[month] DD[day] HH:MM:SS"
_korean_onblog_date_re = re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % (_korean_year, _korean_month, _korean_day))
# Nate: "YYYY-MM-DD [AM|PM] H:M:S" (time fields may be 1 or 2 digits)
_korean_nate_date_re = re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    """Parse a string according to the OnBlog 8-bit date format"""
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    # Rebuild as W3DTF with the KST (UTC+9) offset and delegate.
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % {'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6), 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("OnBlog date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
1737
def _parse_date_nate(dateString):
    """Parse a string according to the Nate 8-bit date format"""
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    # Convert the 12-hour AM/PM clock to 24-hour, zero-padded.
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    # Rebuild as W3DTF with the KST (UTC+9) offset and delegate.
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % {'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': hour, 'minute': m.group(6), 'second': m.group(7), 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("Nate date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
1756
# MS SQL style: "YYYY-MM-DD HH:MM:SS.fraction" (fraction is discarded).
_mssql_date_re = re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})\.\d+')
def _parse_date_mssql(dateString):
    """Parse a string according to the MS SQL date format"""
    m = _mssql_date_re.match(dateString)
    if not m: return
    # NOTE(review): assumes a fixed +09:00 offset, same as the Korean
    # handlers -- confirm this is intentional for MS SQL dates.
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s" % {'year': m.group(1), 'month': m.group(2), 'day': m.group(3), 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6), 'zonediff': '+09:00'}
    if _debug: sys.stderr.write("MS SQL date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
1770
1771
# Greek month-name abbreviations (with accent variants) mapped to the
# English abbreviations understood by the RFC 822 parser.
_greek_months = { u'\u0399\u03b1\u03bd': u'Jan',
  u'\u03a6\u03b5\u03b2': u'Feb',
  u'\u039c\u03ac\u03ce': u'Mar',
  u'\u039c\u03b1\u03ce': u'Mar',
  u'\u0391\u03c0\u03c1': u'Apr',
  u'\u039c\u03ac\u03b9': u'May',
  u'\u039c\u03b1\u03ca': u'May',
  u'\u039c\u03b1\u03b9': u'May',
  u'\u0399\u03bf\u03cd\u03bd': u'Jun',
  u'\u0399\u03bf\u03bd': u'Jun',
  u'\u0399\u03bf\u03cd\u03bb': u'Jul',
  u'\u0399\u03bf\u03bb': u'Jul',
  u'\u0391\u03cd\u03b3': u'Aug',
  u'\u0391\u03c5\u03b3': u'Aug',
  u'\u03a3\u03b5\u03c0': u'Sep',
  u'\u039f\u03ba\u03c4': u'Oct',
  u'\u039d\u03bf\u03ad': u'Nov',
  u'\u039d\u03bf\u03b5': u'Nov',
  u'\u0394\u03b5\u03ba': u'Dec',
  }
1794
# Greek weekday abbreviations mapped to English ones for the RFC 822 parser.
_greek_wdays = { u'\u039a\u03c5\u03c1': u'Sun',
  u'\u0394\u03b5\u03c5': u'Mon',
  u'\u03a4\u03c1\u03b9': u'Tue',
  u'\u03a4\u03b5\u03c4': u'Wed',
  u'\u03a0\u03b5\u03bc': u'Thu',
  u'\u03a0\u03b1\u03c1': u'Fri',
  u'\u03a3\u03b1\u03b2': u'Sat',
  }
1805
# "Weekday, DD Month YYYY HH:MM:SS zone", with Greek weekday/month names.
_greek_date_format_re = re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    """Parse a string according to a Greek 8-bit date format."""
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        # Translate Greek weekday/month names into their English equivalents.
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    # Rebuild as an RFC 822 date and delegate to that handler.
    rfc822date = "%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s" % {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4), 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7), 'zonediff': m.group(8)}
    if _debug: sys.stderr.write("Greek date parsed as: %s\n" % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
1825
1826
# Hungarian month names mapped to two-digit month numbers.
# NOTE(review): u'febru\u00e1ri' and u'm\u00e1ujus' look misspelled
# (expected u'febru\u00e1r' / u'm\u00e1jus') -- confirm against real-world
# feed data before changing, since keys must match the source text exactly.
_hungarian_months = { u'janu\u00e1r': u'01',
  u'febru\u00e1ri': u'02',
  u'm\u00e1rcius': u'03',
  u'\u00e1prilis': u'04',
  u'm\u00e1ujus': u'05',
  u'j\u00fanius': u'06',
  u'j\u00falius': u'07',
  u'augusztus': u'08',
  u'szeptember': u'09',
  u'okt\u00f3ber': u'10',
  u'november': u'11',
  u'december': u'12',
  }
1842
# "YYYY-monthname-D(T)H:MM+ZZ:ZZ", with a Hungarian month name.
_hungarian_date_format_re = re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    """Parse a string according to a Hungarian 8-bit date format."""
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        # Translate the month name and zero-pad one-digit day/hour fields.
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    # Rebuild as W3DTF (no seconds in this format) and delegate.
    w3dtfdate = "%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s" % {'year': m.group(1), 'month': month, 'day': day, 'hour': hour, 'minute': m.group(5), 'zonediff': m.group(6)}
    if _debug: sys.stderr.write("Hungarian date parsed as: %s\n" % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
1867
1868
1869
1870
1871
def _parse_date_w3dtf(dateString):
    """Parse a W3C Date and Time Format (profile of ISO 8601) string into a
    9-tuple in GMT, or None if it doesn't match."""

    def __extract_date(m):
        # Returns (year, month, day); (0, 0, 0) signals an unusable date.
        year = int(m.group("year"))
        if year < 100:
            # two-digit year: assume the current century
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group("julian")
        if julian:
            # Day-of-year form: start from a rough month/day estimate, then
            # iterate with mktime/gmtime until the day-of-year matches.
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group("month")
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group("day")
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        # Returns (hours, minutes, seconds), defaulting to midnight.
        if not m:
            return 0, 0, 0
        hours = m.group("hours")
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group("minutes"))
        seconds = m.group("seconds")
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        """Return the Time Zone Designator as an offset in seconds from UTC."""
        if not m:
            return 0
        tzd = m.group("tzd")
        if not tzd:
            return 0
        if tzd == "Z":
            return 0
        hours = int(m.group("tzdhours"))
        minutes = m.group("tzdminutes")
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        # Offset is returned with the sign needed to convert local to UTC.
        if tzd[0] == "+":
            return -offset
        return offset

    # date: YYYY[-]DDD (day-of-year) or YYYY[-]MM[[-]DD]
    __date_re = ("(?P<year>\d\d\d\d)"
                 "(?:(?P<dsep>-|)"
                 "(?:(?P<julian>\d\d\d)"
                 "|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?")
    __tzd_re = "(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)"
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ("(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)"
                 "(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?"
                 + __tzd_re)
    __datetime_re = "%s(?:T%s)?" % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    # Require a full match of the entire input string.
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
1966
def _parse_date_rfc822(dateString):
    """Parse an RFC822, RFC1123, RFC2822, or asctime-style date"""
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))

# rfc822.py's built-in timezone table lacks some common North American
# abbreviations; teach it these before registering the handler.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
1977
def _parse_date(dateString):
    """Parses a variety of date formats into a 9-tuple in GMT"""
    # Try each registered handler in order; the first one that returns a
    # valid 9-tuple of integers wins.
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write("date handler function must return 9-tuple\n")
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            # A failing handler just means "not my format"; try the next one.
            if _debug: sys.stderr.write("%s raised %s\n" % (handler.__name__, repr(e)))
            pass
    return None
1993
def _getCharacterEncoding(http_headers, xml_data):
    """Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ("XML Media Types"), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to "utf-8" if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to "us-ascii" if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    "iso-8859-1" as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    """

    def _parseHTTPContentType(content_type):
        """takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        """
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", "")

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type"))
    # Sniff non-ASCII-compatible encodings (per section F of the XML spec)
    # and transcode to UTF-8 so that the regex below can read the XML
    # declaration.  Any decoding error just means "no declaration found".
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE, no BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM (the byte 2-3 check excludes a UTF-32 BOM)
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE, no BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM (the byte 2-3 check excludes a UTF-32LE BOM)
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE, no BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE, no BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        # If the declared encoding is merely a byte-order-agnostic alias
        # of what we sniffed (e.g. "utf-16" when we detected a specific
        # byte order), trust the more specific sniffed encoding.
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        # application/xml family: HTTP charset wins, then XML declaration
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        # text/xml family: the XML declaration is ignored per RFC 3023
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        # any other text/* type: us-ascii default, and NOT acceptable
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        # HTTP response with no Content-Type at all
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        # no HTTP headers (local file) or unrecognized Content-Type
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
2129
def _toUTF8(data, encoding):
    """Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    """
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # Detect a Unicode byte order mark.  A BOM both overrides the
    # caller-supplied encoding and is stripped before decoding.  The two
    # UTF-16 checks require bytes 2-3 to be non-NUL so that a 4-byte
    # UTF-32 BOM is not mistaken for a 2-byte UTF-16 one.
    bom_encoding = ''
    bom_length = 0
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        bom_encoding = 'utf-16be'
        bom_length = 2
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        bom_encoding = 'utf-16le'
        bom_length = 2
    elif data[:3] == '\xef\xbb\xbf':
        bom_encoding = 'utf-8'
        bom_length = 3
    elif data[:4] == '\x00\x00\xfe\xff':
        bom_encoding = 'utf-32be'
        bom_length = 4
    elif data[:4] == '\xff\xfe\x00\x00':
        bom_encoding = 'utf-32le'
        bom_length = 4
    if bom_encoding:
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != bom_encoding:
                sys.stderr.write('trying %s instead\n' % bom_encoding)
        encoding = bom_encoding
        data = data[bom_length:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    # Rewrite (or insert) the XML declaration so that downstream parsers
    # see the encoding the data will actually be in after re-encoding.
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = """<?xml version='1.0' encoding='utf-8'?>"""
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode("utf-8")
2182
2183def _stripDoctype(data):
2184 """Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2185
2186 rss_version may be "rss091n" or None
2187 stripped_data is the same XML document, minus the DOCTYPE
2188 """
2189 entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2190 data = entity_pattern.sub('', data)
2191 doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2192 doctype_results = doctype_pattern.findall(data)
2193 doctype = doctype_results and doctype_results[0] or ''
2194 if doctype.lower().count('netscape'):
2195 version = 'rss091n'
2196 else:
2197 version = None
2198 data = doctype_pattern.sub('', data)
2199 return version, data
2200
2201def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2202 """Parse a feed from a URL, file, stream, or string"""
2203 result = FeedParserDict()
2204 result['feed'] = FeedParserDict()
2205 result['entries'] = []
2206 if _XML_AVAILABLE:
2207 result['bozo'] = 0
2208 if type(handlers) == types.InstanceType:
2209 handlers = [handlers]
2210 try:
2211 f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2212 data = f.read()
2213 except Exception, e:
2214 result['bozo'] = 1
2215 result['bozo_exception'] = e
2216 data = ''
2217 f = None
2218
2219
2220 if f and data and hasattr(f, "headers"):
2221 if gzip and f.headers.get('content-encoding', '') == 'gzip':
2222 try:
2223 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2224 except Exception, e:
2225
2226
2227
2228
2229 result['bozo'] = 1
2230 result['bozo_exception'] = e
2231 data = ''
2232 elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2233 try:
2234 data = zlib.decompress(data, -zlib.MAX_WBITS)
2235 except Exception, e:
2236 result['bozo'] = 1
2237 result['bozo_exception'] = e
2238 data = ''
2239
2240
2241 if hasattr(f, "info"):
2242 info = f.info()
2243 result["etag"] = info.getheader("ETag")
2244 last_modified = info.getheader("Last-Modified")
2245 if last_modified:
2246 result["modified"] = _parse_date(last_modified)
2247 if hasattr(f, "url"):
2248 result["url"] = f.url
2249 result["status"] = 200
2250 if hasattr(f, "status"):
2251 result["status"] = f.status
2252 if hasattr(f, "headers"):
2253 result["headers"] = f.headers.dict
2254 if hasattr(f, "close"):
2255 f.close()
2256
2257
2258
2259
2260
2261
2262 http_headers = result.get("headers", {})
2263 result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = _getCharacterEncoding(http_headers, data)
2265 if http_headers and (not acceptable_content_type):
2266 if http_headers.has_key('content-type'):
2267 bozo_message = '%s is not an XML media type' % http_headers['content-type']
2268 else:
2269 bozo_message = 'no Content-type specified'
2270 result['bozo'] = 1
2271 result['bozo_exception'] = NonXMLContentType(bozo_message)
2272
2273 result['version'], data = _stripDoctype(data)
2274
2275 baseuri = http_headers.get('content-location', result.get('url'))
2276 baselang = http_headers.get('content-language', None)
2277
2278
2279 if result.get("status", 0) == 304:
2280 result['version'] = ''
2281 result['debug_message'] = "The feed has not changed since you last checked, " + "so the server sent no data. This is a feature, not a bug!"
2283 return result
2284
2285
2286 if not data:
2287 return result
2288
2289
2290 use_strict_parser = 0
2291 known_encoding = 0
2292 tried_encodings = []
2293 for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding, 'utf-8', 'windows-1252'):
2294 if proposed_encoding in tried_encodings: continue
2295 if not proposed_encoding: continue
2296 try:
2297 data = _toUTF8(data, proposed_encoding)
2298 known_encoding = 1
2299 use_strict_parser = 1
2300 break
2301 except:
2302 pass
2303 tried_encodings.append(proposed_encoding)
2304 if not known_encoding:
2305 result['bozo'] = 1
2306 result['bozo_exception'] = CharacterEncodingUnknown( "document encoding unknown, I tried " + "%s, %s, utf-8, and windows-1252 but nothing worked" % (result['encoding'], xml_encoding))
2310 result['encoding'] = ''
2311 elif proposed_encoding != result['encoding']:
2312 result['bozo'] = 1
2313 result['bozo_exception'] = CharacterEncodingOverride( "documented declared as %s, but parsed as %s" % (result['encoding'], proposed_encoding))
2316 result['encoding'] = proposed_encoding
2317
2318 if not _XML_AVAILABLE:
2319 use_strict_parser = 0
2320 if use_strict_parser:
2321
2322 feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2323 saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2324 saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2325 saxparser.setContentHandler(feedparser)
2326 saxparser.setErrorHandler(feedparser)
2327 source = xml.sax.xmlreader.InputSource()
2328 source.setByteStream(_StringIO(data))
2329 if hasattr(saxparser, '_ns_stack'):
2330
2331
2332 saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2333 try:
2334 saxparser.parse(source)
2335 except Exception, e:
2336 if _debug:
2337 import traceback
2338 traceback.print_stack()
2339 traceback.print_exc()
2340 sys.stderr.write('xml parsing failed\n')
2341 result['bozo'] = 1
2342 result['bozo_exception'] = feedparser.exc or e
2343 use_strict_parser = 0
2344 if not use_strict_parser:
2345 feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2346 feedparser.feed(data)
2347 result['feed'] = feedparser.feeddata
2348 result['entries'] = feedparser.entries
2349 result['version'] = result['version'] or feedparser.version
2350 return result
2351
if __name__ == '__main__':
    # Command-line driver: with no arguments, print the module docstring
    # and exit; otherwise treat each argument as a feed URL.
    if not sys.argv[1:]:
        print __doc__
        sys.exit(0)
    else:
        urls = sys.argv[1:]
    # zopeCompatibilityHack() is defined elsewhere in this file;
    # presumably it adjusts parse() for Zope-style invocation — confirm
    # against its definition before relying on this.
    zopeCompatibilityHack()
    from pprint import pprint
    # parse each URL and pretty-print the full result dictionary
    for url in urls:
        print url
        print
        result = parse(url)
        pprint(result)
        print
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572