from feedparser import parse
from feedfinder import getFeeds
import md5, os, time, re, urlparse
import logging

__timeRe = re.compile(r"((?P<days>\d+)d)?(:)?((?P<hours>\d+)h)?(:)?((?P<minutes>\d+)m)?")

def sign(text):
    """ Return the MD5 hex digest of the UTF-8 encoded text. """
    return md5.new(text.encode('utf-8')).hexdigest()
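
# e.g. sign(u'Some item description') -> a 32-character hex digest, used
# below as the per-item cache key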

def str2seconds(stri):
    """ Convert a 'XdYhZm'-style string (e.g. '1d:2h:30m') to seconds. """
    result = __timeRe.match(stri)
    days = hours = minutes = 0
    if result:
        parts = result.groupdict()
        if parts['days']:    days    = int(parts['days'])
        if parts['hours']:   hours   = int(parts['hours'])
        if parts['minutes']: minutes = int(parts['minutes'])
    return (days * 86400) + (hours * 3600) + (minutes * 60)
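
# Per the arithmetic above: str2seconds('1d:2h:30m') == 86400 + 7200 + 1800
# == 95400, str2seconds('45m') == 2700, and a string with no recognised
# unit falls through to 0.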

class NotFound(Exception): pass
class FetchError(Exception): pass
class ParseError(Exception): pass
class TooYoung(Exception):
    def __init__(self, aString):
        self.when = aString

class FormatError(Exception):
    def __init__(self, msg):
        self.msg = str(msg)

    def __str__(self):
        return self.msg

class FeedCollector:

    def __init__(self, iAg):
        config = iAg.getConfig()
        self.cache = config.getCacheDir()
        self.blackList = config.getBlackListKeywords()
        self.whiteList = config.getWhiteListKeywords()
        self.logger = logging.getLogger("FmTorrent")

    def collect(self, id, location, timeInterval, filters, mungeTorrentFunc):
        """ Fetch an RSS feed at the given location,
            identified by id, and return the newly
            found items as a dict. """
        secs = str2seconds(timeInterval)
        cache, cachePath = self._getCache(id)

        # refresh-rate checking
        timeStamp = os.path.join(cachePath, 'timestamp')
        currentTime = int(time.time())
        try:
            f = open(timeStamp)
            lastTime = int(f.read())
            f.close()
        except (IOError, ValueError):
            # missing or corrupt timestamp file: treat as never fetched
            lastTime = 0
        elapsed, result = (currentTime - lastTime), {}
        if elapsed >= secs:
            self.logger.info('Looking for fresh items on feed [%s]' % id)
            # look for some feeds
            try:
                feeds = getFeeds(location)
            except Exception, e:
                raise FetchError("Problem while fetching [%s]: %s" % (id, e))
            if feeds:
                # refresh the feed update time
                f = open(timeStamp, 'w')
                f.write(str(currentTime))
                f.close()
                # look for something to process
                for feed in feeds:
                    if result: break
                    result = self._process(id, feed, filters, mungeTorrentFunc)

        else:
            self._purgeCache(id)
            when = time.strftime("%Hh:%Mm", time.gmtime(secs - elapsed))
            raise TooYoung(when)
        if not result: raise NotFound()
        return result
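
    # Call sketch (hypothetical id, URL and munge function):
    #   collector = FeedCollector(iAg)
    #   try:
    #       items = collector.collect('example', 'http://example.com/feed',
    #                                 '1h:30m', [r'.*x264.*'], munge)
    #   except TooYoung, e:
    #       pass        # called again before '1h:30m' elapsed; e.when says when
    #   except NotFound:
    #       pass        # no feed produced new matching items
    #   # on success, items maps torrent names to (munged) torrent URLs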

    def _getCache(self, id):
        """ Little helper: return the list of cached signatures for the
            location identified by id, plus the cache directory path. """
        id = id.replace(' ', '')
        # each location has its own cache directory inside the
        # main cache directory (~/.iAggregator by default)
        locationCache = os.path.join(self.cache, id)
        if not os.path.exists(locationCache): os.mkdir(locationCache)
        return os.listdir(locationCache), locationCache
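
    # Cache layout per feed id (as maintained by the methods here):
    #   <cacheDir>/<id>/timestamp      -- epoch seconds of the last fetch
    #   <cacheDir>/<id>/currentFeeds   -- one MD5 signature per line
    #   <cacheDir>/<id>/<md5>          -- empty marker file per seen item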

    def _purgeCache(self, id):
        """ Purge the cache: every MD5 marker file (except those listed
            in the currentFeeds file) is deleted. """
        cache, cachePath = self._getCache(id)
        cache = [x for x in cache if x not in ('timestamp', 'currentFeeds')]
        try:
            currentFeeds = open(os.path.join(cachePath, 'currentFeeds'), 'r')
        except IOError:
            return
        currentFeedsMD5 = [line[:-1] for line in currentFeeds.readlines()]  # strip ending \n
        currentFeeds.close()
        for signature in cache:
            if signature not in currentFeedsMD5:
                os.unlink(os.path.join(cachePath, signature))

    def _process(self, id, feed, filters, mungeTorrentFunc):
        """ Process a feed by signing its items, registering them
            in the cache and collecting the new ones into an
            up-to-date result. """
        cache, cachePath = self._getCache(id)
        currentFeeds = open(os.path.join(cachePath, 'currentFeeds'), 'w')
        # walk over the feed items
        try:
            parsed, newItems = parse(feed), {}
        except Exception, e:
            f = open('/tmp/%s' % id, 'w')
            f.write(feed)
            f.close()
            raise ParseError(u"Problem while parsing feed [%s]: %s. RSS saved to /tmp/%s." % (id, e, id))
        for item in parsed['items']:
            if 'description' in item:
                signature = sign(item['description'])
            elif 'link' in item:
                signature = sign(item['link'])
            else:
                # the feed doesn't seem to be valid RSS;
                # let collect() try the next one
                newItems = {}
                break

            signFile = os.path.join(cachePath, signature)
            currentFeeds.write('%s\n' % signature)
            if signature not in cache:
                # add to the cache
                open(signFile, 'w').close()
                cache.append(signature)

                # check the {white,black}List
                if self._checkLists(item):
                    urls = self._getTorrentURL(item, filters, mungeTorrentFunc)
                    if urls:
                        newItems.update(urls)

        if newItems:
            # re-assign the up-to-date items
            parsed['entries'] = newItems
        else:
            parsed = {}

        currentFeeds.close()
        return parsed
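
    # Return-shape note: on success _process() hands back the feedparser
    # result with its 'entries' replaced by the {name: torrentURL} mapping
    # built by _getTorrentURL; {} means "nothing new, try the next feed".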

    def _checkLists(self, item):
        try:
            key = item['description']
        except KeyError:
            key = item['link']
        # lowercase the item data so the comparison against the
        # lowercased keywords below is really case-insensitive
        itemData = (item['title'] + key).lower()

        # a blackList hit always rejects the item
        for word in self.blackList:
            if itemData.find(word.lower()) > -1:
                return False
        # with a whiteList configured, only a whiteList hit accepts it;
        # otherwise everything not blacklisted passes
        if self.whiteList:
            for word in self.whiteList:
                if itemData.find(word.lower()) > -1:
                    return True
            return False
        return True
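
    # Filtering sketch, with hypothetical lists: given
    #   whiteList = ['linux'], blackList = ['cam']
    # an item titled 'Linux ISO' is accepted, 'Linux CAM rip' is rejected
    # (the blackList wins), and 'BSD ISO' is rejected (no whiteList hit).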

    def _applyFilters(self, name, filters):
        """ Return a true value if name satisfies at least one filter;
            filters may be regexp patterns or callables. """
        ok = False
        for pattern in filters:
            if callable(pattern):
                ok = pattern(name)
            else:
                ok = re.match(pattern, name)
            if ok:
                break
        return ok
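
    # Filters mix plain regexps and callables, e.g. (hypothetical):
    #   filters = [r'.*720p.*', lambda name: name.endswith('.torrent')]
    # _applyFilters('Show.S01E01.720p', filters) matches on the first
    # pattern; note that re.match() anchors at the start of the string.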

    def _getTorrentURL(self, item, filters, mungeTorrentFunc):
        urls = {}
        title = item.get('title', '')
        desc = item.get('description', '')
        enclosures = item.get('enclosures', [])
        link = item.get('link', '')
        name = ''

        if enclosures:
            for enclosure in enclosures:
                url = enclosure.get('href', '')
                if not url:
                    url = enclosure.get('url', '')
                if url:
                    # name the torrent after the last path component of its URL
                    name = os.path.basename(urlparse.urlparse(url)[2])
                    urls[name] = url
        else:
            if self._applyFilters(title, filters):
                name = title
            elif self._applyFilters(desc, filters):
                name = desc

            if name and link:
                urls[name] = mungeTorrentFunc(link)

        return urls
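
    # Enclosure sketch: a feed item carrying
    #   enclosures = [{'href': 'http://example.com/files/foo.torrent'}]
    # yields {'foo.torrent': 'http://example.com/files/foo.torrent'};
    # without enclosures, items are filtered on title or description and
    # the item link is passed through mungeTorrentFunc instead.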