import hashlib
import logging
import md5
import os
import re
import time
import urllib2

from feedfinder import getFeeds
from feedparser import parse
0005
# Matches duration strings such as "1d:2h:30m"; each of the day/hour/minute
# components (and the separating colons) is optional.
__timeRe = re.compile("((?P<days>\d+)d)?(:)?((?P<hours>\d+)h)?(:)?((?P<minutes>\d+)m)?")
0007
def sign(text):
    """Return the hex MD5 digest of *text* (encoded as UTF-8).

    Used to fingerprint feed items so previously-seen entries can be
    recognized in the on-disk cache.
    """
    # hashlib replaces the md5 module, which has been deprecated since
    # Python 2.5 and removed in Python 3; the digest is identical.
    return hashlib.md5(text.encode('utf-8')).hexdigest()
0010
def str2seconds(stri):
    """Convert a duration string like '1d:2h:30m' to seconds.

    Every component is optional (see __timeRe); a string matching no
    component at all yields 0.
    """
    days = hours = minutes = 0
    result = __timeRe.match(stri)
    if result:
        # Renamed from 'dict' to avoid shadowing the builtin.
        parts = result.groupdict()
        if parts['days']: days = int(parts['days'])
        if parts['hours']: hours = int(parts['hours'])
        if parts['minutes']: minutes = int(parts['minutes'])
    return (days * 86400) + (hours * 3600) + (minutes * 60)
0020
class NotFound(Exception):
    """Raised when a collection run turns up no new feed items."""
class FetchError(Exception):
    """Raised when fetching/discovering the feeds for a location fails."""
class ParseError(Exception):
    """Raised when a fetched RSS document cannot be parsed."""
class TooYoung(Exception):
    """Raised when a feed is polled again before its refresh interval
    has elapsed.
    """

    def __init__(self, aString):
        # Remaining wait, pre-formatted by the caller (e.g. "01h:30m").
        self.when = aString
0027
class FormatError(Exception):
    """Format-related error carrying a plain-string message."""

    def __init__(self, msg):
        # Normalize non-string inputs up front so __str__ can return
        # the stored value unconditionally.
        self.msg = str(msg)

    def __str__(self):
        return self.msg
0034
class FeedCollector:
    """Polls RSS feeds, remembers which items were already seen via an
    on-disk cache of MD5 signatures, and returns only fresh torrent URLs.

    Cache layout (one sub-directory per feed id under the configured
    cache directory):
      * 'timestamp'    -- epoch seconds of the last successful fetch attempt
      * 'currentFeeds' -- signatures present in the most recent fetch
      * one empty marker file per already-seen item, named by its signature
    """

    def __init__(self, iAg):
        # iAg is presumably the application/agent object; only its
        # getConfig() accessor is used here -- TODO confirm the exact
        # config contract against the caller.
        config = iAg.getConfig()
        self.cache = config.getCacheDir()
        self.blackList = config.getBlackListKeywords()
        self.whiteList = config.getWhiteListKeywords()
        self.logger = logging.getLogger("FmTorrent")

    def collect(self,id,location,timeInterval, filters, mungeTorrentFunc):
        """Fetch the RSS feed(s) found at `location` for the feed `id`
        and return the parsed feed dict whose 'entries' is replaced by
        a {name: torrent-url} mapping of fresh items.

        `timeInterval` is a duration string (see str2seconds).  Raises
        TooYoung when the interval has not elapsed since the last fetch,
        FetchError when feed discovery fails, and NotFound when no new
        items were collected.
        """
        secs = str2seconds(timeInterval)
        cache, cachePath = self._getCache(id)

        timeStamp = os.path.join(cachePath,'timestamp')
        currentTime = int(time.time())
        try:
            f = open(timeStamp)
            lastTime = int(f.read())
            f.close()
        except IOError, e:
            # No timestamp file yet: treat the feed as never fetched.
            lastTime = 0
        elapsed, result = (currentTime - lastTime), {}
        if elapsed >= secs:
            currTime = str(int(time.time()))
            self.logger.info('Looking for fresh items on feed [%s]' % id)

            try:
                feeds = getFeeds(location)
            except Exception,e:
                raise FetchError("Problem while fetching [%s]" % id + str(e))
            if feeds:
                # Record the attempt time only when discovery succeeded,
                # so a failed run is retried on the next call.
                f = open(timeStamp,'w')
                f.write(currTime)
                f.close()

                for feed in feeds:
                    # Stop at the first discovered feed that yields items.
                    if result != {}: break
                    result = self._process(id,feed, filters, mungeTorrentFunc)

        else:
            # Too early: drop stale signature files, then report how
            # long the caller still has to wait.
            self._purgeCache(id)
            when = time.strftime("%Hh:%Mm",time.gmtime(secs - elapsed))
            raise TooYoung, when
        if result == {}: raise NotFound, None
        return result

    def _getCache(self, id):
        """Return (list of existing cache entries, cache dir path) for
        `id`, creating the per-feed cache directory on first use."""
        # Spaces are stripped so the id is usable as a directory name.
        id = id.replace(' ','')

        locationCache = os.path.join(self.cache,id)
        if not os.path.exists(locationCache): os.mkdir(locationCache)
        return os.listdir(locationCache),locationCache

    def _purgeCache(self, id):
        """Purge the cache: every signature file except those listed in
        'currentFeeds' (and the bookkeeping files themselves) is deleted.
        """
        cache, cachePath = self._getCache(id)
        cache = filter(lambda x: x not in ['timestamp','currentFeeds'], cache)
        try:
            currentFeeds = open(os.path.join(cachePath,'currentFeeds'),'r')
        except:
            # Nothing recorded yet -- nothing to purge.
            return
        # One signature per line; strip the trailing newline.
        currentFeedsMD5 = [ line[:-1] for line in currentFeeds.readlines() ]
        toRemove = filter(lambda x: x not in currentFeedsMD5, cache)
        r = [ os.unlink(os.path.join(cachePath,i)) for i in toRemove]

    def _process(self, id, feed, filters, mungeTorrentFunc):
        """Parse `feed`, sign each item, record the signatures in the
        cache, and return the parsed dict with 'entries' replaced by a
        {name: torrent-url} mapping of items not seen before -- or {}
        when nothing new matched."""
        cache, cachePath = self._getCache(id)
        currentFeeds = open(os.path.join(cachePath,'currentFeeds'),'w')

        try:
            # NOTE(review): 'dict' shadows the builtin of the same name.
            dict, newItems = parse(feed), {}
        except Exception,e:
            # Keep a copy of the offending feed for post-mortem analysis.
            f = open('/tmp/%s' % id, 'w')
            f.write(feed)
            f.close()
            raise ParseError(u"Problem while parsing feed [%s]. RSS saved to /tmp/%s." % (id,id) + str(e))
        for item in dict['items']:
            if item.has_key('description'):
                signature = sign(item['description'])
            elif item.has_key('link'):
                signature = sign(item['link'])
            else:
                # Item cannot be fingerprinted: give up on this feed
                # entirely and return nothing.
                newItems = {}
                break

            signFile = os.path.join(cachePath,signature)
            # Every signature in the current fetch (old or new) is kept
            # so _purgeCache does not remove still-live markers.
            currentFeeds.write('%s\n' % signature)
            if signature not in cache:
                # Unseen item: remember it with an empty marker file.
                open(signFile,'w').close()
                cache.append(signature)

                # Only fresh items are matched against the keyword
                # lists and the torrent filters.
                if self._checkLists(item):
                    urls = self._getTorrentURL(item, filters, mungeTorrentFunc)
                    if urls:
                        newItems.update(urls)

        if newItems:
            # Replace the parsed entries with the fresh {name: url} map.
            dict['entries'] = newItems
        else: dict = {}

        currentFeeds.close()
        return dict

    def _checkLists(self, item):
        """Return True when `item` passes the keyword black/white lists.

        With no lists configured everything passes; blacklist words
        reject first, then whitelist words accept.
        """
        try:
            key = item['description']
        except:
            key = item['link']
        itemData = item['title'] + key

        if len(self.blackList) or len(self.whiteList):
            # NOTE(review): keywords are lowercased but itemData is not,
            # so matching is case-sensitive on the item text -- confirm
            # whether that is intended.
            for word in self.blackList:
                if itemData.find(word.lower()) > -1:
                    return False

            for word in self.whiteList:
                if itemData.find(word.lower()) > -1:
                    return True
            # NOTE(review): when only a blacklist is configured, items
            # matching no blacklisted word still fall through to this
            # `return False` and are rejected -- verify this is intended.
            return False
        else:
            return True

    def _applyFilters(self, name, filters):
        """Return a truthy value when `name` satisfies any filter.

        Filters may be callables (applied to `name`) or regex pattern
        strings (matched against `name`); the first hit wins.  The
        return value may be a match object, not strictly a bool.
        """
        ok = False
        for pattern in filters:
            if callable(pattern):
                ok = pattern(name)
            else:
                ok = re.match(pattern, name)
            if ok:
                break
        return ok

    def _getTorrentURL(self, item, filters, mungeTorrentFunc):
        """Extract {name: torrent-url} pairs from a single feed item.

        Enclosure URLs are taken as-is, named after the URL path's last
        component; otherwise the item's title or description must pass
        `filters` and the item link, run through `mungeTorrentFunc`, is
        used.  Returns {} when nothing usable was found.
        """
        urls = {}
        title = item.get('title','')
        desc = item.get('description','')
        enclosures = item.get('enclosures',[])
        link = item.get('link','')
        name = ''

        if enclosures:
            for enclosure in enclosures:
                url = enclosure.get('href','')
                if not url:
                    url = enclosure.get('url','')
                if url:
                    # Name the torrent after the URL path's basename.
                    name = os.path.basename(urllib2.urlparse.urlparse(url)[2])
                    urls.update({name: url})
        else:
            if self._applyFilters(title, filters):
                name = title
            elif self._applyFilters(desc, filters):
                name = desc

        if name and link:
            # NOTE(review): after the enclosure loop `name` still holds
            # the last enclosure's basename, so its URL gets overwritten
            # here with the munged item link -- confirm this is intended.
            urls.update({name: mungeTorrentFunc(link)})

        return urls