Friday, October 23, 2009

feed finder in python

Here is a Python code snippet that finds the RSS/Atom feed links advertised by any web site...


import sys
from urllib2 import urlopen
from urlparse import urljoin
from HTMLParser import HTMLParser, HTMLParseError

class FeedAutodiscoveryParser(HTMLParser):
    """HTML parser that collects feed autodiscovery ``<link>`` elements.

    Pass the page markup to ``feed()``.  Afterwards ``feeds`` holds one dict
    per discovered feed link with the keys ``'type'``, ``'title'`` and
    ``'href'`` (``href`` as written in the page, so possibly relative), and
    ``base_href`` holds the URL that relative hrefs should be resolved
    against.
    """

    # These are the MIME types of links accepted as feeds
    FEED_TYPES = ('application/rss+xml',
                  'text/xml',
                  'application/atom+xml',
                  'application/x.atom+xml',
                  'application/x-atom+xml')

    def __init__(self, base_href):
        """Create a parser.

        base_href -- initial base URL; replaced if the page contains a
                     ``<base>`` element with a non-empty ``href``.
        """
        HTMLParser.__init__(self)
        self.base_href = base_href
        self.feeds = []

    def handle_starttag(self, tag, attrs_tup):
        """Record ``<base href>`` and feed-typed ``<link rel="alternate">`` tags."""
        tag = tag.lower()
        # Attributes written without a value (e.g. ``<link rel>``) arrive as
        # None; normalise them to "" so the string methods below are safe.
        attrs = dict((k.lower(), v or "") for k, v in attrs_tup)

        # An empty <base href> must not wipe out the URL we already have.
        if tag == "base" and attrs.get('href'):
            self.base_href = attrs['href']

        if tag == "link":
            # rel is a whitespace-separated, case-insensitive token list,
            # so "alternate feed" or "Alternate" must match as well.
            rel_tokens = attrs.get("rel", "").lower().split()
            # MIME types compare case-insensitively.
            mime_type = attrs.get("type", "").strip().lower()
            title = attrs.get("title", "")
            href = attrs.get("href", "").strip()
            # A link without an href points nowhere; recording it would later
            # resolve to the page URL itself and report a bogus feed.
            if ("alternate" in rel_tokens
                    and mime_type in self.FEED_TYPES
                    and href):
                self.feeds.append({
                    'type': mime_type,
                    'title': title,
                    'href': href,
                })
def getFeedsDetail(url):
    """Fetch *url* and return the feeds its HTML advertises.

    Returns a list of dicts with the keys ``'type'``, ``'title'`` and
    ``'href'``; each ``href`` is resolved against the parser's base URL
    (the page URL, or the page's ``<base href>`` when one is present).
    Errors raised by ``urlopen`` propagate to the caller.
    """
    response = urlopen(url)
    try:
        data = response.read()
    finally:
        # Close explicitly so the connection is released even if read()
        # fails; try/finally is used rather than ``with`` because the
        # Python 2 urllib2 response object is not a context manager.
        response.close()

    parser = FeedAutodiscoveryParser(url)
    try:
        parser.feed(data)
    except HTMLParseError:
        # Deliberate best effort: on malformed markup keep whatever feed
        # links were collected before the parser gave up.
        pass

    for feed in parser.feeds:
        feed['href'] = urljoin(parser.base_href, feed['href'])
    return parser.feeds
def getFeeds(url):
    """Return only the ``'href'`` of each feed reported by getFeedsDetail(url)."""
    hrefs = []
    for detail in getFeedsDetail(url):
        hrefs.append(detail['href'])
    return hrefs


def main():
    """Command-line entry point: print the feeds advertised by a site.

    Reads the site URL from the first command-line argument.  When it is
    missing, exits with a usage message instead of an IndexError traceback.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "minifeedfinder.py"
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Usage: %s <url>" % prog)

    url = sys.argv[1]
    feeds = getFeedsDetail(url)

    # Single-argument parenthesised prints produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("")
    print("Site %s : " % url)
    print("###########################################")
    print("")
    for feed in feeds:
        print("Title : '%(title)s' \nType : %(type)s \nURI : %(href)s" % feed)
        print("------------------------------------------------------------------------")
    print("")

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


Example usage:


F:\Python26>python minifeedfinder.py http://www.timesofindia.com/

Site http://www.timesofindia.com/ :
###########################################

Title : ''
Type : application/rss+xml
URI : http://www.timesofindia.com/rssfeedsdefault.cms
------------------------------------------------------------------------


F:\Python26>python minifeedfinder.py http://asitdhal.blogspot.com/

Site http://asitdhal.blogspot.com/ :
###########################################

Title : 'Life like this - Atom'
Type : application/atom+xml
URI : http://asitdhal.blogspot.com/feeds/posts/default
------------------------------------------------------------------------
Title : 'Life like this - RSS'
Type : application/rss+xml
URI : http://asitdhal.blogspot.com/feeds/posts/default?alt=rss
------------------------------------------------------------------------


The equivalent PHP code is available at the following link:
http://kodeyard.blogspot.com/2009/10/feed-finder-in-php.html

No comments:

Post a Comment