I want to create a simple (one page) web application using Django, and see the top 20 websites from alexa.com/topsites/global. The page should render a table with 21 rows (1 header and 20 websites) and 3 columns (rank, website and description).
My knowledge using django is limitted and I really need some help if possible.
I've used a template to create a table using some bootstrap but I actually don't have any idea on how to parse: rank / website name / and description.
Could anybody lead me in the right direction with some usefull websites / code snippets ?
I know that I have to use HTMLParser
and implement something like:
from HTMLParser import HTMLParser
# create a subclass and override the handler methods
class MyHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
print "Encountered a start tag:", tag
def handle_endtag(self, tag):
print "Encountered an end tag :", tag
def handle_data(self, data):
print "Encountered some data :", data
# instantiate the parser and fed it some HTML
parser = MyHTMLParser()
parser.feed('<html><head><title>Test</title></head>'
'<body><h1>Parse me!</h1></body></html>')
But I don't know how to use it on my requirements in my application.
So, I am comming back with an update. I've tried to do this (just to print the results to see if I get what I want) but I only get some links.
Any help ?
import urllib2, HTMLParser
class MyHTMLParser(HTMLParser.HTMLParser):
def reset(self):
HTMLParser.HTMLParser.reset(self)
#count div to get the rank of website
self.in_count_div = False
#description div to get description of website
self.in_description_div = False
#a tag to get the url
self.in_link_a = False
self.count_items = None
self.a_link_items = None
self.description_items = None
def handle_starttag(self, tag, attrs):
if tag == 'div':
if('class', 'count') in attrs:
self.in_count_div = True
if tag == 'a':
for name, value in attrs:
if name == 'href':
self.a_link_items = [value,'']
self.in_link_a = True
break
if tag == 'div':
if('class', 'description') in attrs:
self.in_description_div = True
#handle data for each section
def handle_data_count(self, data):
if self.in_count_div:
self.count_items[1] += data
def handle_data_url(self, data):
if self.in_link_a:
self.a_link_items[1] += data
def handle_data_description(self, data):
if self.in_description_div:
self.description_items[1] += data
#endtag
def handle_endtag(self, tag):
if tag =='div':
if self.count_items is not None:
print self.count_items
self.count_items = None
self.in_count_div = False
if tag =='a':
if self.a_link_items is not None:
print self.a_link_items
self.a_link_items = None
self.in_link_a = False
if __name__ == '__main__':
myhtml = MyHTMLParser()
myhtml.feed(urllib2.urlopen('http://www.alexa.com/topsites/global').read())