from twill.commands import *
from bs4 import BeautifulSoup
from urllib import urlopen
import urllib2
with open('urls.txt') as inf:
    urls = (line.strip() for line in inf)
    for url in urls:
        try:
            urllib2.urlopen(url)
        except urllib2.HTTPError, e:
            print e
        site = urlopen(url)
        soup = BeautifulSoup(site)
        for td in soup.find_all('td', {'class': 'subjectCell'}):
            print td.find('a').text
My code opens only a single page for each URL in the file, but some of them have more pages; in that case the pattern for the next pages is &page=x (see the sketch after the example links below).
Here are the pages I'm talking about:
http://www.last.fm/user/TheBladeRunner_/library/tags?tag=long+track
http://www.last.fm/user/TheBladeRunner_/library/tags?tag=long+track&page=7
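One way I could imagine handling the pagination is to keep appending &page=N to each base URL until a page comes back empty. This is only a sketch: it assumes an out-of-range page number returns a page with no td.subjectCell cells (I have not verified that against last.fm), and the helper name scrape_all_pages is just for illustration.

import urllib2
from bs4 import BeautifulSoup


def scrape_all_pages(base_url):
    # Walk the pages by appending &page=N to the base URL.
    # Assumption: a page number past the last page yields no
    # td.subjectCell cells, which ends the loop.
    page = 1
    while True:
        url = base_url if page == 1 else '%s&page=%d' % (base_url, page)
        try:
            site = urllib2.urlopen(url)
        except urllib2.HTTPError, e:
            print e
            break
        soup = BeautifulSoup(site)
        cells = soup.find_all('td', {'class': 'subjectCell'})
        if not cells:
            break  # no more results on this page, stop paging
        for td in cells:
            print td.find('a').text
        page += 1

with open('urls.txt') as inf:
    for line in inf:
        scrape_all_pages(line.strip())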