New to scrapy and python and running into an issue here.
I'm trying to get the entire list of PS3 games from Metacritic. Here is my code:
class MetacriticSpider(BaseSpider):
name = "metacritic"
allowed_domains = ["metacritic.com"]
max_id = 10
start_urls = [
"http://www.metacritic.com/browse/games/title/ps3?page="
#"http://www.metacritic.com/browse/games/title/xbox360?page=0"
]
def start_requests(self):
for c in lowercase:
for i in range(self.max_id):
yield Request('http://www.metacritic.com/browse/games/title/ps3/{0}?page={1}'.format(c, i), callback = self.parse)
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//div[@class="product_wrap"]/div')
items = []
for site in sites:
#item = MetacriticItem()
#titles = site.xpath('a/text()').extract()
titles = site.xpath('//div[contains(@class, "basic_stat product_title")]/a/text()').extract()
#cscore = site.xpath('//div[contains(@class, "basic_stat product_score brief_metascore")]/div[1]/text()').extract()
if titles:
item = MetacriticItem()
item['title'] = titles[0].strip()
items.append(item)
return items
For some reason when I check the JSON file, I have 81 instances of each title, and it is starting on Assassin's Creed: Revelations - Ancestors Character Pack
It should be starting on the first page which is numbered titles, then progressing to the A list, and checking each page in that etc. Any ideas on why it is doing it this way, I can't see what my problem is