The main issue I am having is that I have a table which should not contain any duplicate entries, because I want it to hold a primary key that a separate table can reference. From what I understand about normalization, that is the better way to design the database. Right now, though, the table is full of duplicates: there should be only 6 unique rows, but there are 30, with each unique row repeated 5 times.
How should I fix this? Should I de-duplicate when importing the data, or by declaring the column UNIQUE? Note that when I tried the UNIQUE approach this time around, it gave me an error, since the data I am importing does indeed contain duplicate entries.
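For reference, this is roughly what I mean by the UNIQUE option, shown for the Divs table from my pipeline below (just a sketch, not code I have in the project; "Atlantic" is only an example value):

    import sqlite3 as lite

    con = lite.connect('test.db')
    cur = con.cursor()
    ## division is declared UNIQUE here, unlike in my pipeline below
    cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
        division TEXT UNIQUE)")
    ## a plain INSERT raises sqlite3.IntegrityError on the second "Atlantic";
    ## INSERT OR IGNORE silently skips the duplicate row instead
    cur.execute("INSERT OR IGNORE INTO Divs(division) VALUES(?)", ("Atlantic",))
    cur.execute("INSERT OR IGNORE INTO Divs(division) VALUES(?)", ("Atlantic",))  ## no error, no new row
    con.commit()
    con.close()

Is INSERT OR IGNORE the right way to deal with the error, or is it better practice to filter the duplicates out before they ever reach the database?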
EDIT:
This is what my items file looks like:
from scrapy.item import Item, Field

class TeamStats(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    team = Field()
    division = Field()
    rosterurl = Field()
    player_desc = Field()
    playerurl = Field()

class Player(Item):
    exp = Field()
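(Note that I am not using the Player item yet; for now the per-player rows in the spider are stored as plain dicts in player_desc.)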
This is what my spider looks like:
import scrapy
import string
import re
from scrapy.selector import HtmlXPathSelector  ## needed for xpath commands
from scrapy.shell import inspect_response  ## needed for the Response object
from nbastats.items import TeamStats, Player  ## needed to import the item classes

class NbastatsSpider(scrapy.Spider):
    name = "nbaStats"
    start_urls = [
        "http://espn.go.com/nba/teams"  ## only start_urls, no allowed_domains: had some issues when navigating to team roster pages
    ]

    def parse(self, response):
        for division in response.xpath('//div[@id="content"]//div[contains(@class, "mod-teams-list-medium")]'):
            for team in division.xpath('.//div[contains(@class, "mod-content")]//li'):
                item = TeamStats()
                item['division'] = division.xpath('.//div[contains(@class, "mod-header")]/h4/text()').extract()[0]
                item['team'] = team.xpath('.//h5/a/text()').extract()[0]
                item['rosterurl'] = "http://espn.go.com" + team.xpath('.//div/span[2]/a[3]/@href').extract()[0]
                print(item['rosterurl'])
                ## pass the partially filled item along to the roster page via meta
                request = scrapy.Request(item['rosterurl'], callback=self.parseWPNow)
                request.meta['play'] = item
                yield request

    def parseWPNow(self, response):
        item = response.meta['play']
        item = self.parseRoster(item, response)
        return item

    def parseRoster(self, item, response):
        players1 = []
        for players in response.xpath("//td[@class='sortcell']"):
            play = {}
            play['name'] = players.xpath("a/text()").extract()[0]
            play['position'] = players.xpath("following-sibling::td[1]").extract()[0]
            play['age'] = players.xpath("following-sibling::td[2]").extract()[0]
            play['height'] = players.xpath("following-sibling::td[3]").extract()[0]
            play['weight'] = players.xpath("following-sibling::td[4]").extract()[0]
            play['college'] = players.xpath("following-sibling::td[5]").extract()[0]
            play['salary'] = players.xpath("following-sibling::td[6]").extract()[0]
            players1.append(play)
        item['playerurl'] = response.xpath("//td[@class='sortcell']/a").extract()
        item['player_desc'] = players1
        return item
This is what my pipeline looks like:
import sqlite3 as lite  ## needed for the lite.connect() call below

class NbastatsPipeline(object):
    def __init__(self):
        self.setupDBCon()
        self.createTables()

    def setupDBCon(self):
        self.con = lite.connect('test.db')
        self.cur = self.con.cursor()

    def createTables(self):
        self.dropTeamsTable()
        self.dropPlayersTable()
        self.dropDivsTable()
        self.createDivsTable()  ## create Divs first so Teams can reference it
        self.createTeamsTable()
        self.createPlayersTable()

    def createTeamsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Teams(P_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            team TEXT, \
            DivId INTEGER, \
            FOREIGN KEY (DivId) REFERENCES Divs(Did) \
            )")

    def createDivsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            division TEXT)")

    def createPlayersTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Players(player_name TEXT, \
            salary TEXT, \
            weight INTEGER, \
            age INTEGER, \
            college TEXT)")

    def dropTeamsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Teams")

    def dropPlayersTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Players")

    def dropDivsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Divs")

    def closeDB(self):
        self.con.close()

    def __del__(self):
        self.closeDB()

    def process_item(self, item, spider):
        for key, value in item.items():
            if key == "division":
                print(item.get('division', ""))
                ## every TeamStats item carries its division, so this INSERT runs
                ## once per team: with 5 teams per division, each division lands
                ## in Divs 5 times, which is where the 30 rows come from
                self.cur.execute("INSERT INTO Divs( division ) VALUES(?)", (item.get('division', ""),))
                self.con.commit()
        # self.storeInDb(item)  ## use this line when ready to pass the whole item through to storeInDb
        return item
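And this is roughly what I mean by fixing it at import time instead: a sketch of process_item only (the rest of the class stays as above), where seen_divisions is an in-memory set I made up for illustration, not something from Scrapy:

    def __init__(self):
        self.setupDBCon()
        self.createTables()
        self.seen_divisions = set()  ## hypothetical cache of divisions already inserted

    def process_item(self, item, spider):
        division = item.get('division', "")
        ## only insert a division the first time an item carries it
        if division and division not in self.seen_divisions:
            self.seen_divisions.add(division)
            self.cur.execute("INSERT INTO Divs( division ) VALUES(?)", (division,))
            self.con.commit()
        return item

Which of the two approaches is considered the cleaner fix?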