For practice, I'm writing a script that scrapes a music rating website and stores each album's name, artist, and rating in a SQLite database.
How do I prevent the same data from being duplicated in my table when I run the script multiple times?
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import urllib.error
import sqlite3
# Scrape the first page of Pitchfork album reviews and store each album title
# in the local SQLite database, without creating duplicates on repeated runs.
conn = sqlite3.connect('pitchscraper.sqlite')
cur = conn.cursor()

# Create the table on the first run; on later runs this is a no-op.
cur.execute('''
CREATE TABLE IF NOT EXISTS Albums (id INTEGER, rating INTEGER, name TEXT, url TEXT, artist TEXT)''')

# INSERT OR IGNORE only skips a row when inserting it would violate a
# constraint. The table above has none, so a UNIQUE index on `name` is what
# actually prevents duplicates. CREATE TABLE IF NOT EXISTS cannot add a
# constraint to a table that already exists, hence the separate index.
# Rows duplicated by earlier runs are removed first (keeping the oldest copy),
# because the unique index cannot be built while duplicates are present.
# NOTE: only `name` is inserted below, so uniqueness is on `name` alone; once
# the artist is stored too, a UNIQUE index on (name, artist) would be a
# better key, since different artists can share an album title.
cur.execute('''
DELETE FROM Albums
WHERE rowid NOT IN (SELECT MIN(rowid) FROM Albums GROUP BY name)''')
cur.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_albums_name ON Albums (name)')

# Open and read the page; the context manager closes the HTTP response.
req = Request('http://pitchfork.com/reviews/albums/?page=1', headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(req) as response:
    pitchpage = response.read()

# Parse with Beautiful Soup.
soup = BeautifulSoup(pitchpage, "lxml")
albums = soup('h2')
artists = soup.find_all(attrs={"class": "artist-list"})

print("ALBUMS")
for tag in albums:
    # Use the heading's plain text rather than its raw child nodes: children
    # may be Tag objects (which sqlite3 cannot bind), and stripping whitespace
    # makes the same title compare equal across runs.
    title = tag.get_text(" ", strip=True)
    if not title:
        continue
    print(title)
    # Silently skipped when the title is already present (unique index above).
    cur.execute('INSERT OR IGNORE INTO Albums (name) VALUES (?)', (title,))

# Persist the inserts; without a commit they are rolled back when the
# connection is closed.
conn.commit()