I have some questions about web scraping with Selenium for Python. I scraped a table of Pokémon names and stats from pokemondb.net and saved the data into a pandas DataFrame in my Jupyter notebook. The problem is that it takes 2-3 minutes to scrape all the data, which seems like too time-consuming a process. Did I do a poor job of writing my scraping code? I scrape the table one column at a time, and I suspect this is one reason it is not as efficient as it could be. I've also added a rough single-pass idea after the code below. I would appreciate it if anyone could take a look and offer suggestions.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
driver = webdriver.Chrome(service=Service('drivers/chromedriver.exe')) # point Selenium at the local chromedriver executable
driver.get('https://pokemondb.net/pokedex/all') # get request - opens chrome browser and navigates to URL
driver.minimize_window() # minimize window
pokemon_id = []
pokemon_id_html = driver.find_elements(By.CLASS_NAME, 'infocard-cell-data') # retrieve the pokemon id column from pokemondb.net
for poke_id in pokemon_id_html:
    pokemon_id.append(poke_id.text)
pokemon_name = []
pokemon_name_html = driver.find_elements(By.CLASS_NAME, 'ent-name') # retrieve the pokemon name column
for name in pokemon_name_html:
    pokemon_name.append(name.text)
pokemon_type = []
pokemon_type_html = driver.find_elements(By.CLASS_NAME, 'cell-icon') # retrieve pokemon type
for p_type in pokemon_type_html:
    pokemon_type.append(p_type.text)
pokemon_total = []
pokemon_total_html = driver.find_elements(By.CLASS_NAME, 'cell-total') # retrieve pokemon total stats
for total in pokemon_total_html:
    pokemon_total.append(total.text)
pokemon_hp = []
pokemon_hp_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][1]") # retrieve pokemon hp stat
for hp in pokemon_hp_html:
    pokemon_hp.append(hp.text)
pokemon_attack = []
pokemon_attack_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][2]") # retrieve pokemon attack stat
for attack in pokemon_attack_html:
    pokemon_attack.append(attack.text)
pokemon_defense = []
pokemon_defense_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][3]") # retrieve pokemon defense stat
for defense in pokemon_defense_html:
    pokemon_defense.append(defense.text)
pokemon_special_attack = []
pokemon_special_attack_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][4]") # retrieve pokemon sp. attack stat
for special_attack in pokemon_special_attack_html:
    pokemon_special_attack.append(special_attack.text)
pokemon_special_defense = []
pokemon_special_defense_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][5]") # retrieve pokemon sp. defense stat
for special_defense in pokemon_special_defense_html:
    pokemon_special_defense.append(special_defense.text)
pokemon_speed = []
pokemon_speed_html = driver.find_elements(By.XPATH, "//*[@class='cell-num'][6]") # retrieve pokemon speed stat
for speed in pokemon_speed_html:
    pokemon_speed.append(speed.text)
driver.quit() # end the WebDriver session and close the browser
columns = ['id', 'name', 'type', 'total', 'hp', 'attack', 'defense', 'special-attack', 'special-defense', 'speed'] # column names (labels) for dataset
attributes = [pokemon_id, pokemon_name, pokemon_type, pokemon_total, pokemon_hp, pokemon_attack, pokemon_defense, pokemon_special_attack, pokemon_special_defense, pokemon_speed] # one list of scraped values per column of the dataset
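One idea I wanted to ask about: since the Pokédex looks like a plain static HTML <table>, could the whole thing be read in a single pass instead of column by column? Here is a rough, untested sketch of what I mean. It assumes the Pokédex grid is the first <table> that pandas.read_html finds on the page, and that a parser backend for read_html (lxml, or html5lib + beautifulsoup4) is installed.

from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service('drivers/chromedriver.exe'))
driver.get('https://pokemondb.net/pokedex/all')
html = driver.page_source # grab the rendered page once
driver.quit()

# read_html parses every <table> in the HTML into a DataFrame;
# assuming the Pokedex grid is the first (and only) table on the page
df = pd.read_html(StringIO(html))[0]
print(df.head())

Would something like this be considered better practice, or is there a way to speed up the per-column Selenium approach itself?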