0

I'm trying to scrape video titles from the link in the code.

Essentially, I want to scroll and scrape.

My code runs, but it scrapes half of the page, and instead of scraping the remaining half, repeats the first half.

import time
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Scroll a YouTube channel's video list to the bottom and print every video
# title exactly once.
#
# The page source must be re-read after every scroll: a BeautifulSoup object
# is a static snapshot of the HTML it was given, so parsing only once before
# the loop can never show videos that are loaded later by scrolling.
options = Options()
driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.youtube.com/user/OakDice/videos'
driver.get(url)
SCROLL_PAUSE_TIME = 2

# Number of titles already printed. This relies on newly loaded videos being
# appended after the existing ones in the DOM -- NOTE(review): confirm the
# page never removes or reorders earlier entries while scrolling.
printed_count = 0

# Get scroll height
last_height = driver.execute_script("return document.documentElement.scrollHeight")

while True:
    # Give the page time to settle, then scroll down to the current bottom
    time.sleep(SCROLL_PAUSE_TIME)
    driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
    # Wait for the next batch of videos to load
    time.sleep(SCROLL_PAUSE_TIME)

    # Take a fresh snapshot of the live DOM so newly loaded videos are seen
    content = driver.page_source.encode('utf-8').strip()
    soup = BeautifulSoup(content, 'lxml')
    titles = soup.findAll('a', id='video-title')

    # Print only the titles that were not present in the previous snapshot
    for title in titles[printed_count:]:
        print(title.text)
    printed_count = max(printed_count, len(titles))

    # Calculate new scroll height and compare with last scroll height;
    # an unchanged height means no further content was loaded
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
Void S
  • 752
  • 4
  • 14
  • As I said in your other question (https://stackoverflow.com/questions/65008223/how-to-get-views-from-youtube). I don't think you want to use driver.page_source – DMart Nov 25 '20 at 16:34

2 Answers

0

I think the issue is in the if statement: it's possible that new_height can artificially equal last_height. That's where I would look.

Spencer
  • 34
  • 5
0

I would simply find the element you want to scroll to and use move_to_element

https://www.geeksforgeeks.org/move_to_element-method-action-chains-in-selenium-python/

# Scroll the last loaded video into view so the page loads the next batch.
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# get all currently loaded video elements
videos = driver.find_elements(By.ID, "video-title")

# nothing to scroll to if no videos have been loaded yet
if videos:
    # the last element is the one closest to the bottom of the list
    element = videos[-1]

    # create action chain object
    action = ActionChains(driver)

    # queue the move, then perform() to actually execute it --
    # without perform() the chain only records the action and nothing happens
    action.move_to_element(element).perform()

That should be enough to trigger the infinite scroll on YouTube.

DMart
  • 2,401
  • 1
  • 14
  • 19