1

URL: https://www.youtube.com/@PW-Foundation/videos Write a Python program to extract the video URLs of the first five videos.

import requests
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import logging

youtube_search = "https://www.youtube.com/@PW-Foundation/videos"
url_search = urlopen(youtube_search)
youtube_page = url_search.read()
youtube_html = bs(youtube_page, "html.parser")
youtube_html.findAll('div', {'id':'contents'})

When I execute this, it returns an empty list.

I want an HTML source where I can find the URL of the first five videos.

Benjamin Loison

2 Answers

1
  • The data is present as a JSON string inside a script tag in the HTML, so you can extract and parse it with just BeautifulSoup and the standard library.
  • By default, that JSON string holds data for up to 30 YouTube videos, with all the information for each video.

Here's the way to extract the JSON data and process the video URLs:

import re
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen
import json

youtube_search = "https://www.youtube.com/@PW-Foundation/videos"

# Open the URL and read the content of the page
url_search = urlopen(youtube_search)
youtube_page = url_search.read()

# Parse the HTML content of the page using BeautifulSoup
youtube_html = bs(youtube_page, "html.parser")

# Define a regular expression pattern to extract the JSON data from the script tag
pattern = r'<script nonce="[-\w]+">\n\s+var ytInitialData = (.+)'
script_data = re.search(pattern=pattern, string=youtube_html.prettify())[1].replace(';', '')

# Load the JSON data into a Python dictionary
json_data = json.loads(script_data)

# Extract the list of videos from the JSON data and store it in the 'videos_container' variable
videos_container = json_data['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['richGridRenderer']['contents']

print(f"Total videos: {len(videos_container)-1}")

# Loop through the video list and print the URLs of the videos
for video in videos_container[:-1]:
    # print(video)
    video_id = video['richItemRenderer']['content']['videoRenderer']['videoId']
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    print(video_url)

Output:

Total videos: 30
https://www.youtube.com/watch?v=LuTONVLzESM
https://www.youtube.com/watch?v=KWXKegvNa-I
https://www.youtube.com/watch?v=dArUpCasmnE
https://www.youtube.com/watch?v=HqG2QchBw8Y
https://www.youtube.com/watch?v=1izKrQHyx9M
https://www.youtube.com/watch?v=jXAb1evxaJc
https://www.youtube.com/watch?v=2dn7XMxRtPE
https://www.youtube.com/watch?v=Fks4dVnTb5M
https://www.youtube.com/watch?v=nIuGXeISbSo
https://www.youtube.com/watch?v=L5G-0FbyAsc
https://www.youtube.com/watch?v=uqDX6hcRf2I
https://www.youtube.com/watch?v=9ZVfDuqKIQM
https://www.youtube.com/watch?v=1wMGzlQTyeM
https://www.youtube.com/watch?v=ivS0xPAbVUs
https://www.youtube.com/watch?v=UJb799ZLCwQ
https://www.youtube.com/watch?v=RPCHRtdO9hg
https://www.youtube.com/watch?v=iN2UWJW3lzo
https://www.youtube.com/watch?v=lRle7Jzciq8
https://www.youtube.com/watch?v=CPmcBN2xoxI
https://www.youtube.com/watch?v=mdZ4g2o7v9g
https://www.youtube.com/watch?v=z3ko4cUOYO0
https://www.youtube.com/watch?v=ZLgLCNKQwFw
https://www.youtube.com/watch?v=J7hFajBOmBo
https://www.youtube.com/watch?v=PXb-jcA2TGA
https://www.youtube.com/watch?v=LxHAzwur8cI
https://www.youtube.com/watch?v=sBXHecS1S1w
https://www.youtube.com/watch?v=l6ZY90YnMy0
https://www.youtube.com/watch?v=33onjejJLDs
https://www.youtube.com/watch?v=o3eOj-jhhfI
https://www.youtube.com/watch?v=ecGcmstmnGA
  • All the details pertaining to a video are available under the variable video in the loop above, and can be parsed/extracted in the same manner as video_url.
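Since the question asks for only the first five videos, you can slice videos_container instead of looping over all of it, and the same dictionary walk reaches other fields such as the title. Here is a minimal sketch on a synthetic stand-in for the parsed JSON (the 'title' → 'runs' path is an assumption about the videoRenderer layout; in real use, videos_container comes from ytInitialData as shown above):

```python
# Synthetic stand-in for videos_container (real data comes from ytInitialData).
# The 'title' -> 'runs' path is an assumed part of the videoRenderer layout.
videos_container = [
    {"richItemRenderer": {"content": {"videoRenderer": {
        "videoId": f"id{i}",
        "title": {"runs": [{"text": f"Video {i}"}]},
    }}}}
    for i in range(30)
] + [{"continuationItemRenderer": {}}]  # trailing entry is not a video

# Only the first five videos, per the original question
for video in videos_container[:5]:
    renderer = video["richItemRenderer"]["content"]["videoRenderer"]
    video_url = f"https://www.youtube.com/watch?v={renderer['videoId']}"
    title = renderer["title"]["runs"][0]["text"]
    print(title, video_url)
```

Slicing with [:5] also sidesteps the trailing continuation entry, since it only ever touches the first five items.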
Ajeet Verma
0

I would try an approach with Selenium, since YouTube renders these pages with JavaScript and I don't think it's possible to scrape the URLs with requests and bs4 alone.

You could use something like this:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()
# driver = webdriver.Chrome()  # If you would prefer to use Chrome


video_urls = []


def accept_cookies():
    try:
        elem = driver.find_element(By.XPATH, "/html/body/c-wiz/div/div/div/div[2]/div[1]/div[3]/div[1]/form[1]/div/div/button/span")
        elem.click()
        return True
    except NoSuchElementException:
        return False


def find_videos():
    print("test")
    try:
        # CODE THAT COPIES THE URLS

        return True
    except NoSuchElementException:
        return False


def activate_game():
    try:
        elem = driver.find_element(By.CLASS_NAME, "btn")
        elem.click()
        return True
    except NoSuchElementException:
        return False


def activate_scraping():
    driver.get("https://www.youtube.com/@NetworkChuck/videos")
    step = 0
    tries = 0
    while step < 2:
        if tries <= 5:  # 5 tries to accomplish the task
            tries += 1
            success = False
            match step:
                case 0:
                    success = accept_cookies()
                case 1:
                    success = find_videos()

            if success:
                step += 1
                tries = 0
            else:
                driver.implicitly_wait(2)  # wait 2 secs before retrying the current step
        else:
            return False
    assert "No results found." not in driver.page_source
    driver.close()
    return True


activate_scraping()

Note that I wrote the code so that it retries finding an element, which keeps it from crashing if your connection is slow; it also makes it easy to add steps.

You still need to write the code that copies the links, but if you dive a little into the Selenium docs you should be able to manage that: https://selenium-python.readthedocs.io/locating-elements.html
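One way to fill in the link-copying step: once Selenium has rendered the page, driver.page_source holds the full HTML, which you can hand to BeautifulSoup. The sketch below parses a synthetic fragment standing in for that rendered source; the a tag with id="video-title-link" is an assumption about YouTube's rendered markup and may change:

```python
from bs4 import BeautifulSoup

# Synthetic fragment standing in for driver.page_source after rendering.
# The id "video-title-link" is an assumption about YouTube's markup.
page_source = """
<div id="contents">
  <a id="video-title-link" href="/watch?v=abc123">First video</a>
  <a id="video-title-link" href="/watch?v=def456">Second video</a>
</div>
"""

soup = BeautifulSoup(page_source, "html.parser")
video_urls = [
    "https://www.youtube.com" + a["href"]
    for a in soup.find_all("a", id="video-title-link")[:5]  # first five only
]
print(video_urls)
```

In the scraper above, this would go inside find_videos, with page_source replaced by driver.page_source.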

Cube