3

I am trying to extract the links from a webpage. My code currently returns every link on the page, but I need only the links that contain watch?v=

import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import ast
import json
import os
from urllib.request import Request, urlopen
# SSL context with hostname checking and certificate verification switched
# off. Note: this context is not passed to urlopen() below.
ctx = ssl.create_default_context()
ctx.check_hostname = False  # must be cleared before verify_mode is set to CERT_NONE
ctx.verify_mode = ssl.CERT_NONE

# Page to scrape. Alternatives that were tried earlier:
#   url = input('Enter Youtube Video Url- ')
#   url = 'https://www.youtube.com/watch?v=MxnkDj8PIxQ'
url = 'https://www.youtube.com/feed/trending'

# Send a browser-like User-Agent header along with the request.
browser_headers = {'User-Agent': 'Mozilla/5.0'}
req = Request(url, headers=browser_headers)
webpage = urlopen(req).read()

# Parse the downloaded markup with the built-in HTML parser.
soup = BeautifulSoup(webpage, 'html.parser')
html = soup.prettify('utf-8')  # result is not used anywhere below

# Print the href of every <a> tag that carries one (no filtering yet).
anchors = soup.find_all('a', href=True)
for anchor in anchors:
    print ("Found the URL:", anchor['href'])

My Output

Found the URL: /watch?v=EJe3xxkzj5Y
Found the URL: /watch?v=Thf60JU8E98
Found the URL: /watch?v=Thf60JU8E98
Found the URL: /user/adityamusic
Found the URL: /channel/Muzik

My expected output should contain only links with watch?v=

Found the URL: /watch?v=EJe3xxkzj5Y
Found the URL: /watch?v=Thf60JU8E98
Bitto
  • 7,937
  • 1
  • 16
  • 38
  • [xpath is not supported in Beautifulsoup](https://stackoverflow.com/questions/11465555/can-we-use-xpath-with-beautifulsoup). I have edited the question to reflect your actual problem. – Bitto Sep 28 '19 at 12:58

2 Answers

1

You don't need a regular expression. You can use the following CSS selector.

url = 'https://www.youtube.com/feed/trending'

# Browser-like User-Agent header, as in the question's script.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Use the response as a context manager so the connection is closed as soon
# as the body has been read.
with urlopen(req) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, 'html.parser')
# CSS attribute selector: [href^="..."] matches <a> tags whose href
# starts with the given text.
for a in soup.select('a[href^="/watch?v="]'):
    print ("Found the URL:", a['href'])

Output:

Found the URL: /watch?v=NEAWC9eK1Ts
Found the URL: /watch?v=NEAWC9eK1Ts
Found the URL: /watch?v=xOGtIKE1Us8
Found the URL: /watch?v=xOGtIKE1Us8
Found the URL: /watch?v=i23NEQEFpgQ
Found the URL: /watch?v=i23NEQEFpgQ
Found the URL: /watch?v=cMqkXu4iQcU
Found the URL: /watch?v=cMqkXu4iQcU
Found the URL: /watch?v=vtiRzuH7miI
Found the URL: /watch?v=vtiRzuH7miI
Found the URL: /watch?v=28HABZJ358g
Found the URL: /watch?v=28HABZJ358g
Found the URL: /watch?v=lrzMFW2glIU
Found the URL: /watch?v=lrzMFW2glIU
Found the URL: /watch?v=nLCvijAhVLY
Found the URL: /watch?v=nLCvijAhVLY
Found the URL: /watch?v=VZiVePJCpZI
Found the URL: /watch?v=VZiVePJCpZI
Found the URL: /watch?v=gEBolPQc_EA
Found the URL: /watch?v=gEBolPQc_EA
Found the URL: /watch?v=ho_Mafw9UAk
Found the URL: /watch?v=ho_Mafw9UAk
Found the URL: /watch?v=bwOS7fxjS9E
Found the URL: /watch?v=bwOS7fxjS9E
Found the URL: /watch?v=mGD1RBhtJNg
Found the URL: /watch?v=mGD1RBhtJNg
Found the URL: /watch?v=84sHN6_MyMo
Found the URL: /watch?v=84sHN6_MyMo
Found the URL: /watch?v=waXb8QGdEYQ
Found the URL: /watch?v=waXb8QGdEYQ
Found the URL: /watch?v=kRAPxo59EbU
Found the URL: /watch?v=kRAPxo59EbU
Found the URL: /watch?v=hzmbCSHcSts
Found the URL: /watch?v=hzmbCSHcSts
Found the URL: /watch?v=AByj4Do85QM
Found the URL: /watch?v=AByj4Do85QM
Found the URL: /watch?v=s7u58Wd2H_Q
Found the URL: /watch?v=s7u58Wd2H_Q
Found the URL: /watch?v=dY2OeY5QEC4
Found the URL: /watch?v=dY2OeY5QEC4
Found the URL: /watch?v=V4XLiNRxoVM
Found the URL: /watch?v=V4XLiNRxoVM
Found the URL: /watch?v=6GlFZRXBQyg
Found the URL: /watch?v=6GlFZRXBQyg
Found the URL: /watch?v=OA-APVqZXYA
Found the URL: /watch?v=OA-APVqZXYA
Found the URL: /watch?v=6Kr9REM0JYQ
Found the URL: /watch?v=6Kr9REM0JYQ
Found the URL: /watch?v=sd5iLfPt0-o
Found the URL: /watch?v=sd5iLfPt0-o
Found the URL: /watch?v=nfcAHfDuNzw
Found the URL: /watch?v=nfcAHfDuNzw
Found the URL: /watch?v=FLTOiQ8gXp4
Found the URL: /watch?v=FLTOiQ8gXp4
Found the URL: /watch?v=ZOGxOQxXjdo
Found the URL: /watch?v=ZOGxOQxXjdo
Found the URL: /watch?v=Geyg_F5pfHE
Found the URL: /watch?v=Geyg_F5pfHE
Found the URL: /watch?v=4Kv_Gkz4wPc
Found the URL: /watch?v=4Kv_Gkz4wPc
Found the URL: /watch?v=FbtdKI_0Y5s
Found the URL: /watch?v=FbtdKI_0Y5s
Found the URL: /watch?v=fhMma6QzR3E
Found the URL: /watch?v=fhMma6QzR3E
Found the URL: /watch?v=NQEzIrC6bCs
Found the URL: /watch?v=NQEzIrC6bCs
Found the URL: /watch?v=nNhYqLbsAGk
Found the URL: /watch?v=nNhYqLbsAGk
Found the URL: /watch?v=iaQMT9Y3saM
Found the URL: /watch?v=iaQMT9Y3saM
Found the URL: /watch?v=v7Hu-14z-zQ
Found the URL: /watch?v=v7Hu-14z-zQ
Found the URL: /watch?v=RDb1MGsyY5I
Found the URL: /watch?v=RDb1MGsyY5I
Found the URL: /watch?v=KQetemT1sWc
Found the URL: /watch?v=KQetemT1sWc
Found the URL: /watch?v=ALimx-H8C6s
Found the URL: /watch?v=ALimx-H8C6s
Found the URL: /watch?v=3aUj5ilB0jw
Found the URL: /watch?v=3aUj5ilB0jw
Found the URL: /watch?v=eFBI8E1W6Vo
Found the URL: /watch?v=eFBI8E1W6Vo
Found the URL: /watch?v=iXtUX2kx6io
Found the URL: /watch?v=iXtUX2kx6io
Found the URL: /watch?v=BNgmYFwUjjw
Found the URL: /watch?v=BNgmYFwUjjw
Found the URL: /watch?v=XHmRJroAjrE
Found the URL: /watch?v=XHmRJroAjrE
Found the URL: /watch?v=XRiUNPf-_-4
Found the URL: /watch?v=XRiUNPf-_-4
Found the URL: /watch?v=uc-_KXfHcXQ
Found the URL: /watch?v=uc-_KXfHcXQ
Found the URL: /watch?v=BK7ojj5H72A
Found the URL: /watch?v=BK7ojj5H72A
Found the URL: /watch?v=Yv72aYbOEB0
Found the URL: /watch?v=Yv72aYbOEB0
Found the URL: /watch?v=il94Ke4E28s
Found the URL: /watch?v=il94Ke4E28s
Found the URL: /watch?v=aDZxEYmcCGo
Found the URL: /watch?v=aDZxEYmcCGo
Found the URL: /watch?v=T8ADlJtr4a0
Found the URL: /watch?v=T8ADlJtr4a0
Found the URL: /watch?v=d1010B3sKNQ
Found the URL: /watch?v=d1010B3sKNQ
Found the URL: /watch?v=PllHgkC3yPs
Found the URL: /watch?v=PllHgkC3yPs
Found the URL: /watch?v=1ei355BrtVo
Found the URL: /watch?v=1ei355BrtVo
Found the URL: /watch?v=ZywVlyogLYM
Found the URL: /watch?v=ZywVlyogLYM
Found the URL: /watch?v=1JLUn2DFW4w
Found the URL: /watch?v=1JLUn2DFW4w
Found the URL: /watch?v=aDrVrz76z1A
Found the URL: /watch?v=aDrVrz76z1A
Found the URL: /watch?v=syNaiMVEbJo
Found the URL: /watch?v=syNaiMVEbJo
Found the URL: /watch?v=avqRA3rmvrk
Found the URL: /watch?v=avqRA3rmvrk
Found the URL: /watch?v=II5UsqP2JAk
Found the URL: /watch?v=II5UsqP2JAk
Found the URL: /watch?v=-_ou2tKKA3U
Found the URL: /watch?v=-_ou2tKKA3U
Found the URL: /watch?v=_p_7yerGQq8
Found the URL: /watch?v=_p_7yerGQq8
Found the URL: /watch?v=bwzLiQZDw2I
Found the URL: /watch?v=bwzLiQZDw2I
Found the URL: /watch?v=ltNm4MdykBE
Found the URL: /watch?v=ltNm4MdykBE
Found the URL: /watch?v=UIL9CiUDHp0
Found the URL: /watch?v=UIL9CiUDHp0
Found the URL: /watch?v=t0_HF7tkGdA
and so on ...

To get the first 10 records.

# Slice the list of matching <a> tags down to its first ten entries.
first_ten = soup.select('a[href^="/watch?v="]')[:10]
for link in first_ten:
    print ("Found the URL:", link['href'])

If you want to get the last 10 records:

# Slice the list of matching <a> tags down to its last ten entries.
last_ten = soup.select('a[href^="/watch?v="]')[-10:]
for link in last_ten:
    print ("Found the URL:", link['href'])
KunduK
  • 32,888
  • 5
  • 17
  • 41
  • Is there any way to select only the first 10 URLs? I have upvoted your answer :) –  Sep 28 '19 at 15:51
0

You can pass a compiled regular expression as the href keyword argument to find_all:

# Raw string so the backslash in \? (a literal "?") reaches the regex engine
# unchanged; ^ anchors the match at the start of the href.
soup.find_all('a', href=re.compile(r'^/watch\?v='))

Code

import re
# Rest of your code ...

# Raw string: "\?" in a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions. The pattern matches hrefs
# that start with "/watch?v=".
watch_href = re.compile(r'^/watch\?v=')
for a in soup.find_all('a', href=watch_href):
    print ("Found the URL:", a['href'])
Bitto
  • 7,937
  • 1
  • 16
  • 38