1. The data is generated via AJAX from an API response, so I use the API URL.
2. Yes, they are using Cloudflare protection.
3. They are sending cookies; that's why you sometimes get a 403.
example:
from bs4 import BeautifulSoup
import cloudscraper
# AJAX listing endpoint for the men's clothing category (format=ajax).
# NOTE(review): start/sz look like paging offset and page size (32 products
# per request) -- confirm against the site's own XHR calls before paging.
url = 'https://www.ralphlauren.co.uk/en/men/clothing/1020?start=0&sz=32&webcat=men-clothing&format=ajax'
def make_soup(url):
    """Fetch *url* with cloudscraper and return it parsed as BeautifulSoup.

    A new cloudscraper session is created on every call, the request is sent
    with browser-like headers (including a hard-coded cookie string), and the
    response object is printed so the HTTP status is visible on stdout.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``lxml`` parser. The status code is not checked, so an error page
        (e.g. a 403) is parsed and returned like any other response.
    """
    scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0'})
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        # NOTE(review): this cookie string looks like it was copied from a
        # browser session (timestamps inside it are from July 2022), so it has
        # presumably expired -- replace it with a fresh one if requests fail.
        # It is written as two adjacent literals because a single-quoted
        # string cannot contain a physical line break.
        'cookie': (
            '_pxhd=YTY3kBjCFGK9ngu3b36yYO2E-PXpKRlhxK6C8BTXlJRN/FSawTz2AzfVOR4pC1mFHRpymTRIEfP6A-t7uA1Mxg==:PtAcitIkXqtoeFEQX2KzgQO/oJLX4XTcRG/HjfVr8sbKA7ZsTUAIxOWn/Yy9El9lR4RkjJitloa6YDeoOLerdxTFiKoVAf7bIct1xU-6MgA=; dwac_fdf60f7bee356f94087278707e=z7OQw4OBWJYa1C_fw_lf1ZDLAgRsD5L44Ks%3D|dw-only|||GBP|false|Etc%2FGMT|true; cqcid=ackOlpsjfbi6Od7fcdT8Qa83k2; cquid=||; mt.v=5.314767789.1658169518444; sid=z7OQw4OBWJYa1C_fw_lf1ZDLAgRsD5L44Ks; dwanonymous_da52d7be668f7d7467cbce92e0d4fd7f=ackOlpsjfbi6Od7fcdT8Qa83k2; pzcookie="{\"pz_id\":\"\",\"EP_RID\":\"\",\"gender\":0}"; __cq_dnt=0; dw_dnt=0; dwsid=F7fqw3IG0Fu425Iml6t51UjPeASdiBo1QqQlqKXSj4ZVAPM6xpv_iqqzWvJYmqW6qB9jKqskviUEgRgSKhrQqg==; pxcts=da60ef98-06c8-11ed-a93a-72797943537a; _pxvid=d67b3308-06c8-11ed-9550-50577866414b; dw=1; dw_cookies_accepted=1; cookieConsentOW=true; dw_TLSWarning=false; headerDefault=1; ftr_ncd=6; __cq_uuid=ackOlpsjfbi6Od7fcdT8Qa83k2; __cq_seg=0~0.00!1~0.00!2~0.00!3~0.00!4~0.00!5~0.00!6~0.00!7~0.00!8~0.00!9~0.00; OptanonAlertBoxClosed=2022-07-18T18:38:56.295Z; pageNameDuplicate=en_GB%3Aplp%3Amen%3Aclothing; kndctr_F18502BE5329FB670A490D4C_AdobeOrg_cluster=irl1; kndctr_F18502BE5329FB670A490D4C_AdobeOrg_identity=CiYxNDExNDEzNzYyNTY2MjY4MDgzMDQ3MDUzMjA3NzQ3MDkyNzk1MFIPCLzu7pShMBgBKgRJUkwx8AG87u6UoTA=; AMCV_F18502BE5329FB670A490D4C%40AdobeOrg=MCMID|14114137625662680830470532077470927950; _px3=b4da323d0fa5ec03179b158c8d4b5eaf353f870f898365f3ed7c650493bb584e:pwoUpuSJhxr98r8gBXnpMYVeWC4xvTyEPJa3E4is/iqdkI0Tc82oirHZybA9f8l9icjOK6i9hgf5bhW0KHpUxA==:1000:1HnUFfuDb0U76VdVlX4t09n55dvtF2ExJhB8I6ZvAgR7fO2EdBpI0lc0KBzlJo8c9kw2EwPKKWW/EhsNXL48zi5Go2L5Lk87z1auE/r8gyxBj/Apt3QVfNrPmaST04IeVz/Iyty1qTwaouF+j9ryJ8iHGePAEMiIbAogpIFLnNXyxH7PYXvhktJlUmYsPqPZiZnKX1Z9djlosDZEpzhtVQ==; forterToken=4b3480a3418543d599e45a659cd90a1e_1658169601028__UDF43_6; '
            'OptanonConsent=isGpcEnabled=0&datestamp=Tue+Jul+19+2022+00%3A40%3A05+GMT%2B0600+(Bangladesh+Standard+Time)&version=6.34.0&isIABGlobal=false&hosts=&genVendors=&consentId=58dc5abe-e2bb-408b-aae0-2d153328b836&interactionCount=1&landingPath=NotLandingPage&groups=1%3A1%2C2%3A0%2C3%3A0%2C4%3A0&geolocation=BD%3BC&AwaitingReconsent=false'
        ),
    }
    req = scraper.get(url, headers=headers)
    page_soup = BeautifulSoup(req.text, 'lxml')
    # Echo the response object (e.g. "<Response [200]>") for each fetch.
    print(req)
    return page_soup
# Download the category listing and collect the href of every product
# thumbnail anchor on it.
browsing_page_soup = make_soup(url)
page_links = [
    anchor.get('href')
    for anchor in browsing_page_soup.select('a.thumb-link')
]

# Fetch each product page in listing order; the collected hrefs are appended
# to the site origin to form the request URL.
product_pages_soup = [
    make_soup('https://www.ralphlauren.co.uk' + link)
    for link in page_links
]
Output:
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
screenshot