Some websites have anti-scraping mechanisms that involve detecting the webdriver
property of the browser. When you enable headless
mode for Chrome, this property is not set by the browser, thus indicating to websites that the origin of the request is a bot or a program.
You can try to execute JavaScript that sets the webdriver
property for your browser in headless mode.
However, please also note that this is just one of many mechanisms used by websites to detect bots or programs.
You may also check this answer
Here is sample code I wrote using the pyppeteer
library.
import asyncio
from pyppeteer import launch
# from pyvirtualdisplay import Display
from argparse import ArgumentParser
class HTMLRetriever(object):
    """Load a web page with headless pyppeteer and capture its title,
    HTML source, and a screenshot.

    Usage: ``r = HTMLRetriever(url); await r.load(); ...; await r.close()``.

    NOTE(review): the browser and page are stored on the *class* via
    classmethods, so concurrent instances share one browser session —
    confirm this is intended before using more than one instance at a time.
    """

    # Populated by load(); None until a page has been fetched.
    _page_source = None
    _title = None

    def __init__(self, url):
        self.url = url

    async def load(self):
        """Launch the browser, open a page, mask headless fingerprints,
        navigate to ``self.url`` and save a screenshot.

        BUG FIX: the original never called _init_webpage_properties(),
        so the navigator.webdriver spoofing — the whole point of this
        class — never took effect.
        """
        await self._init_browser()
        await self._init_webpage()
        await self._init_webpage_properties()
        await self._connect_website()
        await self._take_snapshot()

    @classmethod
    async def _init_display(cls):
        # NOTE(review): Display comes from the commented-out
        # pyvirtualdisplay import at the top of the file; calling this
        # raises NameError unless that import is restored. Kept for parity
        # with the original code — it is never called by load().
        cls.disp = Display(backend='xvfb')

    @classmethod
    async def _init_browser(cls):
        # Flags target containerized/CI environments: no sandbox,
        # constrained /dev/shm, single process. The explicit user agent
        # hides the default "HeadlessChrome" UA string.
        cls.browser = await launch(headless=True, args=[
            "--no-sandbox",
            "--single-process",
            "--disable-dev-shm-usage",
            "--no-zygote",
            '--user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"'
        ])

    @classmethod
    async def _init_webpage(cls):
        cls.page = await cls.browser.newPage()
        await asyncio.sleep(1)  # brief settle time after opening the tab
        await cls.page.setJavaScriptEnabled(True)

    @classmethod
    async def _init_webpage_properties(cls):
        # Mask common headless fingerprints: report webdriver=false and a
        # truthy window.chrome, as a regular Chrome session would.
        await cls.page.evaluate('''() =>{
            Object.defineProperties(navigator,{
                webdriver:{
                    get: () => false
                }
            })
        }''')
        await cls.page.evaluate('''() =>{
            Object.defineProperties(window,{
                chrome:{
                    get: () => true
                }
            })
        }''')

    async def _connect_website(self):
        await self.page.goto(self.url, {'waitUntil': 'networkidle2', 'timeout': 60000})
        await asyncio.sleep(6)  # extra wait for late JS-rendered content
        self._title = await self.page.evaluate('''() => {
            return document.title
        }''')
        self._page_source = await self.page.content()

    def _snapshot_name(self):
        """Return a filesystem-safe snapshot file name derived from the URL.

        BUG FIX: the original used ``url.strip('https://')`` —
        ``str.strip`` removes *characters* from both ends, not a prefix,
        so URLs beginning or ending with any of 'h t p s : /' were
        mangled. Remove the scheme prefix explicitly instead.
        """
        name = self.url
        for prefix in ('https://', 'http://'):
            if name.startswith(prefix):
                name = name[len(prefix):]
                break
        return name.replace('.', '_').replace('/', '-')

    async def _take_snapshot(self):
        # NOTE(review): assumes a "snapshots/" directory exists — confirm,
        # or the screenshot call will fail.
        await self.page.screenshot({'path': f"snapshots/{self._snapshot_name()}.png"})

    @property
    def page_source(self):
        """HTML source captured by load(); None before a page is loaded."""
        return self._page_source

    @property
    def title(self):
        """Document title captured by load(); None before a page is loaded."""
        return self._title

    async def close(self):
        await self.browser.close()
async def main():
    """CLI entry point: parse a --url argument, fetch the page with
    HTMLRetriever, print its title and close the browser.

    Fixes: 'not x is None' -> 'x is not None' idiom; dropped the
    redundant vars(args) indirection; corrected the 'HTMl' typo in the
    user-facing description.
    """
    parser = ArgumentParser(description='A tool to obtain HTML of a web URL')
    parser.add_argument('-u', '--url', dest='url', type=str, required=True, metavar='URL',
                        help='URL of the website for which HTML is to be retrieved')
    args = parser.parse_args()
    # argparse enforces required=True, but keep the defensive guard
    # the original had.
    if args.url is not None:
        retriever = HTMLRetriever(url=args.url)
        await retriever.load()
        print(retriever.title)
        await retriever.close()
if __name__ == '__main__':
    # asyncio.run() (3.7+) replaces the deprecated
    # get_event_loop().run_until_complete() pattern: calling
    # get_event_loop() with no running loop is deprecated since 3.10
    # and fails on newer Pythons; asyncio.run also guarantees loop cleanup.
    asyncio.run(main())