I am able to get the URLs on the search results page using the script below:
def get_source(url):
    """Return the HTTP response for *url*.

    Args:
        url (str): URL of the page to scrape.

    Returns:
        requests_html.HTMLResponse | None: the response object, or ``None``
        when the request failed (the error is printed, not raised).
    """
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        # Best-effort: report the failure but don't crash the caller.
        # Callers MUST handle a None return before touching .html etc.
        print(e)
        return None
def scrape_google(query, domains_only=False):
    """Scrape Google search results for *query*.

    Args:
        query (str): search term; it is URL-encoded before being sent.
        domains_only (bool): when True, return bare, deduplicated domain
            names (scheme and leading ``www.`` removed), preserving the
            order of first appearance. Defaults to False, keeping the
            original behavior of returning absolute URLs.

    Returns:
        list[str]: absolute result URLs, or bare domains when
        *domains_only* is True.
    """
    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)
    links = list(response.html.absolute_links)

    # Google's own navigation/cache/support links — not actual results.
    google_domains = (
        'https://www.google.',
        'https://google.',
        'https://webcache.googleusercontent.',
        'http://webcache.googleusercontent.',
        'https://policies.google.',
        'https://support.google.',
        'https://maps.google.',
        'https://play.google.',
    )
    # str.startswith accepts a tuple of prefixes; build a new list rather
    # than removing items while iterating.
    links = [url for url in links if not url.startswith(google_domains)]

    if domains_only:
        seen = set()
        domains = []
        for url in links:
            domain = urllib.parse.urlparse(url).netloc
            if domain.startswith('www.'):
                domain = domain[4:]
            # Dedupe while preserving first-seen order.
            if domain and domain not in seen:
                seen.add(domain)
                domains.append(domain)
        return domains
    return links
Now I want to get the plain domains — without the scheme (https://), the "www." prefix, or anything else — like below:
wiki.org
itroasters.com
I also need to remove any duplicates.
Could anyone please help me get the expected result?
Thanks