I am using an email-validation regex in a Google scraper to grab email addresses. The problem is that several emails are not being matched because they start with http://. I am not great at writing regexes, and this one is already very long. Here is the code I have so far:
# Pattern for e-mail-like strings, applied with re.finditer()/re.search().
# Structure: <local part> "@" <domain>, where
#   * the local part is either dot-separated runs of atom characters or a
#     double-quoted string, and
#   * the domain is either dot-separated host labels or a bracketed literal
#     such as [192.168.0.1].
# The domain group deliberately has only these two alternatives. An extra
# "[(http|ftp|https):\/\/...]" branch would be parsed as a character class
# (the square brackets are not escaped), not as a URL group, and would only
# let the domain match junk such as "a@_x.y]".
emailregex = r'''(?:[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-zA-Z0-9-]*[a-zA-Z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'''
# Search Google for `searchterm`, walk through every results page and collect
# each substring of the page source that matches `emailregex`, then print the
# collected matches as a numbered list.
# Uses `driver`, `searchterm`, `emailregex`, `re` and `By`, which must already
# be defined/imported earlier in the file.
driver.get("https://www.google.com")

# Locate elements through find_element(By..., ...), the same API the
# pagination lookup below uses, instead of the find_element_by_* helpers.
search = driver.find_element(By.XPATH, "//input[@name='q']")
search.send_keys(searchterm)
submit = driver.find_element(By.XPATH, "//input[@type='submit']")
# The click is issued through injected JavaScript rather than submit.click().
driver.execute_script("arguments[0].click();", submit)

# Matches from the first results page.
doc = driver.page_source
email_list = [re_match.group() for re_match in re.finditer(emailregex, doc)]

# Follow the "next page" link (element id "pnnext") until it can no longer be
# found or clicked.
# NOTE(review): page_source is read right after the click with no explicit
# wait; confirm the next page has actually loaded at that point.
while True:
    try:
        next_page = driver.find_element(By.ID, "pnnext")
        driver.execute_script("arguments[0].click();", next_page)
        doc = driver.page_source
    except Exception:
        # Best-effort stop: any driver error ends pagination, but unlike a
        # bare `except:` this lets KeyboardInterrupt/SystemExit propagate.
        break
    email_list.extend(
        re_match.group() for re_match in re.finditer(emailregex, doc)
    )

for i, email in enumerate(email_list):
    print(f'{i + 1}: {email}')