I am scraping contact details such as Emails and Phone numbers using python 3 and beautiful soup, where the urls are found from the google search with the help of given keywords.
I have scraped emails correctly from the urls a but i am unable to scrape phone numbers accurately from the websites.
from bs4 import BeautifulSoup
import sys
import requests
import urllib.request
import pandas as pd
from urllib.request import urlopen,urlparse, Request,HTTPError
import re
import numpy as np
import csv
import json
def get_keyword(word):
try:
from google search import search
except ImportError:
print("No module named 'google' found")
# to search
query = word
url=[]
for j in search (query, tld ="co.uk", num=10, stop=1, pause=2): url.append(j)
return url, word
def scrape(req1, word):
req2=req1
req1 = Request(req1, headers={'User-Agent': 'Mozilla/5.0 Chrome/24.0.1312.27 Safari/537.17 '})
f = url open(req1)
s = f.read().decode('UTF-8')
reg = "((\+\d{1,3}(-| )?\(?\d\)?(-| )?\d{1,3})|(\(?\d{2,3}\)?))(-| )?(\d{3,4})(-| )?(\d{4})(( x| ext)\d{1,5}){0,1}"
phone = re. find all(reg, s)
emails = re. find all(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,3}",s) #Email regex
ph=[]
for i in phone:
g = list(filter(None, i))
g=''.join(g)
ph.append(g)
def Remove(duplicate):
final_list = []
for num in duplicate:
if num not in final_list:
final_list.append(num)
return final_list
k = Remove(ph)
df = pd.DataFrame(k, columns=['phone'])
df2 = pd.DataFrame(emails, columns=['email'])
df3 = pd.DataFrame([req2],columns=['url'])
new_df = df.join([df3,df2])
return new_df
if __name__ == '__main__':
df_new = pd.DataFrame(columns = ['email','url','phone'])
x, y=get_keyword("women entrepreneur")#keyword
print(x)
for i in x:
k = scrape(i, y) #i=links in the list of x which means list of url
df_new = pd.concat([df_new,k],ignore_index=True)
i want to get exact phone number from the website but i am actually getting many other numbers as an output. Example ("phone": "1761768436145") which is not an accurate phone number. If there is no number found it should come up as "no phone number found".