0

I am scraping contact details such as emails and phone numbers using Python 3 and Beautiful Soup; the URLs come from a Google search for the given keywords.

I have scraped emails correctly from the URLs, but I am unable to scrape phone numbers accurately from the websites.

from bs4 import BeautifulSoup
import sys
import requests
import urllib.request
import pandas as pd
from urllib.request import urlopen,urlparse, Request,HTTPError
import re
import numpy as np
import csv
import json


def get_keyword(word):
    """Run a Google search for *word* and return the result URLs.

    Args:
        word: The search keyword(s).

    Returns:
        A ``(urls, word)`` tuple: the list of result URLs yielded by
        ``googlesearch.search`` and the keyword that was searched for.

    Raises:
        ImportError: If the ``googlesearch`` module is not installed.
    """
    try:
        from googlesearch import search
    except ImportError as err:
        # Fail here with a clear message; merely printing and carrying on
        # would only defer the failure to a NameError on `search` below.
        raise ImportError("No module named 'google' found") from err

    # NOTE(review): with stop=1 the search presumably ends after the first
    # result even though num=10 is requested per page -- confirm this is the
    # intended number of URLs.
    url = list(search(word, tld="co.uk", num=10, stop=1, pause=2))
    return url, word
# Phone number as ONE whole match: either a "+country" prefix or a 2-3 digit
# area code, then groups of 3-4 and 4 digits, optionally followed by an
# " x"/" ext" extension.  Every group is non-capturing so the match text is
# the number itself, and the (?<![\w+]) / (?!\w) guards stop the pattern from
# carving a "number" out of a longer digit run such as a timestamp or an id.
_PHONE_RE = re.compile(
    r"(?<![\w+])"
    r"(?:\+\d{1,3}[- ]?\(?\d\)?[- ]?\d{1,3}|\(?\d{2,3}\)?)"
    r"[- ]?\d{3,4}[- ]?\d{4}"
    r"(?:(?: x| ext)\d{1,5})?"
    r"(?!\w)"
)

# The top-level domain is not capped at three letters, so addresses ending in
# e.g. ".info" are returned whole instead of being cut to ".inf".
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")

# <script>/<style> bodies hold ids, timestamps and similar digit runs that are
# never contact details; they are blanked out before looking for phones.
_SCRIPT_STYLE_RE = re.compile(r"(?is)<(script|style)\b.*?</\1\s*>")


def _find_phones(html):
    """Return the distinct phone numbers in *html*, in order of appearance."""
    text = _SCRIPT_STYLE_RE.sub(" ", html)
    return list(dict.fromkeys(m.group(0) for m in _PHONE_RE.finditer(text)))


def _find_emails(html):
    """Return the distinct e-mail addresses in *html*, in order of appearance."""
    return list(dict.fromkeys(_EMAIL_RE.findall(html)))


def scrape(req1, word):
    """Download one page and collect the phone numbers and e-mails found in it.

    Args:
        req1: URL of the page to download.
        word: Search keyword that produced the URL.  Kept so existing callers
            keep working; it is not used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``phone``, ``url`` and
        ``email``.  It has one row per phone number or e-mail address
        (whichever list is longer, shorter columns are padded with NaN) and
        ``url`` is repeated on every row so each value stays attributable to
        its page.  When the page contains no phone number, the first ``phone``
        cell holds the text ``'no phone number found'``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched (``HTTPError``
            is a subclass).
    """
    request = Request(req1, headers={'User-Agent': 'Mozilla/5.0 Chrome/24.0.1312.27 Safari/537.17 '})
    # The context manager closes the connection; errors='replace' keeps a
    # page that is not valid UTF-8 from aborting the scrape.
    with urlopen(request) as response:
        html = response.read().decode('UTF-8', errors='replace')

    phones = _find_phones(html) or ['no phone number found']
    emails = _find_emails(html)
    n_rows = max(len(phones), len(emails))

    # concat(axis=1) aligns on the union of the row indexes, so a page with
    # more e-mails than phone numbers no longer loses the extra e-mails.
    return pd.concat(
        [
            pd.Series(phones, name='phone', dtype=object),
            pd.Series([req1] * n_rows, name='url', dtype=object),
            pd.Series(emails, name='email', dtype=object),
        ],
        axis=1,
    )

if __name__ == '__main__':
    # Search for the keyword, then scrape every result URL into one table.
    x, y = get_keyword("women entrepreneur")  # keyword
    print(x)

    frames = []
    for i in x:  # i = one URL from the search results
        try:
            frames.append(scrape(i, y))
        except OSError as err:
            # URLError/HTTPError and socket errors all derive from OSError;
            # one unreachable site should not abort the remaining URLs.
            print(f"skipping {i}: {err}", file=sys.stderr)

    columns = ['email', 'url', 'phone']
    if frames:
        # Concatenate once instead of growing an empty frame inside the loop.
        df_new = pd.concat(frames, ignore_index=True).reindex(columns=columns)
    else:
        df_new = pd.DataFrame(columns=columns)

I want to get the exact phone number from each website, but I am actually getting many other numbers in the output. For example, ("phone": "1761768436145") is not a valid phone number. If no number is found, the output should be "no phone number found".

Shah
  • 21
  • 1
  • 6

1 Answer

0

Match phone:

# Phone pattern taken from https://stackoverflow.com/a/3868861/15164646
# The single outer capturing group makes findall return whole numbers.
match_phone = re.findall(r'((?:\+\d{2}[-\.\s]??|\d{4}[-\.\s]??)?(?:\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}))', snippet)
# One number per line: joining on '' would fuse several matches into a single
# unreadable run of digits.
phone = '\n'.join(match_phone)

Match email:

# Every "name@domain.tld"-shaped token in the snippet.
match_email = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', snippet)
# One address per line: joining on '' would glue several addresses together.
email = '\n'.join(match_email)

Code and full example in the online IDE:

import requests, lxml, re
from bs4 import BeautifulSoup

# Sent with the request so it carries a desktop-browser User-Agent instead of
# the python-requests default.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Query-string parameters for the search request.
params = {
  'q': 'site:Facebook.com Dentist gmail.com',
  'hl': 'en',
  'gl': 'us'
}

# Compiled once, outside the result loop.
# Phone pattern taken from https://stackoverflow.com/a/3868861/15164646
email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
phone_pattern = re.compile(r'((?:\+\d{2}[-\.\s]??|\d{4}[-\.\s]??)?(?:\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}))')

# The timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() reports a failed or blocked request instead of silently
# parsing an error page.
response = requests.get('https://www.google.com/search',
                        headers=headers,
                        params=params,
                        timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

# NOTE(review): the CSS class names below are tied to Google's result markup
# and will need updating whenever that markup changes.
for result in soup.find_all('div', class_='tF2Cxc'):
    title_tag = result.select_one('.DKV0Md')
    link_tag = result.find('a')
    snippet_tag = result.select_one('.lyLwlc')
    # select_one()/find() return None when the element is absent; skip such
    # results instead of failing on `.text` / `['href']`.
    if title_tag is None or link_tag is None or snippet_tag is None:
        continue

    title = title_tag.text
    link = link_tag['href']
    snippet = snippet_tag.text

    # One value per line: joining on '' would fuse several matches together.
    match_email = email_pattern.findall(snippet)
    email = '\n'.join(match_email)

    match_phone = phone_pattern.findall(snippet)
    phone = '\n'.join(match_phone)

    print(f'{title}\n{link}\n{snippet}\n{email}\n{phone}\n')

--------
'''
Allenwood Dental - Home | Facebook
https://m.facebook.com/Allenwood-Dental-103141398249324/
Get Directions. Rating · 0. (0 reviews). 5 people checked in here. (734) 294-6003. allenwooddental@gmail.com. https://allenwooddental.com/. Closed Now.
allenwooddental@gmail.com
(734) 294-6003
...
'''

Alternatively, you can achieve the same thing by using the Google Organic Results API from SerpApi. It's a paid API with a free plan.

The main difference is that you don't have to maintain the parser over time when something in the HTML changes, and you don't have to work around blocks from Google, since that is already handled for the end user. All that really needs to be done is to iterate over the structured JSON and pick out what you are looking for.

Code to integrate:

from serpapi import GoogleSearch
import os, json, re

# Search parameters; the API key is read from the API_KEY environment variable.
params = {
  "engine": "google",
  "q": "site:Facebook.com Dentist gmail.com",
  "api_key": os.getenv('API_KEY')
}

search = GoogleSearch(params)
results = search.get_dict()

# Parsed results, one dict per organic result.
data = []

# .get() guards against a response without 'organic_results'
# (presumably the case for an error or an empty result set -- verify against
# the SerpApi response format).
for result in results.get('organic_results', []):
  title = result['title']
  link = result['link']
  # NOTE(review): a result may come without a snippet; treat it as empty text
  # rather than failing with KeyError.
  snippet = result.get('snippet', '')

  match_email = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', snippet)
  email = '\n'.join(match_email)

  match_phone = re.findall(r'((?:\+\d{2}[-\.\s]??|\d{4}[-\.\s]??)?(?:\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}))', snippet)
  # One number per line, matching how the e-mails are joined; '' would fuse
  # several numbers into a single run of digits.
  phone = '\n'.join(match_phone)

  data.append({'title': title, 'link': link, 'email': email, 'phone': phone})

  print(f'{title}\n{link}\n{snippet}\n{email}\n{phone}\n')

--------
'''
Dental Professionals of Coral Springs - Posts | Facebook
https://www.facebook.com/DentalProfessionalsCoralSprings/posts
TELEDENTESTRY NOW AVAILABLE!!!!! GIVE USA CALL TO LEARN MORE !!! ☎️ 954-255-5858. Dpofcoralsprings@gmail.com.
Dpofcoralsprings@gmail.com
954-255-5858
...
'''

Disclaimer, I work for SerpApi.

Dmitriy Zub
  • 1,398
  • 8
  • 35