1

I am scraping the name, email, phone, and location of therapists from a website. I have scraped the data from the first page, but I am not able to paginate through the rest of the pages. I am using requests and BeautifulSoup.

The website is Here

The code for the first page is:

import requests
from bs4 import BeautifulSoup as bs

# Running tally of therapists printed so far.
count = 0

# Session cookies copied from a logged-in browser session.  The session id and
# anti-forgery token are server-issued and will expire — TODO: refresh these
# from a live GET instead of hard-coding them.
cookies = {
    'ASP.NET_SessionId': 'uij03wnehlax221msxy4jkno',
    '__RequestVerificationToken': 'ReASHPRKAhth_7S9C1U7qg7de4AxnkIdFxUt6yhMKTdWPHsZl_1vC-pJOJZ8fQwopOL56MS3yjVi1D6WhrKm2ZyKoNU1',
    'LoginGuid': '',
    '_ga': 'GA1.2.1257196513.1587105612',
    'Asi.Web.Browser.CookiesEnabled': 'true',
    'tltos': '1',
    '_gid': 'GA1.2.1385127198.1587230995',
    '__utmxst': '180',
}

# Headers mimicking the browser's XHR search request; X-Requested-With and
# X-MicrosoftAjax mark it as an ASP.NET partial (async) postback.
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome 80"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cache-Control': 'no-cache',
    'Sec-Fetch-Dest': 'empty',
    'X-Requested-With': 'XMLHttpRequest',
    'X-MicrosoftAjax': 'Delta=true',
    'Accept': '*/*',
    'Origin': 'https://www.therapistlocator.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001&name=',
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
}


# Query-string parameters for the search: ZIP code plus an optional
# therapist name filter (empty here).
params = (
    ('zip', '10001'),
    ('name', ''),
)

# Form body for the ASP.NET WebForms async postback that fires the search.
# __EVENTTARGET / ctl01$ScriptManager1 name the "btnFilter" control being
# "clicked"; __VIEWSTATE and __RequestVerificationToken are page/session
# specific and must agree with the cookies above — presumably the server
# rejects the post otherwise (TODO confirm by fetching them fresh).
data = {
  'ctl01$ScriptManager1': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$rapLoadingPanel|ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$btnFilter',
  '__WPPS': 's',
  '__ClientContext': '{"baseUrl":"/","isAnonymous":true,"loggedInPartyId":"132791","selectedPartyId":"132791","websiteRoot":"http://www.therapistlocator.net/","virtualDir":""}',
  '__CTRLKEY': '',
  '__SHIFTKEY': '',
  'ctl01_ScriptManager1_TSM': '',
  'PageInstanceKey': '54d43052-a674-4b86-bebe-f3635b68db37',
  '__RequestVerificationToken': 'Q0PHslrV-Kffbpo7LCbjPe8RMOcT59p8PRLefKE93uc6G4hfz6Ewpjg_bCI3SV2MPNfGUd1VirBZ3igc1rB51IPZTvc1',
  'TemplateUserMessagesID': 'ctl01_TemplateUserMessages_ctl00_Messages',
  'PageIsDirty': 'false',
  'IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan': '1',
  'IsControlPostBackctl01$SearchField': '1',
  '__EVENTTARGET': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$btnFilter',
  '__EVENTARGUMENT': '',
  'NavMenuClientID': 'ctl01_Primary_NavMenu',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage1': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage2': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage3': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPageFooter1': '1',
  'IsControlPostBackctl01$FooterCopyright$FooterCopyright': '1',
  'IsControlPostBackctl01$FooterCopyright$tosol': '1',
  '__VIEWSTATE': '/wEPaA8FDzhkN2UyOWRmZGE0ZGQ4NxgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WBwUYY3RsMDEkTG9naW5TdGF0dXMxJGN0bDAxBRhjdGwwMSRMb2dpblN0YXR1czEkY3RsMDMFFWN0bDAxJFByaW1hcnkkTmF2TWVudQUUY3RsMDEkV2luZG93TWFuYWdlcjEFE2N0bDAxJEdlbmVyaWNXaW5kb3cFE2N0bDAxJE9iamVjdEJyb3dzZXIFGWN0bDAxJE9iamVjdEJyb3dzZXJEaWFsb2fx/JLd/+XByre34VShpvA4WynsKA==',
  '__VIEWSTATEGENERATOR': '37E773F2',
  'ctl01$lastClickedElementId': '',
  'ctl01$SearchField$SearchTerms': 'Keyword Search',
  'ctl01_Primary_NavMenu_ClientState': '',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtPOSTALCODE0': '10001',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlDISTANCE0': '5',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtName_TL0': '',
  '__ASYNCPOST': 'true',
  '': ''
}

# Fire the search postback and scrape the first results page.
response = requests.post('https://www.therapistlocator.net/tl/therapist-finder.aspx', headers=headers, params=params, cookies=cookies, data=data)
# Fail loudly on a bad HTTP status instead of parsing an error page.
response.raise_for_status()

# BUG FIX: use response.text (decoded with the declared charset) rather than
# str(response.content), which yields the bytes *repr* ("b'...'") littered
# with literal backslash escapes — that was the reason the hrefs previously
# needed .replace('\\', '') / .replace("'", '') clean-up.
con = bs(response.text, 'lxml')

therapists = con.find('div', class_='QueryDisplayWrapper').find_all('div', class_='row')

for therapist in therapists:
    count += 1
    # Single lookup for the profile anchor (name text + relative href).
    profile_anchor = therapist.find('div', class_='item name').find('a')
    name = profile_anchor.text.strip()
    therapist_link = 'https://www.therapistlocator.net{}'.format(profile_anchor.get('href'))

    therapist_info = requests.get(therapist_link)
    if not therapist_info.ok:
        continue

    dataa = bs(therapist_info.text, 'lxml')

    try:
        email = dataa.find('a', class_='PanelField').text.strip()

        panels = dataa.find_all('div', class_='PanelFieldValue')
        # The address panel holds the street inside a <span> and the
        # city/state/zip after a <br>; re-join them separated by a space.
        location = panels[0].find('span').text.strip()
        loc1 = panels[0].find('br').next_sibling.strip()
        location = location.replace(loc1, ' {}'.format(loc1))

        phone = panels[1].find('span').text.strip()
    except (AttributeError, IndexError):
        # Profile page is missing one of the expected fields — skip it
        # (previously a bare `except: pass` that hid every error).
        continue

    print('\n*********** ' + str(count) + ' ************\n')
    print('Name: {}'.format(name))
    print('Email: {}'.format(email))
    print('Phone: {}'.format(phone))
    print('Location: {}'.format(location))

The rest of the pages seem to have the same URL, so I was not able to iterate through them all.

Each page has 25 entries, and I wish to get them all.

Sample Output for each entry:

Name: Marya B . Slater
Email: nycitytherapist@gmail.com
Phone: (646) 265-1555
Location: 360 W 34th St Apt 5P New York, NY  10001-2407
ahmadfaraz
  • 224
  • 1
  • 9

2 Answers2

1

Solution

The website uses JavaScript. When you click on the next page, it fires a JavaScript function to populate the results. You could use Selenium browser automation to programmatically access the other pages.

See these:

  1. https://selenium-python.readthedocs.io/getting-started.html
  2. https://selenium-python.readthedocs.io/
  3. https://selenium-python.readthedocs.io/navigating.html#interacting-with-the-page

Pagination with Selenium

Core Steps

You need to break down your problem into the following steps:

  1. Use Selenium (with python) BrowserAutomation to access your page.
  2. Get the total number of pages (see the page source: at the very end, it has a pagination section). Alternatively, you could use total_pages = total_results//max_results + 1 where max_results = 25 by default.

  3. For each page:

    1. Use BeautifulSoup to extract the data from the response object obtained using Selenium.

    2. Use selenium to click on the next page link

    3. Append the results in a dict or a list or to a pandas.DataFrame if you like.

CypherX
  • 7,019
  • 3
  • 25
  • 37
  • 1
    Sir, i don't have much command on seleniun. it will take me lot of time to do it in selenium. Can't it be done using requests and beautifulsoup? – ahmadfaraz Apr 18 '20 at 19:30
  • I am afraid there's no other way around it. At the least, BeautifulSoup will not suffice to interact with JavaScript-based actions. You could, however, still use BeautifulSoup if you can decode what the JavaScript functions are doing (in terms of what URL is being fired). But this tactic may not always work, and it could be an ad hoc solution which could break down later. Instead, using Selenium will make it a robust solution that you could showcase in the future as project work. – CypherX Apr 18 '20 at 19:44
1
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
import pandas as pd

# Form-field names whose values embed the paging target control; main()
# rewrites them between requests (FirstPage -> SecondPage -> Last) to
# simulate clicking the next-page link.
fish = ["ctl01$ScriptManager1", "ctl01$lastClickedElementId", "__EVENTTARGET"]

# ASP.NET WebForms async-postback body, initially targeting lnkFirstPage.
# The session-specific fields (__VIEWSTATE, PageInstanceKey, the ScriptManager
# TSM value, and the request-verification token) are filled in at runtime by
# main() from a fresh GET of the search page.
data = {
    'ctl01$ScriptManager1': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$rapLoadingPanel|ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$lnkFirstPage',
    '__WPPS': 's',
    '__CTRLKEY': '',
    '__SHIFTKEY': '',
    'NavMenuClientID': 'ctl01_Primary_NavMenu',
    'IsControlPostBackctl01$TemplateBody$ContentPageFooter1': '1',
    'ctl01$lastClickedElementId': 'id|ctl01_TemplateBody_WebPartManager1_gwpciNewATSCustomQueryDisplayCommon_ciNewATSCustomQueryDisplayCommon_lnkFirstPage',
    'ctl01$SearchField$SearchTerms': 'Keyword Search',
    "ctl01_Primary_NavMenu_ClientState": "",
    # Search inputs: ZIP code, distance (miles), optional name filter,
    # and the page size (25 results per page).
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtPOSTALCODE0": "10001",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlDISTANCE0": "5",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtName_TL0": "",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlResultsPerPage": "25",
    "ctl01_GenericWindow_ClientState": "",
    "ctl01_ObjectBrowser_ClientState": "",
    "ctl01_ObjectBrowserDialog_ClientState": "",
    "ctl01_WindowManager1_ClientState": "",
    "__EVENTTARGET": "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$lnkFirstPage",
    "__EVENTARGUMENT": "",
    "__LASTFOCUS": "",
    "__VIEWSTATEGENERATOR": "37E773F2",
    "__ClientContext": "{\"baseUrl\":\"/\",\"isAnonymous\":true,\"loggedInPartyId\":\"132791\",\"selectedPartyId\":\"132791\",\"websiteRoot\":\"http://www.therapistlocator.net/\",\"virtualDir\":\"\"}",
    "TemplateUserMessagesID": "ctl01_TemplateUserMessages_ctl00_Messages",
    "PageIsDirty": "false",
    "IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan": "1",
    "IsControlPostBackctl01$SearchField": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage1": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage2": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage3": "1",
    "IsControlPostBackctl01$FooterCopyright$FooterCopyright": "1",
    "IsControlPostBackctl01$FooterCopyright$tosol": "1",
    "__ASYNCPOST": "true",
    "RadAJAXControlID": "ctl01_TemplateBody_WebPartManager1_gwpciNewATSCustomQueryDisplayCommon_ciNewATSCustomQueryDisplayCommon_rapLoading"
}

# Minimal browser-like headers; the Referer mirrors the search page URL.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Referer": "https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001"
}


def main(url):
    """Scrape therapist profiles (name, address, phone, email) from the
    paginated search results and write them to data.csv.

    url: the search-results page URL (zip query string included). Pages are
    walked by replaying the ASP.NET postback with rewritten paging targets.
    """
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')

        # Harvest the per-session tokens the postback requires from the
        # freshly served page.
        data['ctl01_ScriptManager1_TSM'] = unquote(soup.select_one(
            "script[src*=Telerik]").get("src")).split("=", 3)[-1]
        data['__VIEWSTATE'] = soup.find("input", id="__VIEWSTATE").get("value")
        data['PageInstanceKey'] = re.search(
            'PageInstanceKey=(.+?)"', r.text).group(1)
        data['__RequestVerificationToken'] = soup.find(
            "input", id="__RequestVerificationToken").get("value")

        urls = []
        for num in range(1, 4):
            print(f"Extracting Links From Page {num}")

            r = req.post(url, data=data, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')

            # url[:32] == 'https://www.therapistlocator.net' (scheme + host);
            # profile hrefs are site-relative.
            urls.extend(f'{url[:32]}{link.get("href")}'
                        for link in soup.select("a[href*=viewprofile]"))

            # Rewrite the paging target for the next request:
            # ...FirstPage -> ...SecondPage after page 1, then -> ...Last.
            suffix = "SecondPage" if num == 1 else "Last"
            for field in fish:
                data[field] = re.sub('(k)(.+)', r"\1" + suffix, data[field])

        print(f"Collected {len(urls)} Links")
        done = []
        for profile_url in urls:
            r = req.get(profile_url)
            soup = BeautifulSoup(r.content, 'html.parser')
            load = soup.select("div.PanelFieldValue")
            try:
                # Panel order observed on profile pages: address, phone, name.
                name = load[2].span.text
                add = load[0].span.text
                ph = load[1].span.text
            except (IndexError, AttributeError):
                # Malformed or incomplete profile page — skip it instead of
                # crashing the whole run (previously unguarded indexing).
            	continue
            try:
                em = soup.select_one("a.PanelField").text
            except AttributeError:
                # No email link on this profile (was a bare `except:`).
                em = "N/A"
            done.append([name, add, ph, em])

        df = pd.DataFrame.from_records(
            done, columns=["Name", "Address", "Phone", "Email"])
        print(df)
        df.to_csv("data.csv", index=False)


main("https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001")

Output: view-online

enter image description here

  • Sir,, it works fine for the given zip code and address but will you please explain a bit, what if i search for another zip code and distance? what will i have to change in the code then? thanksss – ahmadfaraz Apr 21 '20 at 06:47