1

I am scraping the name, email, phone, and location of therapists from a website. I have scraped the data from the first page, but I am not able to paginate through the rest of the pages. I am using requests and BeautifulSoup.

The website is Here

The code for the first page is:

import requests
from bs4 import BeautifulSoup as bs

# Running tally of therapists printed so far.
count = 0

# Session cookies copied from a logged-in browser session.  The session id and
# anti-forgery token are server-issued and will expire — TODO: refresh these
# from a live GET instead of hard-coding them.
cookies = {
    'ASP.NET_SessionId': 'uij03wnehlax221msxy4jkno',
    '__RequestVerificationToken': 'ReASHPRKAhth_7S9C1U7qg7de4AxnkIdFxUt6yhMKTdWPHsZl_1vC-pJOJZ8fQwopOL56MS3yjVi1D6WhrKm2ZyKoNU1',
    'LoginGuid': '',
    '_ga': 'GA1.2.1257196513.1587105612',
    'Asi.Web.Browser.CookiesEnabled': 'true',
    'tltos': '1',
    '_gid': 'GA1.2.1385127198.1587230995',
    '__utmxst': '180',
}

# Headers mimicking the browser's XHR search request; X-Requested-With and
# X-MicrosoftAjax mark it as an ASP.NET partial (async) postback.
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '"Google Chrome 80"',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cache-Control': 'no-cache',
    'Sec-Fetch-Dest': 'empty',
    'X-Requested-With': 'XMLHttpRequest',
    'X-MicrosoftAjax': 'Delta=true',
    'Accept': '*/*',
    'Origin': 'https://www.therapistlocator.net',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001&name=',
    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
}


# Query-string parameters for the search: ZIP code plus an optional
# therapist name filter (empty here).
params = (
    ('zip', '10001'),
    ('name', ''),
)

# Form body for the ASP.NET WebForms async postback that fires the search.
# __EVENTTARGET / ctl01$ScriptManager1 name the "btnFilter" control being
# "clicked"; __VIEWSTATE and __RequestVerificationToken are page/session
# specific and must agree with the cookies above — presumably the server
# rejects the post otherwise (TODO confirm by fetching them fresh).
data = {
  'ctl01$ScriptManager1': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$rapLoadingPanel|ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$btnFilter',
  '__WPPS': 's',
  '__ClientContext': '{"baseUrl":"/","isAnonymous":true,"loggedInPartyId":"132791","selectedPartyId":"132791","websiteRoot":"http://www.therapistlocator.net/","virtualDir":""}',
  '__CTRLKEY': '',
  '__SHIFTKEY': '',
  'ctl01_ScriptManager1_TSM': '',
  'PageInstanceKey': '54d43052-a674-4b86-bebe-f3635b68db37',
  '__RequestVerificationToken': 'Q0PHslrV-Kffbpo7LCbjPe8RMOcT59p8PRLefKE93uc6G4hfz6Ewpjg_bCI3SV2MPNfGUd1VirBZ3igc1rB51IPZTvc1',
  'TemplateUserMessagesID': 'ctl01_TemplateUserMessages_ctl00_Messages',
  'PageIsDirty': 'false',
  'IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan': '1',
  'IsControlPostBackctl01$SearchField': '1',
  '__EVENTTARGET': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$btnFilter',
  '__EVENTARGUMENT': '',
  'NavMenuClientID': 'ctl01_Primary_NavMenu',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85': '1',
  'IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage1': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage2': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPage3': '1',
  'IsControlPostBackctl01$TemplateBody$ContentPageFooter1': '1',
  'IsControlPostBackctl01$FooterCopyright$FooterCopyright': '1',
  'IsControlPostBackctl01$FooterCopyright$tosol': '1',
  '__VIEWSTATE': '/wEPaA8FDzhkN2UyOWRmZGE0ZGQ4NxgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WBwUYY3RsMDEkTG9naW5TdGF0dXMxJGN0bDAxBRhjdGwwMSRMb2dpblN0YXR1czEkY3RsMDMFFWN0bDAxJFByaW1hcnkkTmF2TWVudQUUY3RsMDEkV2luZG93TWFuYWdlcjEFE2N0bDAxJEdlbmVyaWNXaW5kb3cFE2N0bDAxJE9iamVjdEJyb3dzZXIFGWN0bDAxJE9iamVjdEJyb3dzZXJEaWFsb2fx/JLd/+XByre34VShpvA4WynsKA==',
  '__VIEWSTATEGENERATOR': '37E773F2',
  'ctl01$lastClickedElementId': '',
  'ctl01$SearchField$SearchTerms': 'Keyword Search',
  'ctl01_Primary_NavMenu_ClientState': '',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtPOSTALCODE0': '10001',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlDISTANCE0': '5',
  'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtName_TL0': '',
  '__ASYNCPOST': 'true',
  '': ''
}

# Fire the search postback and scrape the first results page.
response = requests.post('https://www.therapistlocator.net/tl/therapist-finder.aspx', headers=headers, params=params, cookies=cookies, data=data)
# Fail loudly on a bad HTTP status instead of parsing an error page.
response.raise_for_status()

# BUG FIX: use response.text (decoded with the declared charset) rather than
# str(response.content), which yields the bytes *repr* ("b'...'") littered
# with literal backslash escapes — that was the reason the hrefs previously
# needed .replace('\\', '') / .replace("'", '') clean-up.
con = bs(response.text, 'lxml')

therapists = con.find('div', class_='QueryDisplayWrapper').find_all('div', class_='row')

for therapist in therapists:
    count += 1
    # Single lookup for the profile anchor (name text + relative href).
    profile_anchor = therapist.find('div', class_='item name').find('a')
    name = profile_anchor.text.strip()
    therapist_link = 'https://www.therapistlocator.net{}'.format(profile_anchor.get('href'))

    therapist_info = requests.get(therapist_link)
    if not therapist_info.ok:
        continue

    dataa = bs(therapist_info.text, 'lxml')

    try:
        email = dataa.find('a', class_='PanelField').text.strip()

        panels = dataa.find_all('div', class_='PanelFieldValue')
        # The address panel holds the street inside a <span> and the
        # city/state/zip after a <br>; re-join them separated by a space.
        location = panels[0].find('span').text.strip()
        loc1 = panels[0].find('br').next_sibling.strip()
        location = location.replace(loc1, ' {}'.format(loc1))

        phone = panels[1].find('span').text.strip()
    except (AttributeError, IndexError):
        # Profile page is missing one of the expected fields — skip it
        # (previously a bare `except: pass` that hid every error).
        continue

    print('\n*********** ' + str(count) + ' ************\n')
    print('Name: {}'.format(name))
    print('Email: {}'.format(email))
    print('Phone: {}'.format(phone))
    print('Location: {}'.format(location))

The rest of the pages seem to have the same URL, so I was not able to iterate through them all.

Each page has 25 entries, and I wish to get them all.

Sample Output for each entry:

Name: Marya B . Slater
Email: nycitytherapist@gmail.com
Phone: (646) 265-1555
Location: 360 W 34th St Apt 5P New York, NY  10001-2407
ahmadfaraz
  • 224
  • 1
  • 9

2 Answers2

1

Solution

The website uses JavaScript. When you click on the next page, it fires a JavaScript function to populate the results. You could use Selenium browser automation to programmatically access the other pages.

See these:

  1. https://selenium-python.readthedocs.io/getting-started.html
  2. https://selenium-python.readthedocs.io/
  3. https://selenium-python.readthedocs.io/navigating.html#interacting-with-the-page

Pagination with Selenium

Core Steps

You need to break down your problem into the following steps:

  1. Use Selenium (with python) BrowserAutomation to access your page.
  2. Get the total number of pages (see the page source: at the very end, it has a pagination section). Alternatively, you could use total_pages = total_results//max_results + 1 where max_results = 25 by default.

  3. For each page:

    1. Use BeautifulSoup to extract the data from the response object obtained using Selenium.

    2. Use selenium to click on the next page link

    3. Append the results in a dict or a list or to a pandas.DataFrame if you like.

CypherX
  • 7,019
  • 3
  • 25
  • 37
  • 1
    Sir, i don't have much command on seleniun. it will take me lot of time to do it in selenium. Can't it be done using requests and beautifulsoup? – ahmadfaraz Apr 18 '20 at 19:30
  • I am afraid there's no other way around it. At the least, BeautifulSoup will not suffice to interact with JavaScript-based actions. You could, however, still use BeautifulSoup if you can decode what the JavaScript functions are doing (in terms of what URL is being fired). But this tactic may not always work, and it could be an ad hoc solution which could break down later. Instead, using Selenium will make it a robust solution that you could showcase in the future as project work. – CypherX Apr 18 '20 at 19:44
1
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
import pandas as pd

# Form-field names whose values embed the paging target control; main()
# rewrites them between requests (FirstPage -> SecondPage -> Last) to
# simulate clicking the next-page link.
fish = ["ctl01$ScriptManager1", "ctl01$lastClickedElementId", "__EVENTTARGET"]

# ASP.NET WebForms async-postback body, initially targeting lnkFirstPage.
# The session-specific fields (__VIEWSTATE, PageInstanceKey, the ScriptManager
# TSM value, and the request-verification token) are filled in at runtime by
# main() from a fresh GET of the search page.
data = {
    'ctl01$ScriptManager1': 'ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$rapLoadingPanel|ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$lnkFirstPage',
    '__WPPS': 's',
    '__CTRLKEY': '',
    '__SHIFTKEY': '',
    'NavMenuClientID': 'ctl01_Primary_NavMenu',
    'IsControlPostBackctl01$TemplateBody$ContentPageFooter1': '1',
    'ctl01$lastClickedElementId': 'id|ctl01_TemplateBody_WebPartManager1_gwpciNewATSCustomQueryDisplayCommon_ciNewATSCustomQueryDisplayCommon_lnkFirstPage',
    'ctl01$SearchField$SearchTerms': 'Keyword Search',
    "ctl01_Primary_NavMenu_ClientState": "",
    # Search inputs: ZIP code, distance (miles), optional name filter,
    # and the page size (25 results per page).
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtPOSTALCODE0": "10001",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlDISTANCE0": "5",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$txtName_TL0": "",
    "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$ddlResultsPerPage": "25",
    "ctl01_GenericWindow_ClientState": "",
    "ctl01_ObjectBrowser_ClientState": "",
    "ctl01_ObjectBrowserDialog_ClientState": "",
    "ctl01_WindowManager1_ClientState": "",
    "__EVENTTARGET": "ctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon$lnkFirstPage",
    "__EVENTARGUMENT": "",
    "__LASTFOCUS": "",
    "__VIEWSTATEGENERATOR": "37E773F2",
    "__ClientContext": "{\"baseUrl\":\"/\",\"isAnonymous\":true,\"loggedInPartyId\":\"132791\",\"selectedPartyId\":\"132791\",\"websiteRoot\":\"http://www.therapistlocator.net/\",\"virtualDir\":\"\"}",
    "TemplateUserMessagesID": "ctl01_TemplateUserMessages_ctl00_Messages",
    "PageIsDirty": "false",
    "IsControlPostBackctl01$HeaderLogo$HeaderLogoSpan": "1",
    "IsControlPostBackctl01$SearchField": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciTitleandintro_9bb3191967f941e883b2c501791a2061$ciTitleandintro_9bb3191967f941e883b2c501791a2061": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciStyles_59e895c08d4f407aa0ada09911013fd2$ciStyles_59e895c08d4f407aa0ada09911013fd2": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSCustomQueryDisplayCommon$ciNewATSCustomQueryDisplayCommon": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401$ciScriptsDONOTREMOVE_f3cae45af58246d8b3f4953f13f8d401": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewContentHtml_0be4f96424fb47de90d1c22db2588e85$ciNewContentHtml_0be4f96424fb47de90d1c22db2588e85": "1",
    "IsControlPostBackctl01$TemplateBody$WebPartManager1$gwpciNewATSGeoCodingCommon$ciNewATSGeoCodingCommon": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage1": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage2": "1",
    "IsControlPostBackctl01$TemplateBody$ContentPage3": "1",
    "IsControlPostBackctl01$FooterCopyright$FooterCopyright": "1",
    "IsControlPostBackctl01$FooterCopyright$tosol": "1",
    "__ASYNCPOST": "true",
    "RadAJAXControlID": "ctl01_TemplateBody_WebPartManager1_gwpciNewATSCustomQueryDisplayCommon_ciNewATSCustomQueryDisplayCommon_rapLoading"
}

# Minimal browser-like headers; the Referer mirrors the search page URL.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0",
    "Referer": "https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001"
}


def main(url):
    """Scrape therapist profiles (name, address, phone, email) from the
    paginated search results and write them to data.csv.

    url: the search-results page URL (zip query string included). Pages are
    walked by replaying the ASP.NET postback with rewritten paging targets.
    """
    with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')

        # Harvest the per-session tokens the postback requires from the
        # freshly served page.
        data['ctl01_ScriptManager1_TSM'] = unquote(soup.select_one(
            "script[src*=Telerik]").get("src")).split("=", 3)[-1]
        data['__VIEWSTATE'] = soup.find("input", id="__VIEWSTATE").get("value")
        data['PageInstanceKey'] = re.search(
            'PageInstanceKey=(.+?)"', r.text).group(1)
        data['__RequestVerificationToken'] = soup.find(
            "input", id="__RequestVerificationToken").get("value")

        urls = []
        for num in range(1, 4):
            print(f"Extracting Links From Page {num}")

            r = req.post(url, data=data, headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')

            # url[:32] == 'https://www.therapistlocator.net' (scheme + host);
            # profile hrefs are site-relative.
            urls.extend(f'{url[:32]}{link.get("href")}'
                        for link in soup.select("a[href*=viewprofile]"))

            # Rewrite the paging target for the next request:
            # ...FirstPage -> ...SecondPage after page 1, then -> ...Last.
            suffix = "SecondPage" if num == 1 else "Last"
            for field in fish:
                data[field] = re.sub('(k)(.+)', r"\1" + suffix, data[field])

        print(f"Collected {len(urls)} Links")
        done = []
        for profile_url in urls:
            r = req.get(profile_url)
            soup = BeautifulSoup(r.content, 'html.parser')
            load = soup.select("div.PanelFieldValue")
            try:
                # Panel order observed on profile pages: address, phone, name.
                name = load[2].span.text
                add = load[0].span.text
                ph = load[1].span.text
            except (IndexError, AttributeError):
                # Malformed or incomplete profile page — skip it instead of
                # crashing the whole run (previously unguarded indexing).
            	continue
            try:
                em = soup.select_one("a.PanelField").text
            except AttributeError:
                # No email link on this profile (was a bare `except:`).
                em = "N/A"
            done.append([name, add, ph, em])

        df = pd.DataFrame.from_records(
            done, columns=["Name", "Address", "Phone", "Email"])
        print(df)
        df.to_csv("data.csv", index=False)


main("https://www.therapistlocator.net/tl/therapist-finder.aspx?zip=10001")

Output: view-online

enter image description here

  • Sir,, it works fine for the given zip code and address but will you please explain a bit, what if i search for another zip code and distance? what will i have to change in the code then? thanksss – ahmadfaraz Apr 21 '20 at 06:47