The pages are concatenated onto end of url. I would use xhr issues requests in a loop over given page range and regex out the json which contains the required info (it is in one of the script tags). This method is very fast and more than offsets use of regex. I also re-use objects where possible.
I use jsonconverter.bas to handle the json and parse out required info (there is a lot more info in the json including the reviews). After downloading the .bas and adding to a module called JsonConverter in your project, you need to go VBE > Tools > References > Add reference to Microsoft Scripting Runtime.
Helper functions are used to test whether page to write out to already exists or needs creating, as well as one to write out json results to an array and dump the array out in one go to sheet (efficiency gain). The structure is left so it is easy to extend the retrieved info if more info is desired e.g. the review.
There may be some work to do on ensuring works for pages that don't exist. I have simply used status code of response at present to filter these out.
NOTES:
As a sanity check I would use InternetExplorer to go to page 1 and extract the total results count. I would divide that by results per page (currently 30) to calculate total pages. This would give me lbound and ubound values (min and max for possible pages). Then switch to xmlhttp to actually retrieve. See additional helper function at end.
Code:
Option Explicit
Public Sub GetRestuarantInfo()
Dim s As String, re As Object, p As String, page As Long, r As String, json As Object
Const START_PAGE As Long = 2
Const END_PAGE As Long = 4
Const RESULTS_PER_PAGE As Long = 30
p = "\[{""@context"".*?\]"
Set re = CreateObject("VBScript.RegExp")
Application.ScreenUpdating = False
With CreateObject("MSXML2.XMLHTTP")
For page = START_PAGE To END_PAGE
.Open "GET", "https://www.yellowpages.com/atlanta-ga/restaurants?page=" & page, False
.send
If .Status = 200 Then
s = .responseText
r = GetValue(re, s, p)
If r <> "Not Found" Then
Set json = JsonConverter.ParseJson(r)
WriteOutResults page, RESULTS_PER_PAGE, json
End If
End If
Next
End With
Application.ScreenUpdating = True
End Sub
Public Sub WriteOutResults(ByVal page As Long, ByVal RESULTS_PER_PAGE As Long, ByVal json As Object)
Dim sheetName As String, results(), r As Long, headers(), ws As Worksheet
ReDim results(1 To RESULTS_PER_PAGE, 1 To 3)
sheetName = "page" & page
headers = Array("Name", "Website", "Tel")
If Not WorksheetExists(sheetName) Then
Set ws = ThisWorkbook.Worksheets.Add
ws.Name = sheetName
Else
ThisWorkbook.Worksheets(sheetName).Cells.ClearContents
End If
With ws
Dim review As Object
For Each review In json 'collection of dictionaries
r = r + 1
results(r, 1) = review("name")
results(r, 2) = review("url")
results(r, 3) = review("telephone")
Next
.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers
.Cells(2, 1).Resize(UBound(results, 1), UBound(results, 2)) = results
End With
End Sub
Public Function GetValue(ByVal re As Object, inputString As String, ByVal pattern As String) As String
'https://regex101.com/r/M9oRON/1
With re
.Global = True
.MultiLine = True
.IgnoreCase = False
.pattern = pattern
If .Test(inputString) Then
GetValue = .Execute(inputString)(0)
Else
GetValue = "Not found"
End If
End With
End Function
Public Function WorksheetExists(ByVal sName As String) As Boolean '@Rory https://stackoverflow.com/a/28473714/6241235
WorksheetExists = Evaluate("ISREF('" & sName & "'!A1)")
End Function
Helper function to return number of pages
'VBE > Tools > References: Microsoft Internet Controls
Public Function GetNumberOfPages(ByVal RESULTS_PER_PAGE As Long) As Variant
Dim ie As Object, totalResults As Long
On Error GoTo errhand
Set ie = CreateObject("InternetExplorer.Application")
With ie
.Visible = False
.Navigate2 "https://www.yellowpages.com/atlanta-ga/restaurants?page=1"
While .Busy Or .readyState < 4: DoEvents: Wend
With .document
totalResults = Replace$(Replace$(.querySelector(".pagination p").innerText, "We found", vbNullString), "results", vbNullString)
GetNumberOfPages = totalResults / RESULTS_PER_PAGE
ie.Quit
Exit Function
End With
End With
errhand:
If Err.Number <> 0 Then
GetNumberOfPages = CVErr(xlErrNA)
End If
End Function
Regex explanation:
Try it here.
