Web scraping HTML by ID

Question

I would like to scrap info from this site

How do I retrieve the 176 per the pic below ...

Here is the code I have tried:

Option Explicit

Sub HL_Sectors()

    Dim XMLPage As New MSXML2.XMLHTTP60
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim HTMLSector As MSHTML.IHTMLElement
    Dim HTMLSectorID As MSHTML.IHTMLElement
    Dim RowNum As Long, ColNum As Integer
    Dim HTMLSectorIDVal As MSHTML.IHTMLAttributeCollection
    Dim HTMLSectorValue As MSHTML.IHTMLElement

    XMLPage.Open "GET", "http://www.hl.co.uk/funds", False
    XMLPage.send
    HTMLDoc.body.innerHTML = XMLPage.responseText
    Debug.Print HTMLDoc.getElementById("fundSearch-detail").innerText

End Sub

Looking for advice on adding the value to each of the corresponding Sectors. See code below. I'm having difficulty retrieving the value

Option Explicit


Sub HL_Sectors()

Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim HTMLSector As MSHTML.IHTMLElement
Dim HTMLSectorID As MSHTML.IHTMLElement
Dim RowNum As Long, ColNum As Integer

XMLPage.Open "GET", "http://www.hl.co.uk/funds", False
XMLPage.send

HTMLDoc.body.innerHTML = XMLPage.responseText

Set HTMLSector = HTMLDoc.getElementById("search-sector")

Range("A:B").ClearContents

    RowNum = 1
    For Each HTMLSectorID In HTMLSector.getElementsByTagName("option")

        ColNum = 1
            Cells(RowNum, ColNum) = HTMLSectorID.getAttribute("value")
            ColNum = ColNum + 1
            Cells(RowNum, ColNum) = HTMLSectorID.innerText
        RowNum = RowNum + 1
    Next HTMLSectorID

End Sub

do some research? 'getElementByID' – Nathan_Sav Mar 12 '18 at 15:56 — Nathan_Sav, Mar 12 '18 at 15:56
thanks @Nathan_Sav I tried but failed. I'm getting nothing – ddd Mar 12 '18 at 15:58 — ddd, Mar 12 '18 at 15:58
Post what you've tried then – Nathan_Sav Mar 12 '18 at 16:00 — Nathan_Sav, Mar 12 '18 at 16:00
Sure here's is part VBA I was stepping through ... – ddd Mar 12 '18 at 16:01 — ddd, Mar 12 '18 at 16:01

omegastripes · Accepted Answer · 2018-03-12T17:35:20.637

Take a look at the below example:

Option Explicit

Sub Test()

    Dim sResponse As String
    Dim oOptions As Object
    Dim i As Long
    Dim vOption

    ' Retrieve search page web form HTML content
    XmlHttpRequest "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results", sResponse
    ' Extract options
    ExtractOptions sResponse, "sectorid", oOptions
    ' Prepare for output to first worksheet
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        ' Loop through each option
        i = 1
        For Each vOption In oOptions
            Do
                ' Retrieve search results for the option
                XmlHttpRequest "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results?sectorid=" & oOptions(vOption) & "&lo=2&filters=0%2C1%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C&page=1&tab=prices&dummy=" & Round(Rnd * 10000000000000#), sResponse
                DoEvents
            Loop Until InStr(sResponse, """totalResults"":""") > 0
            ' Extract total and output
            .Cells(i, 1) = oOptions(vOption)
            .Cells(i, 2) = vOption
            .Cells(i, 3) = Split(Split(sResponse, """totalResults"":""", 2)(1), """", 2)(0)
            .Columns.AutoFit
            i = i + 1
            DoEvents
        Next
    End With
    MsgBox "Completed"

End Sub

Sub XmlHttpRequest(sURL As String, sResp As String)

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", sURL, True
        .send
        Do While .readyState <> 4
            DoEvents
        Loop
        sResp = .responseText
    End With

End Sub

Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)

    Dim aTmp
    Dim vItem

    ' Escape RegEx special characters
    For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
        sName = Replace(sName, vItem, "\" & vItem)
    Next
    ' Extract the whole <select> for parameter
    ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp, False
    ' Extract each parameter <option>
    ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp(0)), aTmp, False
    ' Put each parameter and value into dictionary
    Set oOptions = CreateObject("Scripting.Dictionary")
    For Each vItem In aTmp
        oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
    Next

End Sub

Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bNestSubMatches = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)

    Dim oMatch
    Dim aTmp0()
    Dim sSubMatch

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = bGlobal
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                If bNestSubMatches Then
                    aTmp0 = Array()
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aTmp0, sSubMatch
                    Next
                    PushItem aData, aTmp0
                Else
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aData, sSubMatch
                    Next
                End If
            End If
        Next
    End With

End Sub

Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    ReDim Preserve aData(UBound(aData) + 1)
    aData(UBound(aData)) = vItem

End Sub

Function GetInnerText(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    GetInnerText = oDiv.innerText

End Function

The output for me is as follows:

Generally RegEx's aren't recommended for HTML parsing, so there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor.

BTW there are another answers using the similar approach: 1, 2, 3, 4, 5, 6.

thanks @omegastripes it retrieved the value alright, how may I add it to my current code for each Sector? Much appreciated — ddd, Mar 12 '18 at 16:15
please see pic I added showing output I would like to achieve - Thanks! — ddd, Mar 12 '18 at 16:32
WOW! Thanks for all your time on this @omegastripes much appreciated — ddd, Mar 12 '18 at 21:04

score 0 · Answer 2 · answered Mar 12 '18 at 19:19

When you choose to scrape any information from a webpage using XMLHTTP request, you should look for the desired content in the source code instead of inspected element. The thing is when you look for stuffs inspecting element, you will always find static content along with dynamically generated ones. As XMLHTTP request can't handle dynamic content, you could not fetch the value you were after. However, the best bet for you in this case to go for IE. Here is a demo using IE, which can get you the content.

Sub Scrape_Item()
    URL$ = "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results?sectorid=121&lo=2&filters=0%2C1%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C&page=1&tab=prices"
    Dim elem As Object

    With CreateObject("InternetExplorer.Application")
        .Visible = False
        .navigate URL
        While .Busy = True Or .ReadyState < 4: DoEvents: Wend

        Set elem = .Document.querySelector("#fundSearch-detail strong")
        [A1] = elem.innerText
    End With
End Sub

Output:

Btw, this is how the content (which is blank) looks like in the source code:

Hi @SIM Your code was running fine now I get "Run-time error '-2147467259 at the .... With CreateObject("InternetExplorer.Application") line in the code. Any ideas how I may resolve this? Have tried to Google it but with no joy. Much appreciate your help - thanks — ddd, Mar 13 '18 at 10:01

Web scraping HTML by ID

2 Answers2