0

This is my first web scraping task. I have been tasked with scraping a website

It is a site that contains the names of lawyers in Denmark. My difficulty is that I can only retrieve names based on the particular name query i put in the search bar. Is there an online web tool I can use to scrape all the names that the website contains? I have used tools like Import.io with no success so far. I am super confused on how all of this works.

Andreas
  • 2,455
  • 10
  • 21
  • 24
Hakeem Baba
  • 647
  • 1
  • 12
  • 32
  • If the name is not visible somewhere, and you can't search for everything, how could the tool find the data for you? Some part of it of course might be indexed by google, but I doubt that's any help – James Z Jan 30 '18 at 16:17

1 Answers1

1

Please scroll down to UPDATE 2

The website enforces you to enter at least one search parameter, so you may loop through all items for Arbejdsområde list, making request for each of them. Here is the example, showing how that could be done in Excel VBA (open VBE, create standard module, paste the code and run Test()):

Option Explicit

Sub Test()

    Dim sResponse As String
    Dim oItems As Object
    Dim vItem
    Dim aData
    Dim sContent As String
    Dim lPage As Long
    Dim i As Long
    Dim j As Long

    ' Retrieve search page HTML content
    XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
    ' Extract work areas items
    ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$AreaSelect", oItems
    oItems.Remove oItems.Keys()(0)
    sContent = ""
    ' Process each work area item
    For Each vItem In oItems.Items()
        Debug.Print "Item [" & vItem & "]"
        lPage = 0
        ' Process each results page
        Do
            Debug.Print vbTab & "Page [" & lPage & "]"
            ' Retrieve result page HTML content
            XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&a=" & vItem & "&p=" & lPage, "", "", "", sResponse
            ' Extract result table
            ParseResponse _
                "<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
                sResponse, _
                aData, _
                False
            ' Store parsed table
            sContent = sContent & aData(0)
            Debug.Print vbTab & "Parsed " & Len(sContent)
            lPage = lPage + 1
            DoEvents
        Loop Until InStr(sResponse, "<a class=""next""") = 0
    Next
    ' Extract data from the whole content
    ParseResponse _
        "<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
        "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
        "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
        "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
        "</tr>", _
        sContent, _
        aData, _
        False
    ' Rebuild nested arrays to 2d array for output
    aData = Denestify(aData)
    ' Decode HTML
    For i = 1 To UBound(aData, 1)
        For j = 2 To 4
            aData(i, j) = GetInnerText((aData(i, j)))
        Next
    Next
    ' Output
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        Output2DArray .Cells(1, 1), aData
        .Columns.AutoFit
        .Rows.AutoFit
    End With
    MsgBox "Completed"

End Sub

Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)

    Dim aHeader

    'With CreateObject("MSXML2.ServerXMLHTTP")
        '.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
    With CreateObject("MSXML2.XMLHTTP")
        .Open sMethod, sUrl, False ' , "u051772", "fy17janr"
        If IsArray(aSetHeaders) Then
            For Each aHeader In aSetHeaders
                .SetRequestHeader aHeader(0), aHeader(1)
            Next
        End If
        .Send (sFormData)
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With

End Sub

Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)

    Dim aTmp0
    Dim vItem

    ' Escape RegEx special characters
    For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
        sName = Replace(sName, vItem, "\" & vItem)
    Next
    ' Extract the whole <select> for parameter
    ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
    ' Extract each parameter <option>
    ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
    ' Put each parameter and value into dictionary
    Set oOptions = CreateObject("Scripting.Dictionary")
    For Each vItem In aTmp0
        oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
    Next

End Sub

Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)

    Dim oMatch
    Dim aTmp0()
    Dim sSubMatch

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = bGlobal
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                aTmp0 = Array()
                For Each sSubMatch In oMatch.SubMatches
                    PushItem aTmp0, sSubMatch
                Next
                PushItem aData, aTmp0
            End If
        Next
    End With

End Sub

Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    ReDim Preserve aData(UBound(aData) + 1)
    aData(UBound(aData)) = vItem

End Sub

Function GetInnerText(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    GetInnerText = oDiv.innerText

End Function

Function Denestify(aRows)

    Dim aData()
    Dim aItems()
    Dim i As Long
    Dim j As Long

    If UBound(aRows) = -1 Then Exit Function
    ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
    For j = 0 To UBound(aRows)
        If IsArray(aRows(j)) Then
            aItems = aRows(j)
            For i = 0 To UBound(aItems)
                If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                aData(j + 1, i + 1) = aItems(i)
            Next
        Else
            aData(j + 1, 1) = aRows(j)
        End If
    Next
    Denestify = aData

End Function

Sub Output2DArray(oDstRng As Range, aCells As Variant)

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = "@"
            .Value = aCells
        End With
    End With

End Sub

It takes few minutes to retrieve all data for the first time (after that when launched again all requests are loaded from the cache that makes process significantly faster, to get a latest data from the server you need to clean up the cache in IE settings). The output for me is as follows:

output

Generally RegEx's aren't recommended for HTML parsing, so there is disclaimer. Data being processed in this case is quite simple that is why it is parsed with RegEx. About RegEx: introduction (especially syntax), introduction JS, VB flavor.

BTW there are another answers using the similar approach: 1, 2, 3 and 4.

UPDATE

The above suggested scraping is based on parsing search results filtered by Arbejdsområde parameter, and as it turned out, actually returned results are inaccurate. Those lawyers, which have multiply Arbejdsområder are present multiply times in results, and which have empty Arbejdsområder are not in results at all.

Another parameter instead of Arbejdsområde, that can be used for such scraping is Retskreds. All lawyers records contain address, and only single address, so results are full and don't contain duplicates. Note, one lawyer can relate to several offices, so that will be several records in results.

There is the code that allows to scrape detailed info for each entry within loop:

Option Explicit

Sub Test()

    Dim sResponse As String
    Dim oItems As Object
    Dim vKey
    Dim sItem As String
    Dim aTmp
    Dim aData
    Dim lPage As Long
    Dim i As Long
    Dim j As Long

    ' Retrieve search page HTML content
    XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/", "", "", "", sResponse
    ' Extract Retskreds items
    ExtractOptions sResponse, "ctl00$ContentPlaceHolder$Search$CourtSelect", oItems
    oItems.Remove oItems.Keys()(0)
    i = 0
    ' Process each Retskreds item
    For Each vKey In oItems
        sItem = oItems(vKey)
        Debug.Print "Area " & sItem & " " & vKey
        lPage = 0
        ' Process each results page
        Do
            Debug.Print vbTab & "Page " & lPage
            ' Retrieve results page
            XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&c=" & sItem & "&p=" & lPage, "", "", "", sResponse
            ' Extract table
            ParseResponse _
                "<table\b[^>]*?id=""ctl00_ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
                sResponse, _
                aTmp, _
                False
            ' Extract data from the table
            ParseResponse _
                "<tr.*?onclick=""location.href='([^']*)'"">\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
                "</tr>", _
                aTmp(0), _
                aData, _
                True
            ' Add Retskreds name
            For i = i To UBound(aData)
                aTmp = aData(i)
                PushItem aTmp, vKey
                aData(i) = aTmp
            Next
            Debug.Print vbTab & "Parsed " & UBound(aData)
            lPage = lPage + 1
            DoEvents
        Loop Until InStr(sResponse, "<a class=""next""") = 0
    Next
    ' Retrieve detailed info for each entry
    For i = 0 To UBound(aData)
        aTmp = aData(i)
        ' Retrieve details page
        aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
        ' Extract details
        XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
        ParseResponse _
            DecodeUriComponent( _
                "Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
                "Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
                "F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?" & _
                "M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
                "M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
                "E-mail\: [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?" & _
                "Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
            sResponse, _
            aTmp, _
            True, _
            False
        aTmp(9) = StrReverse(aTmp(9))
        aData(i) = aTmp
        Debug.Print vbTab & "Details " & i
        DoEvents
    Next
    ' Rebuild nested arrays to 2d array for output
    aData = Denestify(aData)
    ' Decode HTML
    For i = 1 To UBound(aData, 1)
        For j = 2 To 4
            aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
        Next
    Next
    ' Output
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        OutputArray .Cells(1, 1), _
            Array("URL", _
                "Navn", _
                "Firma", _
                DecodeUriComponent("Arbejdsomr%C3%A5der"), _
                DecodeUriComponent("Retskreds"), _
                DecodeUriComponent("Beskikkelses%C3%A5r"), _
                DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
                DecodeUriComponent("M%C3%B8deret for landsret"), _
                DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
                "E-mail", _
                "Mobiltlf." _
            )
        Output2DArray .Cells(2, 1), aData
        .Columns.AutoFit
        .Rows.AutoFit
    End With
    MsgBox "Completed"

End Sub

Sub XmlHttpRequest(sMethod, sUrl, aSetHeaders, sFormData, sRespHeaders, sRespText)

    Dim aHeader

    'With CreateObject("MSXML2.ServerXMLHTTP")
        '.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
    With CreateObject("MSXML2.XMLHTTP")
        .Open sMethod, sUrl, False
        If IsArray(aSetHeaders) Then
            For Each aHeader In aSetHeaders
                .SetRequestHeader aHeader(0), aHeader(1)
            Next
        End If
        .Send (sFormData)
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With

End Sub

Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)

    Dim aTmp0
    Dim vItem

    ' Escape RegEx special characters
    For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
        sName = Replace(sName, vItem, "\" & vItem)
    Next
    ' Extract the whole <select> for parameter
    ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp0, False
    ' Extract each parameter <option>
    ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp0(0)), aTmp0, False
    ' Put each parameter and value into dictionary
    Set oOptions = CreateObject("Scripting.Dictionary")
    For Each vItem In aTmp0
        oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
    Next

End Sub

Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bNestSubMatches = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)

    Dim oMatch
    Dim aTmp0()
    Dim sSubMatch

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = bGlobal
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                If bNestSubMatches Then
                    aTmp0 = Array()
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aTmp0, sSubMatch
                    Next
                    PushItem aData, aTmp0
                Else
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aData, sSubMatch
                    Next
                End If
            End If
        Next
    End With

End Sub

Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    ReDim Preserve aData(UBound(aData) + 1)
    aData(UBound(aData)) = vItem

End Sub

Function DecodeUriComponent(sEncoded As String) As String

    Static objHtmlfile As Object

    If objHtmlfile Is Nothing Then
        Set objHtmlfile = CreateObject("htmlfile")
        objHtmlfile.parentWindow.execScript "function decode(s) {return decodeURIComponent(s)}", "jscript"
    End If
    DecodeUriComponent = objHtmlfile.parentWindow.decode(sEncoded)

End Function

Function GetInnerText(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    GetInnerText = oDiv.innerText

End Function

Function Denestify(aRows)

    Dim aData()
    Dim aItems()
    Dim i As Long
    Dim j As Long

    If UBound(aRows) = -1 Then Exit Function
    ReDim aData(1 To UBound(aRows) + 1, 1 To 1)
    For j = 0 To UBound(aRows)
        If IsArray(aRows(j)) Then
            aItems = aRows(j)
            For i = 0 To UBound(aItems)
                If i + 1 > UBound(aData, 2) Then ReDim Preserve aData(1 To UBound(aRows) + 1, 1 To i + 1)
                aData(j + 1, i + 1) = aItems(i)
            Next
        Else
            aData(j + 1, 1) = aRows(j)
        End If
    Next
    Denestify = aData

End Function

Sub OutputArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")

    With oDstRng
        .Parent.Select
        With .Resize(1, UBound(aCells) - LBound(aCells) + 1)
            .NumberFormat = sFormat
            .Value = aCells
        End With
    End With

End Sub

Sub Output2DArray(oDstRng As Range, aCells As Variant, Optional sFormat As String = "@")

    With oDstRng
        .Parent.Select
        With .Resize( _
                UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                UBound(aCells, 2) - LBound(aCells, 2) + 1)
            .NumberFormat = sFormat
            .Value = aCells
        End With
    End With

End Sub

There are 4896 entries total for 4689 lawyers:

output

UPDATE 2

Seems to get complete list you may just make search with set (space) as Firma parameter: http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20, there are 6511 entries at the moment. The Sub Test() code for parse that results should be changed then as shown below:

Option Explicit

Sub Test()

    Dim sResponse As String
    Dim aTmp
    Dim aData
    Dim lPage As Long
    Dim i As Long
    Dim j As Long

    lPage = 0
    ' Process each results page
    Do
        Debug.Print vbTab & "Page " & lPage
        ' Retrieve results page
        XmlHttpRequest "GET", "http://www.advokatnoeglen.dk/sog.aspx?s=1&t=0&firm=%20&p=" & lPage, "", "", "", sResponse
        ' Extract table
        ParseResponse _
            "<table\b[^>]*?id=""ContentPlaceHolder_Grid""[^>]*>([\s\S]*?)</table>", _
            sResponse, _
            aTmp, _
            False
        ' Extract data from the table
        ParseResponse _
            "<tr.*?onclick=""location.href=&#39;(.*?)&#39;"">\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "<td[^>]*>\s*([\s\S]*?)\s*</td>\s*" & _
            "</tr>", _
            aTmp(0), _
            aData, _
            True
        Debug.Print vbTab & "Parsed " & (UBound(aData) + 1)
        lPage = lPage + 1
        DoEvents
    Loop Until InStr(sResponse, "<a class=""next""") = 0
    ' Retrieve detailed info for each entry
    For i = 0 To UBound(aData)
        aTmp = aData(i)
        ' Retrieve details page
        aTmp(0) = "http://www.advokatnoeglen.dk" & aTmp(0)
        ' Extract details
        Do
            XmlHttpRequest "GET", aTmp(0), "", "", "", sResponse
            If InStr(sResponse, "<title>Runtime Error</title>") = 0 Then Exit Do
            DoEvents
        Loop
        ParseResponse _
            DecodeUriComponent( _
                "Arbejdsomr%C3%A5der\: [\s\S]*?</h2>[\s\S]*?" & _
                "Beskikkelses%C3%A5r\: ([^<]*)[\s\S]*?" & _
                "(:?F%C3%B8dsels%C3%A5r\: ([^<]*)[\s\S]*?)?" & _
                "M%C3%B8deret for landsret\: ([^<]*)[\s\S]*?" & _
                "M%C3%B8deret for h%C3%B8jesteret\: ([^<]*)[\s\S]*?" & _
                "(:?E-mail [\s\S]*?href='\/email\.aspx\?e\=(.*?)'[\s\S]*?)?" & _
                "Mobiltlf\.\: ([\d\(\)\-+ ]*?)\s*<"), _
            sResponse, _
            aTmp, _
            True, _
            False
        aTmp(8) = StrReverse(aTmp(8))
        aData(i) = aTmp
        Debug.Print vbTab & "Details " & i
        DoEvents
    Next
    ' Rebuild nested arrays to 2d array for output
    aData = Denestify(aData)
    ' Decode HTML
    For i = 1 To UBound(aData, 1)
        For j = 2 To 4
            aData(i, j) = Trim(Replace(GetInnerText((aData(i, j))), vbCrLf, ""))
        Next
    Next
    ' Output
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        OutputArray .Cells(1, 1), _
            Array("URL", _
                "Navn", _
                "Firma", _
                DecodeUriComponent("Arbejdsomr%C3%A5der"), _
                DecodeUriComponent("Beskikkelses%C3%A5r"), _
                DecodeUriComponent("F%C3%B8dsels%C3%A5r"), _
                DecodeUriComponent("M%C3%B8deret for landsret"), _
                DecodeUriComponent("M%C3%B8deret for h%C3%B8jesteret"), _
                "E-mail", _
                "Mobiltlf." _
            )
        Output2DArray .Cells(2, 1), aData
        .Columns.AutoFit
        .Rows.AutoFit
    End With
    MsgBox "Completed"

End Sub
omegastripes
  • 12,351
  • 4
  • 45
  • 96
  • wow! this works great. Thanks a lot for your help. In the website when you click on a lawyers name, it provides contact information like email and phone number. Is there a way i can loop through all the output of lawyers names to get the contact details as well? – Hakeem Baba Feb 23 '18 at 14:05
  • @HakeemBaba You need just add one more loop through all items right after last `ParseResponse` call, make request for each extracted URL, parse response to get details and add extracted details into `aData`. All these operations are shown in the above example. – omegastripes Feb 26 '18 at 05:54