0

I'm trying to extract data from this website. I've written a function which extracts the data I need in javascript:

// Paging bounds: each page is requested with StartRow = total_planning_applications - i*10,
// and paging continues only while that StartRow stays above min_value.
const total_planning_applications = 39437
const min_value = 39407

// Rows collected so far, and the 1-based count of the page being requested.
var superArray = []
var i = 1

// Navigate to the first results page, then register loop() as the load handler.
window.location.href = `http://www2.ashfield.gov.uk/cfusion/Planning/plan_arc_results2_v_date.cfm?fromyear=1974&frommonth=01&fromday=01&to_year=2017&to_month=06&to_day=26&StartRow=${total_planning_applications - i*10}`
window.onload = loop

// Page handler, registered as window.onload (see https://stackoverflow.com/questions/588040/window-onload-vs-document-onload).
// Hands the first <tbody> of the current document to concatTables together with superArray.
// The continuation passed to concatTables advances the page counter and, while the next
// StartRow is still above min_value, navigates to the next page and re-registers itself.
function loop(){
    const resultsBody = document.getElementsByTagName("tbody")[0]
    concatTables(superArray, resultsBody, function goToNextPage(){
        i += 1
        const nextStartRow = total_planning_applications - i*10
        if(min_value < nextStartRow){
            window.location.href = "http://www2.ashfield.gov.uk/cfusion/Planning/plan_arc_results2_v_date.cfm?fromyear=1974&frommonth=01&fromday=01&to_year=2017&to_month=06&to_day=26&StartRow=" + nextStartRow
            window.onload = loop
        }
    })
}

// Merges one results table from the Ashfield council's website into mainArray.
//
// mainArray - array of rows, mutated in place. When it is empty, a header row of
//             column names is pushed first.
// table     - element whose child rows are converted with getArray(); the first
//             converted row (the table's own header) is dropped.
// callback  - optional function invoked with no arguments once the rows are merged.
function concatTables(mainArray,table,callback){
    // Compare the length: `mainArray=[]` would rebind the parameter to a new
    // array, so the caller's array would never receive any data.
    if(mainArray.length === 0){
        mainArray.push(["RefNum","RefLink","Application","Location","Proposal","ADCDecision","ValidDate","Map","MapLink"])
    }
    const rows = getArray(table)
    rows.shift()
    // Append row by row so mainArray stays a flat list of rows rather than
    // receiving the whole page as a single nested element.
    for(const row of rows){
        mainArray.push(row)
    }
    if(typeof callback === "function"){
        callback()
    }
}


// Converts a table from the Ashfield council's website into an array of rows.
//
// table - element whose children are the table rows.
// Returns one array per row. Each cell contributes its trimmed innerText (with
// every \r and \n replaced by a space), followed by the href of the cell's first
// <a> when it has one. Rows with exactly one cell are left out.
function getArray(table){
    var rows = []
    Array.prototype.forEach.call(table.children, function(tr){
        var cells = tr.children
        if(cells.length === 1){
            return
        }
        var values = []
        Array.prototype.forEach.call(cells, function(td){
            values.push(td.innerText.trim().replace(/\r|\n/g," "))
            var anchor = td.getElementsByTagName("a")[0]
            if(anchor != undefined){
                values.push(anchor.href)
            }
        })
        rows.push(values)
    })
    return rows
}

However, when I try to execute the JavaScript from the console, it stops executing after the first loop. I realise that this occurs for security reasons and that browsers disallow cross-website JavaScript.

However, in this case I don't necessarily have to navigate to another site. Rather I need to navigate to a certain query string. Is it possible to navigate to a query string without losing the javascript runtime?

If not, are there any other solutions other than using something akin to Electron?

Sancarn
  • 2,575
  • 20
  • 45

1 Answer

0

The solution I came up with was the following:

Why navigate to the website at all?

Instead use HTTP Requests:

// Paging bounds: each request uses StartRow = total_planning_applications - i*10,
// and requests continue only while that StartRow stays above min_value.
var total_planning_applications = 39437
var min_value = 30000
// URL prefix; the StartRow value is appended to it for every request.
var base_ref = "http://www2.ashfield.gov.uk/cfusion/Planning/plan_arc_results2_v_date.cfm?fromyear=1974&frommonth=01&fromday=01&to_year=2017&to_month=06&to_day=26&StartRow="
// 1-based count of the page being requested.
var i = 1
// Collected rows are stored on window.mainArray.
window.mainArray = []

// Request the first page; loop() receives the parsed document.
httpGetAsync(`${base_ref}${total_planning_applications - i*10}`, loop)

// Page handler, passed to httpGetAsync as its callback.
// docx - the parsed document for one results page.
// Hands the document's first <tbody> to concatTables; the continuation then
// advances the page counter, logs progress, and either requests the next page
// (while its StartRow is still above min_value) or logs the collected rows as JSON.
function loop(docx){
    const resultsBody = docx.getElementsByTagName("tbody")[0]
    concatTables(window, resultsBody, function afterMerge(){
        i += 1
        console.log(`Processing...(${i})`)
        const nextStartRow = total_planning_applications - i*10
        if(min_value < nextStartRow){
            httpGetAsync(base_ref + nextStartRow, loop)
            return
        }
        console.log("All done!")
        console.log(JSON.stringify(window.mainArray))
    })
}

// Merges one results table from the Ashfield council's website into window.mainArray.
//
// window   - object holding the mainArray property (the caller passes the global
//            window); mainArray is replaced with a new, longer array.
// table    - element whose child rows are converted with getArray(); the first
//            converted row (the table's own header) is dropped.
// callback - optional function invoked with no arguments once the rows are merged.
function concatTables(window,table,callback){
    if(window.mainArray.length === 0){
        window.mainArray.push(["RefNum","RefLink","Application","Location","Proposal","ADCDecision","ValidDate","Map","MapLink"])
    }
    // Declared locally: an undeclared `arr` leaks a global, and throws a
    // ReferenceError in strict mode / ES modules.
    const rows = getArray(table)
    rows.shift()
    window.mainArray = window.mainArray.concat(rows)
    if(typeof callback === "function"){
        callback()
    }
}


// Converts a table from the Ashfield council's website into an array of rows.
//
// table - element whose children are the table rows.
// Returns one array per row. Each cell contributes its trimmed innerText (with
// every \r and \n replaced by a space), followed by the href of the cell's first
// <a> when it has one. Rows with exactly one cell are left out.
function getArray(table){
    const rows = []
    for(const tr of Array.from(table.children)){
        const cells = Array.from(tr.children)
        if(cells.length === 1){
            continue
        }
        const values = []
        for(const td of cells){
            values.push(td.innerText.trim().replace(/\r|\n/g," "))
            const link = td.getElementsByTagName("a")[0]
            if(link != undefined){
                values.push(link.href)
            }
        }
        rows.push(values)
    }
    return rows
}

// Issues an asynchronous GET request and passes the response, parsed as an HTML
// document, to the callback.
//
// theUrl   - URL to request.
// callback - function called with the parsed document once the request has
//            completed with HTTP status 200. It is not called for any other status;
//            the failure is logged with console.error instead.
function httpGetAsync(theUrl, callback){
    var xmlHttp = new XMLHttpRequest();
    xmlHttp.onreadystatechange = function() {
        // readyState 4 means the request has finished; ignore earlier transitions.
        if (xmlHttp.readyState !== 4){
            return;
        }
        if (xmlHttp.status !== 200){
            // Report the failure rather than leaving the caller waiting with no diagnostic.
            console.error(`httpGetAsync: GET ${theUrl} failed with status ${xmlHttp.status}`);
            return;
        }
        // Locals rather than undeclared names, so nothing leaks onto the global object.
        const parser = new DOMParser();
        const parsedDoc = parser.parseFromString(xmlHttp.responseText,"text/html");
        callback(parsedDoc);
    };
    xmlHttp.open("GET", theUrl, true); // true for asynchronous
    xmlHttp.send();
}

The main difference here is that we work only on the document returned by the httpGetAsync() function, rather than on the page currently loaded in the browser. It also has the added benefit of working silently and not disrupting the user! In this case I didn't want to generate too much traffic, so I called the function recursively (each response triggers the next request); however, it should also be possible to issue many asynchronous requests in parallel with this technique.

Sancarn
  • 2,575
  • 20
  • 45