0

I have a problem with casperjs/phantomjs. I wrote a script to gather all .xls/.xlsx files from a website. That worked. Now I have extendet my script to gather these files from a predefined array of urls. The strange thing I came across is that the download actually works. I wanted to save all files of either one of the sites in a separate folder named accordingly to the website where the file had been gathered from.

Example: All the files of http://minerals.usgs.gov/minerals/pubs/commodity/aluminum/ should be saved in a folder ...\Data\aluminum\

strangely (even though the download works) the script keeps using just one adress from the array (the very last one to be exact). Thus all files are stored in a folder named after the last website used to gather files.

I hope you can understand what I mean. Underneath you find my code...

var links = [];
var index;

var url = {
'abrasives': 'http://minerals.usgs.gov/minerals/pubs/commodity/abrasives/',
'aluminum': 'http://minerals.usgs.gov/minerals/pubs/commodity/aluminum/',
'antimony': 'http://minerals.usgs.gov/minerals/pubs/commodity/antimony/'
};
var casper = require('casper').create();

function getLinks() {
    var links = document.querySelectorAll('a');
    return Array.prototype.map.call(links, function(e) {
        return e.getAttribute('href');
    });
}

casper.start('http://google.com', function() {
    for(var k in url){
        this.thenOpen(url[k], function(){
            var j=0;
            if(url.hasOwnProperty(k)){
                this.echo("Key is " + k + ",value is " + url[k]);
                links = this.evaluate(getLinks);
                this.echo(links.length + ' Links gefunden...');
                for(index= 0; index < links.length;index++){
                    if ((new RegExp('.xls')).test(links[index]) || (new RegExp('.xlsx')).test(links[index])) {
                        j++;
                        var folder = url[k].split('/');
                        // the next line was for testing purposes 
                        this.echo(folder[6]);
                        var filename = links[index].replace(/^.*[\\\/]/, '')        
                        this.echo(j + ' Excel-Files found at ' + this.getTitle() + "!");
                        this.download(links[index],'Data\\' + folder[folder.length] + '\\'+filename);
                    }
                }
            }
        });
    }
});

casper.run(function() {
    this.echo('All files stored at C:\\User\\Username\\Data\\ .');
    this.echo('End...').exit();
});
Artjom B.
  • 61,146
  • 24
  • 125
  • 222
jonas778
  • 75
  • 2
  • 11
  • See the linked question with its answers. `thenOpen` is asynchronous and is only executed after the loop finished. If you still don't understand, I will explain. – Artjom B. Nov 05 '14 at 11:41
  • Hey Artjom, thank you very much for your response. Sadly, I dont know how to apply the examples of the link you provided to my problem. However, I understood that the problem is: Casperjs first creates all the thenOpen and executes all of them once it has finished looping. I tried to implement another var before the this.thenOpen call and hand that one to then Open instead of k, but that didnt help. Do you have any further advise? – jonas778 Nov 05 '14 at 12:37
  • If you understand the problem then you understand that you need some function inbetween the loop and the `thenOpen` like an [IIFE](http://stackoverflow.com/a/19324832). JS has function level scope so you need to bind `k` to some other function that is outside of `thenOpen`. – Artjom B. Nov 05 '14 at 22:00

0 Answers0