2

With the code below I have been trying to download a URL, save to file and than check if there are any html tags in that file (the tags I want to check are in the checks.json). I need to run this from the commandline and for that I think i need the asynch, callback features of the javascript language.

And I am lost at this point. I cannot make it work. My problem is that from the commandline I always get the error that the file does not exists.

I know it needs to wait until it is downloaded and then run the check. But it seems every time I run the code, that function is not called. And I don't know why.

So what I would need to understand is:

How I can run this from the commandline, download the URL, save it to file, check with checks.json and print the result to console.

Thank you.

var fs = require('fs');
var program = require('commander'); 
var cheerio = require('cheerio');
var rest = require('restler');
var HTMLFILE_DEFAULT = "index.html";
var CHECKSFILE_DEFAULT = "checks.json";
var URLFILE_DEFAULT = "downloaded.html";


var assertFileExists = function(infile) {
    var instr = infile.toString();
    if(!fs.existsSync(instr)) {
        console.log("%s does not exist. Exiting.", instr);
        process.exit(1); // http://nodejs.org/api/process.html#process_process_exit_code
    }
    return instr;
};

var cheerioHtmlFile = function(htmlfile) {
    return cheerio.load(fs.readFileSync(htmlfile));
};

var loadChecks = function(checksfile) {
    return JSON.parse(fs.readFileSync(checksfile));
};

var checkHtmlFile = function(htmlfile, checksfile) {
    $ = cheerioHtmlFile(htmlfile);
    var checks = loadChecks(checksfile).sort();
    var out = {};
    for(var ii in checks) {
        var present = $(checks[ii]).length > 0;
        out[checks[ii]] = present;
    }
    return out;
};

var clone = function(fn) {
    // Workaround for commander.js issue.
    // http://stackoverflow.com/a/6772648
    return fn.bind({});
};

var downAnd2File = function() {
  rest.get('http://www.wired.com/').on('complete', function(result) {
  if (result instanceof Error) {
    sys.puts('Error: ' + result.message);
    this.retry(5000); // try again after 5 sec
  } else 

  {
    fs.writeFile(__dirname + '/downloaded.html', result, function(err) {
    if (err) throw err;
    console.log('Saved!');

});
  }
});
  downAnd2File(checkHtmlFile);
}

if(require.main == module) {
    program
        .option('-c, --checks <check_file>', 'Path to checks.json', clone(assertFileExists), CHECKSFILE_DEFAULT)
        .option('-f, --file <html_file>', 'Path to index.html', clone(assertFileExists), HTMLFILE_DEFAULT)
        .option('-u, --url  <html_file>', 'Path to downloaded url', clone(assertFileExists), URLFILE_DEFAULT) ///////////////
        .parse(process.argv); 
    var down2FileAndCheck = downAnd2File(checkHtmlFile(program.url, program.checks));
    var checkJson = checkHtmlFile(program.file, program.checks);
    var outJson = JSON.stringify(checkJson, null, 4);
    console.log(outJson);

} else {
    exports.checkHtmlFile = checkHtmlFile;
}
digit
  • 1,513
  • 5
  • 29
  • 49

3 Answers3

4

You don't seem to understand how callbacks work in node.js. You will need to do some reading about it on stackoverflow or elsewhere.

I rewrote your program (but didn't test it), study it and ask me if you have any questions.

// halts if the provided filename doesn't exist
function assertFileExists(filename) {
    if (!fs.existsSync(filename)) {
        console.log("%s does not exist. Exiting.", filename);
        process.exit(1);
    }
    return filename;
}

// loads checks from a file
function loadChecks(checksfile) {
    return JSON.parse(fs.readFileSync(checksfile)).sort();
}

// checks html
function checkHtml(html, checks) {
    $ = cheerio.load(html);
    var out = {};
    for(var ii in checks) {
        var present = $(checks[ii]).length > 0;
        out[checks[ii]] = present;
    }
    return out;
}

// loads html from a file and checks it
// for exports only
function checkHtmlFile(filename, checks) {
    return checkHtml(fs.readFileSync(filename), checks);
}

// downloads html from the internet
// callback is called with two arguments: err, html
// where err is null if there is no error
function download(url, callback) {
    var resp = rest.get(url);
    resp.on('complete', function(result) {
        if (result instanceof Error) {
            // callback(result);
            sys.puts('Error: ' + result.message);
            this.retry(5000); // try again after 5 sec
            return;
        }
        callback(null, result);
    });
}

if (require.main == module) {
    program
        .option('-c, --checks <check_file>', 'Path to checks.json', assertFileExists, CHECKSFILE_DEFAULT)
        .option('-f, --file <html_file>', 'Path to index.html', assertFileExists, HTMLFILE_DEFAULT)
        .option('-u, --url  <url>', 'Path to downloaded url') ///////////////
        .parse(process.argv); 

    // this function loads checks & checks html
    function check(err, html) {
        if (err) {
            console.log('Error getting html: ' + err);
            process.exit(1);
        }
        var checks = loadChecks(program.checks);
        var checkJson = checkHtml(html, checks);
        var outJson = JSON.stringify(checkJson, null, 4);
        console.log(outJson);
    }

    if (program.url) {
        // download the provided url and then check the html
        download(program.url, check);
    } else if (program.file) {
        // load html from a file and then check it
        fs.readFile(program.file, check);
    }
} else {
    exports.loadChecks = loadChecks; // for loading checks
    exports.checkHtmlFile = checkHtmlFile; // for checking a file
}
mak
  • 13,267
  • 5
  • 41
  • 47
  • Hi mak. First, big, big thanks for taking the time and do this rewrite. I just tried to run it with the -c and -u flag and I get the exact same error (file does not exists). I chmod it with u+x before. I don't find where I can specify which site to download in the code. If it should be at the commandline, it does not work. Maybe I am missing something really obvious. I will keep trying. Thanks again. – digit Jul 09 '13 at 14:33
  • And yes I still struggle to understand the callback. Not the concept, but the implications. Thanks again – digit Jul 09 '13 at 14:35
  • Downloading is part of this new rewrite, but writing to file is missing. Therefore the script is always getting an error. Where do I have to put the writing to file part ? Thanks. – digit Jul 09 '13 at 14:53
  • You can keep the html in memory, it's no necessary to write it to a file to check it. That's why I didn't include it. – mak Jul 09 '13 at 14:58
  • Hmm. Thanks. So why does it get an error at the commandline "does not exist. Exiting.". That is the assertFileExists function. It should work now. Maybe the clone function that you got rid of, is still necessary ? – digit Jul 09 '13 at 16:00
  • Remove `assertFileExists` from `--url`. – mak Jul 09 '13 at 16:57
  • Awesome! I had removed that from my former code and it gave me an error, but this time it worked ! Thanks a million. – digit Jul 09 '13 at 17:19
1

As Mak mentioned already, you don't really need to download and save the html you just fetched.

Here's a minor refactor over the original code. The idea is to fetch the html (either from the url or the saved file). Once the result/data is ready, call the method to run the check on the content.

function checkHtml(err, html){
    if (err){
        console.log('Error: ' + err);
        process.exit(1);
    }
    var checkJson = checkHtmlFile(html, program.checks);
    var outJson = JSON.stringify(checkJson, null, 4);
    console.log(outJson);
}

if(require.main == module){

    program
        .option('-c, --checks <check_file>', 'Check path', clone(assertFileExists), CHECKS_DEF)
        .option('-f, --file <html_file>', 'File path', clone(assertFileExists), HTML_DEF)
        .option('-u, --url <url_pointer>', 'Url link that needs to be graded')
        .parse(process.argv);

    if (program.url){
        rest.get(program.url)
            .on('complete', function(result){
                checkHtml((html instanceof Error), result);
            });

    } else {
        fs.readFile(program.file, checkHtml);
    }
}

Hope this helps.

Prashant
  • 1,014
  • 11
  • 28
  • fs.js:427 return binding.open(pathModule._makeLong(path), stringToFlags(flags), mode); ^(bottom of open) Error: ENAMETOOLONG, name too long ' – WINSergey Jul 10 '13 at 21:45
0

Just in case your problem is happening before any JS code is executed:

You might want to make the file executable

  chmod +x <filename>

and add this statement on top of you file:

  #!/usr/bin/env node

Sorry if it's already the case

Aurélien Thieriot
  • 5,853
  • 2
  • 24
  • 25