0

This is my first time using Node.js. I'm doing some web scraping to check whether a certain page exists (i.e. is not a 404) or whether it contains a certain div.

The standalone version works perfectly, meaning one URL at a time, but now I'm trying to fetch the data from a Google spreadsheet, loop through it, and fire a request for each row. It then appends the result to a document with fs.appendFile. (In another version I update a cell in the Google doc instead.)

Here is the output in googleMaps.json (the file created after running the script):

{
    "client": "Not working client map (404 test)",
    "url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
    "verified": "Not verified",
    "status": "MAP DELETED (404) !"
}{
    "client": "Not working client map(404 test)",
    "url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
    "verified": "NOT verified",
    "status": "Somethings wrong, please verify."
}{
    "client": "Not working client map(404 test)",
    "url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
    "verified": "Verified local business",
    "status": "Map is verified !"
}{
    "client": "Not working client map(404 test)",
    "url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
    "verified": "Verified local business",
    "status": "Map is verified !"
}

So I have :

A spreadsheet, with 2 columns and 4 rows.

Clients / URL of their Google+ page.

Pretty basic stuff here. What I don't understand is why the "verified" and "status" values seem to work, but the "client" name and the URL are all the same. It's as if the loop is stuck on row 4 (I have a total of 4 rows in the spreadsheet), yet I can confirm the status is correct.

Yet when I run a console.log just before the line where I make my request, everything seems fine.

console.log(key + " -> " + rows[key][1] + " / " + rows[key][2]);

That line gets me back CLIENT / URL no problem for all 4 rows.

Yet right after this line :

request(url, function(error, response, html){
                    if(!error){
                            var $ = cheerio.load(html);
                            var json = { client : "", url : "", verified : "", status: ""};

Everything stays stuck at row 4...

Heres the rest of the code :

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();
var Spreadsheet = require('edit-google-spreadsheet');


// GET /maps: loads the spreadsheet, fires one HTTP request per row and appends
// one JSON record per response to googleMaps.json.
app.get('/maps', function(req, res){

 Spreadsheet.load({
    debug: true,
    /*spreadsheetName: 'Google Maps Url List',
    worksheetName: 'Sheet1',*/
    spreadsheetId: 'MY ID',
    worksheetId: 'od6',
    //    1. Username and Password
    username: 'USERNAME',
    password: 'PASS',

  }, function sheetReady(err, spreadsheet) {
    if(err) throw err;

    spreadsheet.receive(function(err, rows, info) {
        if(err) throw err;
            console.log(rows);  
            //console.log(rows["1"]["2"]);

    // Each own key of `rows` is one spreadsheet row; column 1 is read as the
    // client name and column 2 as the URL.
    for (var key in rows) {
                  if (rows.hasOwnProperty(key)) {
                    //key++;
                    console.log(key + " -> " + rows[key][1] + " / " + rows[key][2])     

                    // NOTE(review): `var` is function-scoped, so `url` and `clientName`
                    // are single variables shared by every iteration. The request
                    // callbacks below read them only when the responses arrive, by
                    // which time the loop has typically finished and both hold the
                    // last row's values.
                    var url = rows[key][2];
                    var clientName = rows[key][1];

                    request(url, function(error, response, html){
                        if(!error){
                                var $ = cheerio.load(html);
                                var json = { client : "", url : "", verified : "", status: ""};


                                //verify if the google + page is verified local business
                                if ($('.xR.OA.Hlb.d-k-l.NA').length){

                                            // filter() is used here only to visit each match; its
                                            // return value is discarded, so the last match's text
                                            // is what ends up in json.verified.
                                            $('.xR.OA.Hlb.d-k-l.NA').filter(function(){
                                                var data = $(this);
                                                var isValid = data.text();
                                                json.client = clientName;
                                                json.url = url;
                                                json.verified = isValid;
                                                json.status = "Map is verified !";  
                                            })

                                } else {
                                            // if not, verify if its the google 404 page
                                            if ($('#af-error-container').length){       
                                                json.client = clientName;
                                                json.url = url;
                                                json.verified = "Not verified";
                                                json.status = "MAP DELETED (404) !";

                                            } else { // if not, then the map is there but is not verified anymore
                                                json.client = clientName;
                                                json.url = url;
                                                json.verified = "NOT verified";
                                                json.status = "Somethings wrong, please verify.";    

                                            }

                                } //endif

                            } //end of if error

                            // NOTE(review): `json` is only assigned inside the `if(!error)`
                            // branch, so it is undefined here when the request failed.
                            // The success message below is also logged without checking `err`.
                            fs.appendFile('googleMaps.json', JSON.stringify(json, null, 4), function(err){
                                console.log('File successfully written! - Check your project directory for the         googleMaps.json file');
                            })

                        })

                    } //endif rowhasproperty
    } //end for loop        


        // NOTE(review): sent as soon as the loop has started the requests; it does
        // not wait for the request callbacks or the file writes to finish.
        res.send("check the generated file");

    }); //end of spreadsheet receive

}); //end of sheetReady

})

// Start listening (the port is passed as the string '8081') and export the app.
app.listen('8081')
console.log('Google Magic happens on port 8081');
exports = module.exports = app;

I hope someone understands. I was wondering if there is a scope/global variable problem... I've honestly played around with this for hours and have no clue.

Here is the spreadsheet.

enter image description here

Renegade_Mtl
  • 430
  • 3
  • 8
  • did you print the error you getting, after you make following `request(url.....` ? try printing error object.. – ajduke Sep 22 '14 at 04:30
  • There is no error per se; request(url) gets me the 4th URL in the table when I log it, and when I try to output the error it comes back null. My URL log just above the request is fine and I get all my Google data perfectly, but right after the request it's always the 4th URL... I don't get it. It's as if the loop is already done, or there is a conflict from sending out multiple requests at once. – Renegade_Mtl Sep 22 '14 at 04:40

2 Answers2

0

You're running into a closure problem.

Your request() is asynchronous. After it runs, it will take some time to return a result. However, Node doesn't block waiting for it to return-- it will happily continue to the next iteration of the loop.

When the loop continues, it changes the values of url and clientName:

var url = rows[key][2];
var clientName = rows[key][1];

At some point in the future, request() will finish and invoke its callback. At that time, it will use the values of url and clientName as they are currently defined. So every instance of the anonymous function will execute using the last values in your loop.

For more information, check out this SO answer.

One of the easiest solutions is to pass any variables you need into a new closure that's redefined for each iteration of the loop.

// Wrap the request in an immediately-invoked function so that every loop
// iteration gets its own `url` and `clientName` bindings for the callback.
(function(url, clientName) {
    request(url, function(error, response, html){
        // ... same callback body as in the question ...
    });
})(url, clientName);

Now, url and clientName are locally-scoped to that particular loop iteration, and will maintain their correct values.

Community
  • 1
  • 1
Interrobang
  • 16,984
  • 3
  • 55
  • 63
  • Great answer as well. I kind of figured something like that was happening; considering I'll have close to 600 requests to send, I'm sure even more stuff would have criss-crossed, not giving the expected result at all! – Renegade_Mtl Sep 22 '14 at 04:58
0

Ah the joys of js scope.

Remember that scope is at the function level. So, to keep all your request handlers in the correct scope, you should declare any variables in a scope that is unique to the request handler. I suggest pulling the request out into a function and passing in the url and clientName.

/**
 * Fetches a Google+ page and appends a JSON status record for it to
 * googleMaps.json.
 *
 * `url` and `clientName` are parameters, so each call has its own copy of
 * them. The asynchronous request callback therefore reports the row it was
 * started for, even when checkUrl is called from a loop.
 *
 * @param {string} url - Address of the Google+ page to check.
 * @param {string} clientName - Client name stored alongside the result.
 */
function checkUrl(url, clientName) {
    request(url, function(error, response, html){
        // Created up front so a record exists on every path, including a
        // failed request (previously `json` was undefined in that case).
        var json = { client : clientName, url : url, verified : "", status: ""};

        if (error) {
            json.verified = "Unknown";
            json.status = "Request failed: " + (error.message || error);
        } else {
            var $ = cheerio.load(html);
            var verifiedBadge = $('.xR.OA.Hlb.d-k-l.NA');

            if (verifiedBadge.length) {
                // The page is a verified local business. With several matches the
                // last one's text is kept, as the earlier per-element loop did.
                json.verified = verifiedBadge.last().text();
                json.status = "Map is verified !";
            } else if ($('#af-error-container').length) {
                // Google's 404 page.
                json.verified = "Not verified";
                json.status = "MAP DELETED (404) !";
            } else {
                // The map is there but is not verified anymore.
                json.verified = "NOT verified";
                json.status = "Somethings wrong, please verify.";
            }
        }

        fs.appendFile('googleMaps.json', JSON.stringify(json, null, 4), function(err){
            if (err) {
                // Do not claim success when the write failed.
                console.error('Could not write to googleMaps.json: ' + err.message);
                return;
            }
            console.log('File successfully written! - Check your project directory for the         googleMaps.json file');
        })
    })
}

Then make the call to the function where you are assigning url and clientname in your current code.

checkUrl(rows[key][2], rows[key][1]);
Gordon Bockus
  • 860
  • 6
  • 11
  • Life saver ! thank you so much :) i really played a long time with this, after reading your answer i understand a lot better now ! darn scopes ! id vote but i dont have enough rep. :( – Renegade_Mtl Sep 22 '14 at 04:56
  • Cool, glad to help. Another thing I noticed about your code is that the response for the Express call is sent after the for loop completes, but not necessarily after all the requests have come back. So if you have 600 requests to make, it's very likely the user will open the file before it is fully populated. Just an FYI. – Gordon Bockus Sep 22 '14 at 05:19