First time using Node JS, Im doing some web scraping to verify if a certain page exists (404) or if it contains a certain div.
In stand alone version, it works perfect, meaning one url at a time, but now im trying to fetch the data from a google spreadsheet and loop through it and fire a request for each. It then fs.appendFile's the result to a document. (in another version I update a cell in the google doc instead)
Here is the output in googlemaps.json (file created after running the script):
{
"client": "Not working client map (404 test)",
"url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
"verified": "Not verified",
"status": "MAP DELETED (404) !"
}{
"client": "Not working client map(404 test)",
"url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
"verified": "NOT verified",
"status": "Somethings wrong, please verify."
}{
"client": "Not working client map(404 test)",
"url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
"verified": "Verified local business",
"status": "Map is verified !"
}{
"client": "Not working client map(404 test)",
"url": "https://plus.google.com/113096347010804339975/about?gl=ca&hl=en",
"verified": "Verified local business",
"status": "Map is verified !"
}
So I have :
A spreadsheet, with 2 columns and 4 rows.
Clients / URL of there google + page.
Pretty basic stuff here. what I dont understand is why the "verified" and "status" seem to work, but the "client" name and the URL are all the same ? Its as if the loop sticks to number 4... (I have a total of 4 rows in the spreadsheet) .... yet i can confirm the status is correct.
Yet when i run a console.log just before the line where i make my request everything seems fine.
console.log(key + " -> " + rows[key][1] + " / " + rows[key][2]);
That line gets me back CLIENT / URL no problem for all 4 rows.
Yet right after this line :
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var json = { client : "", url : "", verified : "", status: ""};
Everything stays stuck at row 4...
Heres the rest of the code :
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var Spreadsheet = require('edit-google-spreadsheet');
app.get('/maps', function(req, res){
Spreadsheet.load({
debug: true,
/*spreadsheetName: 'Google Maps Url List',
worksheetName: 'Sheet1',*/
spreadsheetId: 'MY ID',
worksheetId: 'od6',
// 1. Username and Password
username: 'USERNAME',
password: 'PASS',
}, function sheetReady(err, spreadsheet) {
if(err) throw err;
spreadsheet.receive(function(err, rows, info) {
if(err) throw err;
console.log(rows);
//console.log(rows["1"]["2"]);
for (var key in rows) {
if (rows.hasOwnProperty(key)) {
//key++;
console.log(key + " -> " + rows[key][1] + " / " + rows[key][2])
var url = rows[key][2];
var clientName = rows[key][1];
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var json = { client : "", url : "", verified : "", status: ""};
//verify if the google + page is verified local business
if ($('.xR.OA.Hlb.d-k-l.NA').length){
$('.xR.OA.Hlb.d-k-l.NA').filter(function(){
var data = $(this);
var isValid = data.text();
json.client = clientName;
json.url = url;
json.verified = isValid;
json.status = "Map is verified !";
})
} else {
// if not, verify if its the google 404 page
if ($('#af-error-container').length){
json.client = clientName;
json.url = url;
json.verified = "Not verified";
json.status = "MAP DELETED (404) !";
} else { // if not, then the map is there but is not verified anymore
json.client = clientName;
json.url = url;
json.verified = "NOT verified";
json.status = "Somethings wrong, please verify.";
}
} //endif
} //end of if error
fs.appendFile('googleMaps.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the googleMaps.json file');
})
})
} //endif rowhasproperty
} //end for loop
res.send("check the generated file");
}); //end of spreadsheet receive
}); //end of sheetReady
})
app.listen('8081')
console.log('Google Magic happens on port 8081');
exports = module.exports = app;
I hope someone understands, i was wondering if there is a scope/global var problem... ive honnestly played around hours with this and have no clue.
Here is the spreadsheet.