I am writing a content scraper that scrapes information about shirts on a particular webiste. I have everything set up with NPM packages in Node to scrape and create a CSV file. The problem I am running into is that as many know, Node is asynchronous in nature. The CSV file I am trying to write is writing before the JSON object I create is finished being created (iterating with an each loop to build it), thus it passes in my 'fields' parameter for json2csv (npm package). But it passes in my data as an empty object. Can anyone tell me how to tell node to wait until my json object is built before trying to use fs.writefile to create the CSV file? Thank you
'use strict';
//require NPM packages
var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var json2csv = require('json2csv');
//Array for shirts JSON object for json2csv to write.
var ShirtProps = [];
var homeURL = "http://www.shirts4mike.com/";
//start the scraper
scraper();
//Initial scrape of the shirts link from the home page
function scraper () {
//use the datafolderexists function to check if data is a directory
if (!DataFolderExists('data')) {
fs.mkdir('data');
}
//initial request of the home url + the shirts.php link
request(homeURL + "shirts.php", function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//scrape each of the links for its html data
$('ul.products li').each(function(i, element){
var ShirtURL = $(this).find('a').attr('href');
console.log(ShirtURL);
//pass in each shirtURL data to be scraped to add it to an object
ShirtHTMLScraper(ShirtURL);
});
FileWrite();
// end first request
} else {
console.error(error);
}
});
}
//create function to write the CSV file.
function FileWrite() {
var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Time'];
var csv = json2csv({data: ShirtProps, fields: fields});
console.log(csv);
var d = new Date();
var month = d.getMonth()+1;
var day = d.getDate();
var output = d.getFullYear() + '-' +
((''+month).length<2 ? '0' : '') + month + '-' +
((''+day).length<2 ? '0' : '') + day;
fs.writeFile('./data/' + output + '.csv', csv, function (error) {
if (error) throw error;
});
}
//function to scrape each of the shirt links and create a shirtdata object for each.
function ShirtHTMLScraper(ShirtURL) {
request(homeURL + ShirtURL, function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var time = new Date().toJSON().substring(0,19).replace('T',' ');
//json array for json2csv
var ShirtData = {
title: $('title').html(),
price: $(".price").html(),
imgURL: $('img').attr('src'),
url: homeURL + ShirtURL,
time: time.toString()
};
//push the shirt data scraped into the shirtprops array
ShirtProps.push(ShirtData);
console.log(ShirtProps);
// //set the feilds in order for the CSV file
// var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Time'];
// //use json2csv to write the file -
// var csv = json2csv({data: ShirtProps, fields: fields});
// console.log(csv);
// //date for the filesystem to save the scrape with today's date.
// var d = new Date();
// var month = d.getMonth()+1;
// var day = d.getDate();
// var output = d.getFullYear() + '-' +
// ((''+month).length<2 ? '0' : '') + month + '-' +
// ((''+day).length<2 ? '0' : '') + day;
// //use filesystem to write the file, or overrite if it exists.
// fs.writeFile('./data/' + output + '.csv', csv, function (error) {
// if (error) throw error;
// }); //end writeFile
} else {
console.error(error);
}
});
}
//Check if data folder exists, source: http://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
function DataFolderExists(folder) {
try {
// Query the entry
var DataFolder = fs.lstatSync(folder);
// Is it a directory?
if (DataFolder.isDirectory()) {
return true;
} else {
return false;
}
} //end try
catch (error) {
console.error(error);
}
}