Edit 2 : i think , it might be a relation between this topic , and the other topic in this link : https://stackoverflow.com/questions/34600126/ in that link , i have store all urls of images in a file and try to read them by node.js and download them and i got a TimeOut Error on some of the files, (not for the majority of them)
I have some html which contain some images in each html file in mySql database, i wrote a script with Node.Js to get the html file and also get the images and download them to local (the images are remote url , like http://myServer/whmis_d2.gif) , I use createWriteStream , it would create the images files but they are the empty files. i guess the problem is file is not close when is downloaded , i try to use finish event , but i notice finish event doesn't fire up. my goal is search through html content and download image with remote url to my local computer. would appreciate any help, here is my code : Thanks!
here i put all the code , however the path is hardcoded that you need to change with your local path,
#!/usr/bin/env node
"use strict";
// dependencies
// npm install mysql
// npm install cheerio
var fs = require('fs');
var mysql = require('mysql');
var cheerio = require('cheerio');
var path = require('path');
var _ = require('underscore');
var async = require('async');
var url = require('url');
var request = require('request');
var lunr = require(__dirname + '/lunr.js');
var db = null;
var myTable = {};
var lunr_index = null;
var open_db = function (callback) {
var mysql_conf = {
host: "myHose",
user: "myUser",
password: "MyPass",
database: "MyServer"
};
db = mysql.createConnection(mysql_conf);
db.connect(function (error) {
if (error) {
exit_with_error('problem connecting to database: ' + error);
}
callback();
});
};
var ensure_directory_exists = function (directory) {
var directories = _.reduce(directory.split('/').slice(1), function (memo, subdir, index) {
memo.push(path.normalize(memo[index] + '/' + subdir));
return memo;
}, ['/']);
_.each(directories, function (subdir) {
if (!fs.existsSync(subdir)) {
fs.mkdirSync(subdir);
}
});
};
var get_myosh = function (callback) {
var make_full_markup = function (text) {
return text;
};
var get_images = function (text) {
var $ = null;
$ = cheerio.load(text, {decodeEntities: false});
$('img').each(function (i, img) {
var img_url = $(img).attr('src');
var img_src = path.basename(img_url);
console.log('requesting image ' + img_url + ' ...');
ensure_directory_exists(path.dirname('/Users/Documents/documents/'));
var file = fs.createWriteStream(img_src);
var stream = request(img_url).pipe(file);
file.on('finish', function () {
// this finish event seems not fire at all
console.log('finish streaming' + img_src);
file.close();
});
});
}
db.query('SELECT path,title,qatext FROM myTable', function (error, results) {
var $ = null;
if (error) {
exit_with_error('problem reading from database: ' + error);
}
for (var i in results) {
$ = cheerio.load('<template>' + results[i].qatext.replace(/<\//g, ' </') + '</template>');
myTable[results[i].path] = {
title: results[i].title,
summary: $('template').text().trim().replace(/\s{2,}/g, ' '),
images : get_images(results[i].qatext),
full_markup: make_full_markup(results[i].qatext)
};
}
callback();
});
};
var build_content = function (callback) {
for (var id in myTable) {
ensure_directory_exists(path.dirname('/Users/Documents/documents/' + id));
fs.writeFileSync('/Users/Documents/documents/' + id, myTable[id].full_markup);
console.log('indexed %s', id);
}
callback();
};
var close_db = function () {
try {
db.end();
} catch (e) {
}
};
var exit_with_error = function (error) {
console.log('\nERROR : %s\n', error);
process.exit(-1);
};
open_db(function () {
get_myosh(function () {
build_content(function () {
close_db();
process.exit(0);
});
});
});
also i have put one of the html file content that i get from mysql database. the html text and html file name and html file relative path , store in the table in mysql , and we have not only one html , we have a bunch of them , so the goal is search through html files , and download img files (which are href image link) to local computer.
<!DOCTYPE html>
<html>
<head>
<meta content="text/html;charset=utf-8" http-equiv="Content-Type">
<meta content="utf-8" http-equiv="encoding">
<meta name = "viewport" content = "width = device-width">
<style type="text/css">
body {
font-family: '-apple-system-font', 'San Francisco', 'Helvetica Neue', 'Helvetica', 'Arial', 'Verdana', 'sans-serif';
margin-left:10px; margin-right:0px; margin-top:0px; margin-bottom:0px;}
</style>
</head>
<body><a name="_1_1"></a><div class="center"><img src="http://images.ccohs.ca/oshanswers/whmis_b.gif" width="108" height="106" alt="Symbol for Class B" /><br />Class B2</div><div class="center"><img src="http://images.ccohs.ca/oshanswers/whmis_d2.gif" width="108" height="107" alt="Symbol for Class D2" /><br />Class D2B</div><br /><a name="_1_3"></a><a name="_1_4"></a>
</body>
</html>