3

Edit 2 : i think , it might be a relation between this topic , and the other topic in this link : https://stackoverflow.com/questions/34600126/ in that link , i have store all urls of images in a file and try to read them by node.js and download them and i got a TimeOut Error on some of the files, (not for the majority of them)

I have some html which contain some images in each html file in mySql database, i wrote a script with Node.Js to get the html file and also get the images and download them to local (the images are remote url , like http://myServer/whmis_d2.gif) , I use createWriteStream , it would create the images files but they are the empty files. i guess the problem is file is not close when is downloaded , i try to use finish event , but i notice finish event doesn't fire up. my goal is search through html content and download image with remote url to my local computer. would appreciate any help, here is my code : Thanks!

here i put all the code , however the path is hardcoded that you need to change with your local path,

#!/usr/bin/env node
"use strict";

// dependencies
// npm install mysql
// npm install cheerio

var fs = require('fs');

var mysql = require('mysql');
var cheerio = require('cheerio');
var path = require('path');
var _ = require('underscore');

var async = require('async');
var url = require('url');
var request = require('request');


var lunr = require(__dirname + '/lunr.js');

var db = null;
var myTable = {};
var lunr_index = null;

var open_db = function (callback) {
  var mysql_conf = {
        host: "myHose",
        user: "myUser",
        password: "MyPass",
        database: "MyServer" 
          };
  db = mysql.createConnection(mysql_conf);
  db.connect(function (error) {
    if (error) {
      exit_with_error('problem connecting to database: ' + error);
    }
    callback();
  });
};

  var ensure_directory_exists = function (directory) {
    var directories = _.reduce(directory.split('/').slice(1), function (memo, subdir, index) {
      memo.push(path.normalize(memo[index] + '/' + subdir));
      return memo;
    }, ['/']);
    _.each(directories, function (subdir) {
      if (!fs.existsSync(subdir)) {
        fs.mkdirSync(subdir);
      }
    });
  };

var get_myosh = function (callback) {
  var make_full_markup = function (text) {
  return text;

  };

  var get_images = function (text) {

                     var $ = null;
                      $ = cheerio.load(text, {decodeEntities: false});

                      $('img').each(function (i, img) {
              var img_url = $(img).attr('src');
              var img_src = path.basename(img_url);


        console.log('requesting image ' + img_url + ' ...');
        ensure_directory_exists(path.dirname('/Users/Documents/documents/'));

var file = fs.createWriteStream(img_src);
var stream = request(img_url).pipe(file);


 file.on('finish', function () { 
     // this finish event seems not fire at all     
console.log('finish streaming' + img_src);
             file.close();
 });

           });

  }


  db.query('SELECT path,title,qatext FROM myTable', function (error, results) {
    var $ = null;
    if (error) {
      exit_with_error('problem reading from database: ' + error);
    }
    for (var i in results) {
      $ = cheerio.load('<template>' + results[i].qatext.replace(/<\//g, ' </') + '</template>');
      myTable[results[i].path] = {
        title: results[i].title,
        summary: $('template').text().trim().replace(/\s{2,}/g, '  '),
         images : get_images(results[i].qatext),
        full_markup: make_full_markup(results[i].qatext)

      };

    }
  callback();
  });

};

var build_content = function (callback) {

  for (var id in myTable) {

    ensure_directory_exists(path.dirname('/Users/Documents/documents/' + id));
    fs.writeFileSync('/Users/Documents/documents/' + id, myTable[id].full_markup);

    console.log('indexed %s', id);
  }

  callback();
};

var close_db = function () {
  try {
    db.end();
  } catch (e) {
  }
};

var exit_with_error = function (error) {
  console.log('\nERROR : %s\n', error);
  process.exit(-1);
};

open_db(function () {
  get_myosh(function () {
    build_content(function () {
      close_db();
      process.exit(0);
    });
});
});

also i have put one of the html file content that i get from mysql database. the html text and html file name and html file relative path , store in the table in mysql , and we have not only one html , we have a bunch of them , so the goal is search through html files , and download img files (which are href image link) to local computer.

<!DOCTYPE html>
<html>
<head>
<meta content="text/html;charset=utf-8" http-equiv="Content-Type">
<meta content="utf-8" http-equiv="encoding">
<meta name = "viewport" content = "width = device-width">
<style type="text/css">
body {
font-family: '-apple-system-font', 'San Francisco', 'Helvetica Neue', 'Helvetica', 'Arial', 'Verdana', 'sans-serif';
margin-left:10px; margin-right:0px; margin-top:0px; margin-bottom:0px;}
</style>
</head>
<body><a name="_1_1"></a><div class="center"><img src="http://images.ccohs.ca/oshanswers/whmis_b.gif" width="108" height="106" alt="Symbol for Class B" /><br />Class B2</div><div class="center"><img src="http://images.ccohs.ca/oshanswers/whmis_d2.gif" width="108" height="107" alt="Symbol for Class D2" /><br />Class D2B</div><br /><a name="_1_3"></a><a name="_1_4"></a>
</body>
</html>
Community
  • 1
  • 1
Jack
  • 35
  • 1
  • 9
  • Is that all the code? Does it do anything else (like writing to console)? I don't see where you're calling the functions but I'm sure you just omitted them -- anyway, make sure you're actually *calling* the function! – Mondongo Dec 30 '15 at 15:59
  • Also, maybe you should add the 'finish' event listener *before* starting the request... Otherwise it seems like the stream could be closed by the time you add the event listener to it. Do you have an output of the console.log() messages? – Mondongo Dec 30 '15 at 16:00
  • @Mondongo the function is called. i saw the console log inside of get_images function (requesting image) but as i mentioned the console log inside finish event not shown in the log as the finish event not fire with above code. i have tried to add finish event listener before starting the request as you suggested but no change , still finish event not fire up. – Jack Dec 30 '15 at 17:36
  • if you can post the whole code or put it up on a jsfiddle somewhere, I can play with it and help you figure it out... – Mondongo Dec 30 '15 at 17:53
  • @Mondongo , i have edited my post and put entire source code in one block for you, the only matters are , the mysql Database is through our INTERNET and it's not accessible for you to test through internet. so you might to create your own mysql database , and put the html content that i gave you , and change the user/pass/host in the script with your own to debug the issue. Thanks – Jack Dec 30 '15 at 20:10
  • @Mondongo, i have tried to store all image url in a text file and then readline them to download image , i got that errno: 'ETIMEDOUT'. can you please take a look at my SO Thread in this url : http://stackoverflow.com/questions/34600126/node-js-download-part-of-them-fine-but-the-other-part-with-errno-etimedout it might the cause would be same for this thread too – Jack Jan 06 '16 at 17:20

1 Answers1

0

You don't need to close WriteStream manually, piping is enough. There could be internet access issues — did you check request response status? You do not add path to img_src so it should try to write files to your home/process directory, depending on environment. Just to simplify your code and minify possible errors, use mkdirp module to create subfolders chain in one line. And here is a good answer how to check if directory exists.

Community
  • 1
  • 1
iw2rmb
  • 118
  • 9
  • Closing part is not called by node.js , so it doesn't matter , i test it and remove that and make no change on result. also i know that i didn't put sub folder path for img_src , but that is not the case , because it store all files in the same folder, it is not internet issue, as in other thread , i did another test with node .js and put all url link in a txt file and try to read those line and download them , most of the file downloaded fine but some become empty, you can see the other thread in this link : http://stackoverflow.com/questions/34600126 – Jack Jan 11 '16 at 15:43
  • Did you try leave `timeout` blank or set value > 0? – iw2rmb Jan 12 '16 at 15:56