Is there a way to save the current webpage by using casperjs or phantomjs?
I tried to get the html and save it into a file. But the resulting file was a lot different from the screenshot of that time (with casper.capture
). Is there a way to save the current webpage?

- 1,223
- 4
- 20
- 40
-
What do you mean by different? If you save html into a file then it's what phantomjs has loaded. If you open it in desktop browser like Chrome you potentially won't see the exact look of the page due to a lot of reasons. – Andrey Borisko Jul 05 '14 at 03:12
-
@AndreyBorisko Yes, that is what I mean. Everything doesn't get loaded when I save the html. But when I capture the images using casperjs, it has all the elements, so I believe the headless browser can load every element. I am wondering if there is some better way than saving the html since that doesn't seem to save the page as can be rendered by the headless browser. – aste123 Jul 05 '14 at 03:17
2 Answers
Andrey Borisko suggested to use the disk cache to retrieve the resources. My solution is not that efficient, but you don't need to decompress text files.
I use XMLHttpRequest to retrieve all resources after I registered them with the resource.received
event handler. I then filter the resources into images, css and fonts. The current limitation is that remote resource paths that contain something like ../
or ./
are not handled correctly.
I retrieve the current page content with getHTML
and iterate over all captured resources to replace the path used in the markup, that is identified by a portion of the complete resource URL, with a randomly generated file name. The file extension is created from the content type of the resource. It is converted using mimeType from this gist.
Since CSS files may contain background images or fonts, they have to be processed before saving to disk. The provided loadResource
function loads the resource, but does not save it.
Since XMLHttpRequest to download the resources the script has to be invoked with the --web-security=false
flag:
casperjs script.js --web-security=false
script.js
var casper = require("casper").create();
var utils = require('utils');
var fs = require('fs');
var mimetype = require('./mimetype'); // URL provided below
var cssResources = [];
var imgResources = [];
var fontResources = [];
var resourceDirectory = "resources";
var debug = false;
fs.removeTree(resourceDirectory);
casper.on("remote.message", function(msg){
this.echo("remote.msg: " + msg);
});
casper.on("resource.error", function(resourceError){
this.echo("res.err: " + JSON.stringify(resourceError));
});
casper.on("page.error", function(pageError){
this.echo("page.err: " + JSON.stringify(pageError));
});
casper.on("downloaded.file", function(targetPath){
if (debug) this.echo("dl.file: " + targetPath);
});
casper.on("resource.received", function(resource){
// don't try to download data:* URI and only use stage == "end"
if (resource.url.indexOf("data:") != 0 && resource.stage == "end") {
if (resource.contentType == "text/css") {
cssResources.push({obj: resource, file: false});
}
if (resource.contentType.indexOf("image/") == 0) {
imgResources.push({obj: resource, file: false});
}
if (resource.contentType.indexOf("application/x-font-") == 0) {
fontResources.push({obj: resource, file: false});
}
}
});
// based on http://docs.casperjs.org/en/latest/modules/casper.html#download
casper.loadResource = function loadResource(url, method, data) {
"use strict";
this.checkStarted();
var cu = require('clientutils').create(utils.mergeObjects({}, this.options));
return cu.decode(this.base64encode(url, method, data));
};
function escapeRegExp(string) {
// from https://stackoverflow.com/a/1144788/1816580
return string.replace(/([.*+?^=!:${}()|\[\]\/\\])/g, "\\$1");
}
function replaceAll(find, replace, str) {
// from https://stackoverflow.com/a/1144788/1816580
return str.replace(find, replace);
}
var wrapFunctions = [
function wrapQuot1(s){
return '"' + s + '"';
},
function wrapQuot2(s){
return "'" + s + "'";
},
function csswrap(s){
return '(' + s + ')';
}
];
function findAndReplace(doc, resources, resourcesReplacer) {
// change page on the fly
resources.forEach(function(resource){
var url = resource.obj.url;
// don't download again
if (!resource.file) {
// set random filename and download it **or** call further processing which in turn will load ans write to disk
resource.file = resourceDirectory+"/"+Math.random().toString(36).slice(2)+"."+mimetype.ext[resource.obj.contentType];
if (typeof resourcesReplacer != "function") {
if (debug) casper.echo("download resource (" + resource.obj.contentType + "): " + url + " to " + resource.file);
casper.download(url, resource.file, "GET");
} else {
resourcesReplacer(resource);
}
}
wrapFunctions.forEach(function(wrap){
// test the resource url (growing from the back) with a string in the document
var lastURL;
var lastRegExp;
var subURL;
// min length is 4 characters
for(var i = 0; i < url.length-5; i++) {
subURL = url.substring(i);
lastRegExp = new RegExp(escapeRegExp(wrap(subURL)), "g");
if (doc.match(lastRegExp)) {
lastURL = subURL;
break;
}
}
if (lastURL) {
if (debug) casper.echo("replace " + lastURL + " with " + resource.file);
doc = replaceAll(lastRegExp, wrap(resource.file), doc);
}
});
});
return doc;
}
function capturePage(){
// remove all <script> and <base> tags
this.evaluate(function(){
Array.prototype.forEach.call(document.querySelectorAll("script"), function(scr){
scr.parentNode.removeChild(scr);
});
Array.prototype.forEach.call(document.querySelectorAll("base"), function(scr){
scr.parentNode.removeChild(scr);
});
});
// TODO: remove all event handlers in html
var page = this.getHTML();
page = findAndReplace(page, imgResources);
page = findAndReplace(page, cssResources, function(cssResource){
var css = casper.loadResource(cssResource.obj.url, "GET");
css = findAndReplace(css, imgResources);
css = findAndReplace(css, fontResources);
fs.write(cssResource.file, css, "wb");
});
fs.write("page.html", page, "wb");
}
casper.start("http://www.themarysue.com/").wait(3000).then(capturePage).run(function(){
this.echo("DONE");
this.exit();
});
The magic happens in findAndReplace
. capturePage
is completely synchronous so it can be dropped anywhere without much head ache.
URL for mimetype.js
No, I don't think there is an easy way to do this as phantomjs doesn't support rendering pages in mht format (Render as a .mht file #10117). I believe that's what you wanted. So, it needs some work to accomplish this. I did something similar, but i was doing it the other way around I had a rendered html code that I was rendering into image/pdf through phantomjs. I had to clean the file first and it worked fine for me.
So, what I think you need to do is:
strip all js calls, like
script
tags oronload
attributes, etc..if you have access from local to the resources like css, images and so on (and you don't need authentication to that domain where you grab the page) than you need to change relative paths of
src
attributes to absolute to load images/etc.if you don't have access to the resources when you open the page then I think you need to implement similar script to download those resources at the time phantomjs loads the page and then redirect
src
attributes to that folder or maybe use data uri. You might need to change links in css files as well.
This will bring up the images\fonts and styling you are missing currently.
I'm sure there are more points. I'll update the answer if you need more info, once I see my code.

- 1
- 1

- 4,511
- 2
- 22
- 31