How to save the current webpage with casperjs/phantomjs?

Question

Is there a way to save the current webpage by using casperjs or phantomjs? I tried to get the html and save it into a file. But the resulting file was a lot different from the screenshot of that time (with casper.capture). Is there a way to save the current webpage?

What do you mean by different? If you save html into a file then it's what phantomjs has loaded. If you open it in desktop browser like Chrome you potentially won't see the exact look of the page due to a lot of reasons. — Andrey Borisko, Jul 05 '14 at 03:12
@AndreyBorisko Yes, that is what I mean. Everything doesn't get loaded when I save the html. But when I capture the images using casperjs, it has all the elements, so I believe the headless browser can load every element. I am wondering if there is some better way than saving the html since that doesn't seem to save the page as can be rendered by the headless browser. — aste123, Jul 05 '14 at 03:17

score 10 · Accepted Answer · edited May 23 '17 at 12:09

Andrey Borisko suggested to use the disk cache to retrieve the resources. My solution is not that efficient, but you don't need to decompress text files.

I use XMLHttpRequest to retrieve all resources after I registered them with the resource.received event handler. I then filter the resources into images, css and fonts. The current limitation is that remote resource paths that contain something like ../ or ./ are not handled correctly.

I retrieve the current page content with getHTML and iterate over all captured resources to replace the path used in the markup, that is identified by a portion of the complete resource URL, with a randomly generated file name. The file extension is created from the content type of the resource. It is converted using mimeType from this gist.

Since CSS files may contain background images or fonts, they have to be processed before saving to disk. The provided loadResource function loads the resource, but does not save it.

Since XMLHttpRequest is used to download the resources, the script has to be invoked with the --web-security=false flag:

casperjs script.js --web-security=false

script.js

var casper = require("casper").create();
var utils = require('utils');
var fs = require('fs');
var mimetype = require('./mimetype'); // URL provided below
// Resources recorded by the "resource.received" handler, one list per kind.
// Every entry has the shape {obj: <received resource>, file: <local path, or false while none is assigned>}.
var cssResources = [];
var imgResources = [];
var fontResources = [];
// Directory that the local copies of the resources are written to.
var resourceDirectory = "resources";
// When true, additional progress messages are echoed.
var debug = false;

// Remove the resource directory tree before anything is written to it.
fs.removeTree(resourceDirectory);

// Echo the payload of every "remote.message" event with a recognisable prefix.
casper.on("remote.message", function(message){
    var line = "remote.msg: " + message;
    this.echo(line);
});

// Echo the JSON form of every "resource.error" event payload.
casper.on("resource.error", function(details){
    var serialized = JSON.stringify(details);
    this.echo("res.err: " + serialized);
});

// Echo the JSON form of every "page.error" event payload.
casper.on("page.error", function(details){
    var serialized = JSON.stringify(details);
    this.echo("page.err: " + serialized);
});

// Report the target path of each "downloaded.file" event, but only in debug mode.
casper.on("downloaded.file", function(savedTo){
    if (!debug) {
        return;
    }
    this.echo("dl.file: " + savedTo);
});

// Record every completely received stylesheet, image and font response in
// cssResources / imgResources / fontResources.
casper.on("resource.received", function(resource){
    // don't try to download data:* URI and only use stage == "end"
    if (resource.stage != "end" || resource.url.indexOf("data:") == 0) {
        return;
    }

    // A response can arrive without a content type; calling indexOf on the
    // missing value would throw inside this handler, so such responses are
    // skipped instead.
    var contentType = resource.contentType;
    if (typeof contentType != "string") {
        return;
    }

    // The three categories are mutually exclusive, so at most one list
    // receives the entry.
    // NOTE(review): the stylesheet check is an exact comparison, so a response
    // typed "text/css; charset=..." is not recorded — confirm whether that is
    // intended.
    var entry = {obj: resource, file: false};
    if (contentType == "text/css") {
        cssResources.push(entry);
    } else if (contentType.indexOf("image/") == 0) {
        imgResources.push(entry);
    } else if (contentType.indexOf("application/x-font-") == 0) {
        fontResources.push(entry);
    }
});

// based on http://docs.casperjs.org/en/latest/modules/casper.html#download
// Requests `url` through base64encode and returns the decoded result to the
// caller; nothing is written to disk here.
casper.loadResource = function loadResource(url, method, data) {
    "use strict";
    this.checkStarted();
    var clientUtilsModule = require('clientutils');
    var clientOptions = utils.mergeObjects({}, this.options);
    var clientUtils = clientUtilsModule.create(clientOptions);
    var encoded = this.base64encode(url, method, data);
    return clientUtils.decode(encoded);
};


/**
 * Escape a string so that it can be embedded in a regular expression and
 * match itself literally.
 * from https://stackoverflow.com/a/1144788/1816580
 *
 * @param {string} string text to escape
 * @returns {string} the text with a backslash in front of each special character
 */
function escapeRegExp(string) {
    var specialCharacter = /[.*+?^=!:${}()|\[\]\/\\]/g;
    return string.replace(specialCharacter, function (character) {
        return "\\" + character;
    });
}

/**
 * Replace every occurrence of `find` in `str`.
 * from https://stackoverflow.com/a/1144788/1816580
 *
 * @param {RegExp|string} find pattern to look for. A RegExp is used exactly as
 *     given (so it needs the "g" flag to match more than once); a string is
 *     matched literally, everywhere.
 * @param {string|Function} replace replacement, with the usual
 *     String.prototype.replace semantics
 * @param {string} str text to search
 * @returns {string} the text with the replacements applied
 */
function replaceAll(find, replace, str) {
    if (find instanceof RegExp) {
        return str.replace(find, replace);
    }
    // str.replace with a string pattern stops after the first hit, so build a
    // literal, global pattern instead. The escaping is done inline to keep
    // this function usable on its own.
    var literal = String(find).replace(/[.*+?^${}()|\[\]\\]/g, "\\$&");
    return str.replace(new RegExp(literal, "g"), replace);
}

// The delimiters a resource path is looked for in: double quotes, single
// quotes and parentheses. Each function returns its argument surrounded by
// one pair of delimiters.
var wrapFunctions = (function () {
    function surround(opening, text, closing) {
        return opening + text + closing;
    }

    return [
        function wrapQuot1(s){
            return surround('"', s, '"');
        },
        function wrapQuot2(s){
            return surround("'", s, "'");
        },
        function csswrap(s){
            return surround('(', s, ')');
        }
    ];
}());

/**
 * Return the path under which `file` has to be referenced from a document
 * that is itself stored in `baseDirectory`.
 *
 * @param {string} file path of the local copy
 * @param {string} [baseDirectory] directory of the referencing document; when
 *     omitted the path is returned unchanged
 * @returns {string} `file` without the leading "<baseDirectory>/", if it had one
 */
function linkTarget(file, baseDirectory) {
    if (!baseDirectory) {
        return file;
    }
    var prefix = baseDirectory + "/";
    return file.indexOf(prefix) === 0 ? file.slice(prefix.length) : file;
}

/**
 * Build the file name suffix for a content type from mimetype.ext.
 *
 * @param {string} contentType content type of a received resource
 * @returns {string} "." plus the extension, or "" when none is known (instead
 *     of the literal ".undefined")
 */
function extensionFor(contentType) {
    var extension = mimetype.ext[contentType];
    if (!extension && typeof contentType === "string") {
        // retry without parameters: "image/svg+xml; charset=utf-8" -> "image/svg+xml"
        extension = mimetype.ext[contentType.split(";")[0].trim()];
    }
    return extension ? "." + extension : "";
}

/**
 * Point the references to `resources` found in `doc` at local copies.
 *
 * A resource that has no local file yet gets a random file name inside
 * resourceDirectory and is either downloaded directly or handed to
 * `resourcesReplacer`, which then has to write the file itself.
 *
 * @param {string} doc markup or stylesheet text to rewrite
 * @param {Array} resources entries of the form {obj: <resource>, file: <path|false>};
 *     `file` is filled in here
 * @param {Function} [resourcesReplacer] called with the entry instead of
 *     downloading it
 * @param {string} [baseDirectory] directory `doc` will be saved in. Relative
 *     references are resolved against the referencing document, so a document
 *     stored in resourceDirectory must not repeat that directory in its links.
 *     When omitted the full local path is inserted, as before.
 * @returns {string} the rewritten text
 */
function findAndReplace(doc, resources, resourcesReplacer, baseDirectory) {
    // change page on the fly
    resources.forEach(function(resource){
        var url = resource.obj.url;

        // don't download again
        if (!resource.file) {
            // set random filename and download it **or** call further processing which in turn will load and write to disk
            resource.file = resourceDirectory+"/"+Math.random().toString(36).slice(2)+extensionFor(resource.obj.contentType);
            if (typeof resourcesReplacer != "function") {
                if (debug) casper.echo("download resource (" + resource.obj.contentType + "): " + url + " to " + resource.file);
                casper.download(url, resource.file, "GET");
            } else {
                resourcesReplacer(resource);
            }
        }

        var link = linkTarget(resource.file, baseDirectory);

        wrapFunctions.forEach(function(wrap){
            // Look for the longest tail of the resource url that occurs in the
            // document inside this pair of delimiters. The shortest tail that
            // is tried has 6 characters.
            var matchedURL;
            var matchedText;
            for(var i = 0; i < url.length-5; i++) {
                var subURL = url.substring(i);
                var candidate = wrap(subURL);
                // a plain substring test is enough for a literal match and
                // avoids compiling one RegExp per candidate
                if (doc.indexOf(candidate) !== -1) {
                    matchedURL = subURL;
                    matchedText = candidate;
                    break;
                }
            }
            if (matchedURL) {
                if (debug) casper.echo("replace " + matchedURL + " with " + link);
                doc = replaceAll(new RegExp(escapeRegExp(matchedText), "g"), wrap(link), doc);
            }
        });
    });
    return doc;
}

/**
 * Save the current page as page.html together with local copies of its
 * images, stylesheets and the images/fonts those stylesheets use.
 * Has to be called with the casper instance as `this`.
 */
function capturePage(){

    // remove all <script> and <base> tags
    this.evaluate(function(){
        Array.prototype.forEach.call(document.querySelectorAll("script, base"), function(node){
            node.parentNode.removeChild(node);
        });
    });

    // TODO: remove all event handlers in html

    var page = this.getHTML();
    page = findAndReplace(page, imgResources);
    page = findAndReplace(page, cssResources, function(cssResource){
        var css = casper.loadResource(cssResource.obj.url, "GET");
        // the stylesheet is written into resourceDirectory, next to the files
        // it references, so its links are made relative to that directory
        css = findAndReplace(css, imgResources, null, resourceDirectory);
        css = findAndReplace(css, fontResources, null, resourceDirectory);
        fs.write(cssResource.file, css, "wb");
    });
    fs.write("page.html", page, "wb");
}

// Open the page, wait 3000 ms, save it with capturePage, then report and exit.
casper.start("http://www.themarysue.com/");
casper.wait(3000);
casper.then(capturePage);
casper.run(function(){
    this.echo("DONE");
    this.exit();
});

The magic happens in findAndReplace. capturePage is completely synchronous so it can be dropped anywhere without much headache.

URL for mimetype.js

score 0 · Answer 2 · edited May 23 '17 at 11:53

No, I don't think there is an easy way to do this, as phantomjs doesn't support rendering pages in mht format (Render as a .mht file #10117). I believe that's what you wanted. So, it needs some work to accomplish this. I did something similar, but I was doing it the other way around: I had rendered html code that I was rendering into an image/pdf through phantomjs. I had to clean the file first and it worked fine for me.

So, what I think you need to do is:

strip all js calls, like script tags or onload attributes, etc..
if you have access from local to the resources like css, images and so on (and you don't need authentication to that domain where you grab the page) then you need to change relative paths of src attributes to absolute to load images/etc.
if you don't have access to the resources when you open the page then I think you need to implement similar script to download those resources at the time phantomjs loads the page and then redirect src attributes to that folder or maybe use data uri. You might need to change links in css files as well.

This will bring up the images\fonts and styling you are missing currently.

I'm sure there are more points. I'll update the answer if you need more info, once I see my code.

How to save the current webpage with casperjs/phantomjs?

2 Answers

Linked