0

I want to download this page as an HTML file, along with its CSS, JavaScript, fonts, etc. How can I do this?

I tried with BS4 as outlined in this solution, but the output HTML didn't look good.

import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import re

def savePage(url, pagepath='page'):
    """Save a web page and the files it references for offline viewing.

    The page at `url` is fetched and parsed. The targets of every
    `<img src>`, `<link href>` and `<script src>` are downloaded into the
    folder `<pagepath>_files`, and each attribute is rewritten to a relative
    `<folder>/<file>` reference so the HTML and the folder can be moved
    together. The resulting HTML is written to `<pagepath>.html`.

    Only those three tag/attribute pairs are handled: resources referenced
    from inside stylesheets (fonts, `url(...)` images) or loaded by scripts
    at runtime are not fetched.

    Args:
        url: Address of the page; relative references are resolved against it.
        pagepath: Output path. Its extension (if any) is dropped before
            `.html` and `_files` are appended.

    Returns:
        The parsed BeautifulSoup tree with the rewritten references.

    A resource that cannot be downloaded is reported on stderr and its
    reference is left pointing at the original location.
    """
    # Function-scope import: the file's top-level imports only provide
    # `urljoin` from this module.
    from urllib.parse import urlsplit

    used_names = set()   # lower-cased local file names handed out so far
    name_by_url = {}     # absolute resource URL -> local file name

    def unique_name(fileurl, filename):
        """Return a local name for `fileurl` that no other URL is using."""
        if fileurl in name_by_url:
            return name_by_url[fileurl]
        stem, ext = os.path.splitext(filename)
        candidate = filename
        counter = 1
        # Compare case-insensitively so names cannot clash on
        # case-insensitive file systems either.
        while candidate.lower() in used_names:
            candidate = '{}_{}{}'.format(stem, counter, ext)
            counter += 1
        used_names.add(candidate.lower())
        name_by_url[fileurl] = candidate
        return candidate

    def soupfindnSave(pagefolder, tag2find='img', inner='src'):
        """Download the `inner` target of every `tag2find` into `pagefolder`.

        Each tag whose resource is available locally (freshly downloaded or
        already present in `pagefolder`) gets its `inner` attribute
        rewritten to the relative local path. Returns the shared soup.
        """
        os.makedirs(pagefolder, exist_ok=True)  # also creates missing parents
        foldername = os.path.basename(pagefolder)
        for res in soup.find_all(tag2find):   # images, css, etc..
            ref = res.get(inner)
            if not ref:  # attribute missing or empty: nothing to fetch
                continue
            try:
                fileurl = urljoin(url, ref)
                parts = urlsplit(fileurl)
                if parts.scheme not in ('http', 'https'):
                    continue  # data:, javascript:, mailto: ... are not files
                # Name the file after the URL *path* only, so query strings
                # and fragments do not leak into it; keep '.' and '-' so the
                # file extension survives the clean-up.
                basename = parts.path.rsplit('/', 1)[-1]
                filename = re.sub(r'[^\w.-]+', '', basename)
                if not filename.strip('.'):
                    continue  # e.g. a link to a directory; no usable name
                filename = unique_name(fileurl, filename)
                filepath = os.path.join(pagefolder, filename)
                if not os.path.isfile(filepath):  # was not downloaded
                    filebin = session.get(fileurl)
                    filebin.raise_for_status()  # don't store error pages
                    # Open the file only once the body is in hand, so a
                    # failed request cannot leave an empty file behind.
                    with open(filepath, 'wb') as file:
                        file.write(filebin.content)
                # Rewrite the reference only now that the file exists, and
                # always with '/', which is what HTML expects on every OS.
                res[inner] = foldername + '/' + filename
            except Exception as exc:
                # Best effort: one broken resource must not abort the page.
                print(exc, file=sys.stderr)
        return soup

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        #... whatever other requests config you need here
        response = session.get(url)
        soup = BeautifulSoup(response.text, features="lxml")
        pagepath, _ = os.path.splitext(pagepath)  # avoid duplicate .html
        pagefolder = pagepath + '_files'  # page contents
        soupfindnSave(pagefolder, 'img', 'src')
        soupfindnSave(pagefolder, 'link', 'href')
        soupfindnSave(pagefolder, 'script', 'src')
    with open(pagepath + '.html', 'wb') as file:
        # encode() serialises the tree as-is; prettify() would insert
        # whitespace, which can change how the page renders.
        file.write(soup.encode('utf-8'))
    return soup

# Example run: save the archived tweet page as `google.html`, with its
# downloaded resources in `google_files/`.
soup = savePage(
    url='https://web.archive.org/web/20201217054355/https://twitter.com/FRANKCUNHAIII/status/1339445856811016192',
    pagepath='google',
)
  • `the output HTML didn't look good`: Obviously after downloading resources you will also have to rewrite references to them with your static links. It's not _that_ trivial. – Selcuk Feb 02 '22 at 04:57
  • But [wayback_machine_downloader](https://github.com/hartator/wayback-machine-downloader) does this perfectly, so what makes Python inferior to Ruby when it comes to downloading HTML pages? – facialrecognition Feb 02 '22 at 05:02
  • 1
    Well, that project is approximately 1,000 lines of Ruby code. If you port the code from Ruby to Python writing a comparable number of lines, I'm sure it will also work perfectly. Your ~50 line prototype just doesn't cut it. – Selcuk Feb 02 '22 at 05:22

0 Answers