
I have made a website cloner in Python. It clones HTML files well, but there is one problem:

some links have no file extension, especially on WordPress sites.

For example, WordPress links often lack file extensions: example.com/about/, example.com/contact, example.com/about/history/.

I would like it to download every href without a file extension and save it as HTML, e.g. example.com/about/ to about.html, and example.com/about/history/ to history.html inside the about folder.
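
Something like this mapping is what I have in mind (just a sketch; `extensionless_to_html_path` is a made-up helper name, not something from my script, and it assumes an absolute URL or a site-relative href):

```python
from urllib.parse import urlparse

def extensionless_to_html_path(href):
    """Hypothetical helper: map an extension-less URL to a local .html path,
    e.g. /about/ -> about.html and /about/history/ -> about/history.html."""
    path = urlparse(href).path.strip("/")
    if not path:
        return "index.html"   # bare domain maps to the site root page
    return path + ".html"     # last segment becomes the file name
```

So `extensionless_to_html_path("https://example.com/about/history/")` would return `'about/history.html'`, i.e. history.html inside the about folder.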

```python
import requests
import sys
import socket
import os
import re

def download(resource):

    global downloadedFiles

    if any(s in resource for s in dataTypesToDownload):  # only known file types

        if " " in resource: # https://stackoverflow.com/a/4172592
            return

        resource = resource.lstrip("/")  # drop leading slashes

        external = False
        prefix = ""

        # Absolute URLs are fetched as-is; remember the scheme
        if resource.startswith("https://"):
            external = True
            prefix = "https://"
            resource = resource[len("https://"):]
        elif resource.startswith("http://"):
            external = True
            prefix = "http://"
            resource = resource[len("http://"):]

        if resource.startswith("../"):
            resource = resource.replace("../", "dotdot/")

        if resource in downloadedFiles:
            return

        # Create the directory tree for the resource before saving it
        path = resource.split("/")

        if len(path) != 1:
            path.pop()  # drop the file name, keep the folders
            trail = "./" + base_path + "/"
            for folder in path:
                trail += folder + "/"
                try:
                    os.mkdir(trail)
                except OSError:
                    pass  # folder already exists

        try:
            if external:
                dContent = requests.get(prefix + resource, stream=True)
            else:
                dContent = requests.get(url + "/" + resource, stream=True)

            # Strip any query string from the local file name
            local_name = resource.split("?")[0]
            download = open(base_path + "/" + local_name, "wb")

        except Exception as e:
            print("An error occurred: " + str(e))
            return

        print("Downloading {} to {}".format(resource, download.name))

        for chunk in dContent.iter_content(chunk_size=8192):
            download.write(chunk)

        download.close()
        print("Downloaded!")
        downloadedFiles.append(resource)

socket.setdefaulttimeout(15)

downloadedFiles = []  # resources already saved, so nothing is fetched twice
dataTypesToDownload = [".jpg", ".jpeg", ".png", ".gif", ".ico", ".css", ".js", ".html", ".php", ".json"]

if len(sys.argv) == 1:
    url = input("URL of site to clone: ")
else:
    if sys.argv[1] == "-h":
        print("Usage: {} [url] [directory]".format(sys.argv[0]))
        exit()
    url = sys.argv[1]

if len(sys.argv) <= 2:
    base_path = input("Directory to clone into: ")
else:
    base_path = sys.argv[2]

if "http://" not in url and "https://" not in url:
    url = "http://"+url

domain = "//".join(url.split("//")[1:])  # host part of the URL (currently unused)

try:
    os.mkdir(base_path)
except OSError:
    pass

with requests.Session() as r:
    try:
        content = r.get(url).text
    except Exception as e:
        print("Error: {}".format(e))
        sys.exit(1)  # nothing to clone if the front page failed

with open(base_path + "/index.html", "w", encoding="utf-8") as file:
    file.write(content)

# Everything that appears after =" or =' is a candidate resource URL
resources = re.split(r"=\"|='", content)

for resource in resources:
    resource = re.split(r"\"|'", resource)[0]  # cut off at the closing quote
    download(resource)

# Catch root-level documents in href attributes
for href in content.split("href=\"")[1:]:
    href = href.split("\"")[0]
    if "/" not in href and "." in href and ("." + href.split(".")[-1]) in dataTypesToDownload:
        download(href)


print("Cloned "+url+" !")```
