-1

I'm trying to make a sitemap from crawled links and am wondering how I can iteratively append to a JSON file to do so. The outcome I want is:

{
    "https://www.example.com/": {
        "links":{
            "https://www.example.com/1":{
                "links": {}},
            "https://www.example.com/2":{
                "links": {}}
        }
    }
}

The program should add each discovered link under the URL where it was found, and repeat for every link/URL until all links have been found.

crawlandcreatesitemap.py

import json
import re
import requests
import urllib.parse as urlparse


def createsitemap(url_list, json_filename):
    """
    Function creates the initial site map with the urls.

    Existing entries in the file are preserved; an absent, empty, or
    corrupted file is treated as an empty sitemap.

    :param url_list: Urls to be crawled
    :param json_filename: Filename to output to
    :return: None
    """
    try:
        with open(json_filename, 'r') as json_file:
            data = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        # No file yet, or a truncated/empty file from an interrupted run:
        # start from a fresh sitemap rather than crashing.
        data = {}

    for url in url_list:
        # Only seed new urls; never clobber an entry that already has links.
        if url not in data:
            data[url] = {"links": {}}

    with open(json_filename, 'w') as json_file:
        json.dump(data, json_file, indent=4, separators=(',', ':'), ensure_ascii=False)


def find_url_path(target_url, json_filename):
    """
    Look up a url's entry in the sitemap JSON file.

    :param target_url: Url key to search for.
    :param json_filename: Sitemap file to read.
    :return: The sitemap entry for the url, or None when the url is not
        present or the file does not exist.
    """
    try:
        with open(json_filename, 'r') as json_file:
            sitemap = json.load(json_file)
    except FileNotFoundError:
        return None
    # dict.get returns None for a missing key, matching the "not found" case.
    return sitemap.get(target_url)


def append_to_url_dictionary(target_url, new_link, json_filename):
    """
    Record new_link under target_url's "links" dictionary in the sitemap file.

    :param target_url: Url whose entry receives the new link.
    :param new_link: Link discovered on target_url's page.
    :param json_filename: Sitemap file to update in place.
    :return: True when the file was updated, False when the file is
        missing/unreadable or target_url is not in the sitemap.
    """
    try:
        with open(json_filename, 'r') as json_file:
            data = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        return False

    if target_url not in data:
        return False

    # setdefault keeps any sub-links already recorded under new_link;
    # plain assignment would wipe a previously-crawled subtree.
    data[target_url]["links"].setdefault(new_link, {"links": {}})

    with open(json_filename, 'w') as json_file:
        json.dump(data, json_file, indent=4, separators=(',', ':'), ensure_ascii=False)

    return True


def extract_links_from(url, timeout=10):
    """
    Fetch a page and return every href value found in its HTML.

    :param url: Page to download.
    :param timeout: Seconds to wait for the server before giving up;
        without a timeout, requests.get can block forever on a dead host.
    :return: List of raw href strings (may be relative urls).
    """
    res = requests.get(url, timeout=timeout)
    # decode(errors="ignore") tolerates pages with broken encodings.
    return re.findall('href="(.*?)"', res.content.decode(errors="ignore"))


class crawlandcreatesitemap:
    """Crawl a set of seed urls and build a nested sitemap in a JSON file."""

    def __init__(self, urls):
        """
        :param urls: Seed urls to start crawling from; they are written to
            the sitemap file immediately.
        """
        self.urls = urls
        self.filename = "sitemap.json"

        # Links already crawled; checked before recursing to avoid
        # visiting (and recursing into) the same page twice.
        self.target_links = []
        self.links_to_ignore = []

        createsitemap(self.urls, self.filename)

    def crawl(self, url=None):
        """
        Recursively crawl from url, recording each discovered link in the
        sitemap file. With no argument, crawls every seed url in turn
        (the original only crawled the last seed).

        :param url: Page to crawl; None means "start from the seeds".
        """
        if url is None:
            for seed in self.urls:
                self.crawl(seed)
            return

        href_links = extract_links_from(url)
        for link in href_links:
            link = urlparse.urljoin(url, link)

            # Drop fragment identifiers: page#a and page#b are one page.
            if '#' in link:
                link = link.split("#")[0]

            if url in link and link not in self.target_links and link not in self.links_to_ignore and link != url:
                # Mark as visited BEFORE recursing, otherwise the same
                # link can be appended and crawled repeatedly.
                self.target_links.append(link)
                print("URLs appended to JSON file.")
                append_to_url_dictionary(url, link, self.filename)
                print(f"Crawling: {link}")
                self.crawl(link)



# Guard the demo run so importing this module doesn't start a crawl.
if __name__ == "__main__":
    urls = ["https://crawler-test.com/"]
    vv = crawlandcreatesitemap(urls)
    vv.crawl()
  • Check out https://stackoverflow.com/questions/4706499/how-do-i-append-to-a-file. You will need to seek backwards a character (or two or three depending upon the line ending in your JSON file) to overwrite the existing closing brace. – Malcolm Aug 11 '23 at 19:23
  • 1
    Your output has a dictionary with two identical keys `https://www.example.com/1` that is not possible. Did you want that key to resolve to a list rather than to two dictionaries? Alternatively, please edit the definition of the output you hope to achieve . – JonSG Aug 11 '23 at 19:54
  • What's wrong with what you have now? – mkrieger1 Aug 11 '23 at 20:07

0 Answers