I'm trying to build a sitemap from crawled links and am wondering how I can iteratively append to a JSON file to do so. The outcome I want is:
{
    "https://www.example.com/": {
        "links": {
            "https://www.example.com/1": {
                "links": {}
            },
            "https://www.example.com/2": {
                "links": {}
            }
        }
    }
}
The program should attach each link it finds to the URL/link on which it was found, and then repeat the process for every discovered link until no new links remain.
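As I understand it, that output is just nested dictionaries, so "appending" really means updating the in-memory structure and rewriting the whole file, since JSON can't be appended to in place. A minimal sketch of that idea, reusing the example URLs from the structure above:

import json

# Each URL maps to a {"links": {...}} entry; child pages nest inside "links".
sitemap = {"https://www.example.com/": {"links": {}}}

# Attach two links discovered on the front page.
sitemap["https://www.example.com/"]["links"]["https://www.example.com/1"] = {"links": {}}
sitemap["https://www.example.com/"]["links"]["https://www.example.com/2"] = {"links": {}}

# Rewrite the file with the updated structure.
with open("sitemap.json", "w") as f:
    json.dump(sitemap, f, indent=4)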
crawlandcreatesitemap.py
import json
import re
import requests
import urllib.parse as urlparse


def createsitemap(url_list, json_filename):
    """
    Creates the initial sitemap containing the starting URLs.
    :param url_list: URLs to be crawled
    :param json_filename: Filename to output to
    :return: None
    """
    try:
        with open(json_filename, 'r') as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        data = {}
    for url in url_list:
        if url not in data:
            data[url] = {"links": {}}
    with open(json_filename, 'w') as json_file:
        json.dump(data, json_file, indent=4, separators=(',', ':'), ensure_ascii=False)


def find_url_path(target_url, json_filename):
    """
    Finds the entry for a URL at the top level of the sitemap JSON file.
    :param target_url: URL to look up
    :param json_filename: Sitemap file to read
    :return: The entry for target_url, or None if it is not present
    """
    try:
        with open(json_filename, 'r') as json_file:
            data = json.load(json_file)
        if target_url in data:
            return data[target_url]
        else:
            return None
    except FileNotFoundError:
        return None


def append_to_url_dictionary(target_url, new_link, json_filename):
    """
    Adds new_link under target_url's "links" entry and rewrites the file.
    :return: True if the link was added, False if the file or URL is missing
    """
    try:
        with open(json_filename, 'r') as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        return False
    if target_url in data:
        url_links = data[target_url]["links"]
        url_links[new_link] = {"links": {}}
        with open(json_filename, 'w') as json_file:
            json.dump(data, json_file, indent=4, separators=(',', ':'), ensure_ascii=False)
        return True
    return False


def extract_links_from(url):
    # Naive extraction: pull every href attribute out of the raw HTML.
    res = requests.get(url)
    return re.findall('href="(.*?)"', res.content.decode(errors="ignore"))


class crawlandcreatesitemap:
    def __init__(self, urls):
        self.urls = urls
        self.filename = "sitemap.json"
        self.target_links = []
        self.links_to_ignore = []
        createsitemap(self.urls, self.filename)

    def crawl(self, url=None):
        if url is None:
            # Crawl each starting URL rather than only the last one in the list.
            for turl in self.urls:
                self.crawl(turl)
            return
        href_links = extract_links_from(url)
        for link in href_links:
            link = urlparse.urljoin(url, link)
            if '#' in link:
                link = link.split("#")[0]
            if url in link and link not in self.target_links and link not in self.links_to_ignore and link != url:
                # Remember the link so it is not crawled or recorded twice.
                self.target_links.append(link)
                append_to_url_dictionary(url, link, self.filename)
                print("URL appended to JSON file.")
                print(f"Crawling: {link}")
                self.crawl(link)


urls = ["https://crawler-test.com/"]
vv = crawlandcreatesitemap(urls)
vv.crawl()
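The part I'm unsure about is that append_to_url_dictionary only matches target_url at the top level of the JSON file, so links found on nested pages never get attached to the page where they were found. Something like the recursive sketch below is roughly what I'm after (attach_link is just a placeholder name, not part of the program above):

import json


def attach_link(parent_url, new_link, json_filename):
    """Placeholder helper: attach new_link under parent_url wherever
    parent_url appears in the nested sitemap, then rewrite the file."""
    try:
        with open(json_filename, 'r') as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        return False

    def walk(node):
        # Depth-first search through the nested {"links": {...}} entries.
        for url, entry in node.items():
            if url == parent_url:
                entry["links"].setdefault(new_link, {"links": {}})
                return True
            if walk(entry["links"]):
                return True
        return False

    if walk(data):
        with open(json_filename, 'w') as json_file:
            json.dump(data, json_file, indent=4, ensure_ascii=False)
        return True
    return False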