Does anyone know a way to get all the URLs in a website using JavaScript?
I only need the links starting with the same domain name; there is no need to consider other links.
Does anyone know a way to get all the URLs in a website using JavaScript?
I only need the links starting with the same domain name; there is no need to consider other links.
Well this will get all the same-host links on the page:
// Collect the href of every link on the page whose host matches the current
// page's host. The links are taken from last to first, so `urls` ends up in
// reverse document order.
var urls = Array.from(document.links)
  .reverse()
  .filter(function (link) {
    return link.hostname === location.hostname;
  })
  .map(function (link) {
    return link.href;
  });
If by site you mean you want to recursively get the links inside linked pages, that's a bit trickier. You'd have to download each link into a new document (for example in an `<iframe>`), and in the `onload` handler check the iframe's own document for more links to add to the list to fetch. You'd need to keep a lookup of what URLs you'd already spidered to avoid fetching the same document twice. It probably wouldn't be very fast.
JavaScript to extract (and display) the domains, URLs, and links from a page. The "for(var i = document.links.length; i --> 0;)" approach gives you a good collection to work with. Here is an example that pulls the links from specific parts of the HTML page.
You could alter it to select and filter whatever you want, and then use the list however you want. I wanted to show a working example.
// Renders one list item per <a> element on the page inside the #domains
// container, showing the link's domain, its website (host) and the full link.

// URL-splitting pattern. Capture groups used below:
//   [0] the whole link
//   [3] the host part, including sub-domains and an optional port ("Website")
//   [5] the domain part of the host ("Domain")
// The pattern is anchored at both ends, so a single non-global match already
// yields the full link; no separate /g variant is needed.
const re = /^((http[s]?|ftp|mailto):(?:\/\/)?)?\/?(([^\/\.]+\.)*?([^\/\.]+\.[^:\/\s\.]{1,4})?(\.[^:\/\s\.]{1,2})?(:\d+)?)($|\/)([^#?\s]+)?(.*?)?(#[\w\-]+)?$/i;

const printList = document.getElementById("domains");

// Without the container there is nowhere to render to, so do nothing rather
// than throw on a null reference.
if (printList !== null) {
  const unorderedList = document.createElement("ul");
  unorderedList.setAttribute("id", "domainsList");
  unorderedList.setAttribute("class", "list-group");
  printList.appendChild(unorderedList);

  Array.from(document.getElementsByTagName("a")).forEach(function (link) {
    const matchGroup = link.href.match(re);

    // hrefs such as "javascript:void(0)" do not match the pattern. Skip them
    // instead of crashing on a null match and aborting the whole loop.
    if (matchGroup === null) {
      return;
    }

    const listItem = document.createElement("li");
    listItem.setAttribute("class", "list-group-item domain");

    // A group that did not take part in the match is rendered as "undefined".
    const lines = [
      `Domain: ${matchGroup[5]}`,
      `Website: ${matchGroup[3]}`,
      `Full Link: ${matchGroup[0]}`,
    ];

    // One text node per line, separated by <br> elements.
    lines.forEach(function (text, index) {
      if (index > 0) {
        listItem.appendChild(document.createElement("br"));
      }
      listItem.appendChild(document.createTextNode(text));
    });

    unorderedList.appendChild(listItem);
  });
}
<!DOCTYPE html>
<!-- Demo page: ten sample links plus an empty #domains container. -->
<html>
<head>
<meta charset="utf-8" />
<!-- An http-equiv meta needs a content value to have any effect. -->
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<!-- Single viewport declaration (it was previously declared twice). -->
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<title>Pull Domains from a page</title>
<!-- Bootstrap CSS -->
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
</head>
<body>
<!-- Sample links covering different schemes, sub-domains and TLD shapes. -->
<div class="card-deck">
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="https://www.youtube.com/watch?v=f9B_1Ac5jnc">Link 1</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="http://www.apple.com">Link 2</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="http://www.cnn.com.au">Link 3</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="http://downloads.news.com.au">Link 4</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="http://ftp.android.co.nz">Link 5</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="http://global.news.ca">Link 6</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="https://www.apple.com">Link 7</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="https://mira.mx/">Link 8</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="http://www.qs.com/">Link 9</a></div></div>
<div class="card mb-3" style="min-width: 10rem;"><div class="card-body"><a href="http://pbs.org">Link 10</a></div></div>
</div>
<!-- Empty container with id "domains". -->
<div id="domains"></div>
</body>
</html>
In JavaScript:
// Evaluates to an array holding the href of every link in the document,
// in document order.
(() => Array.from(document.links, (link) => link.href))();
Using jQuery, you can find all the links on the page that match specific criteria:
// Alerts the href of every <a> whose href attribute starts with "domain.com".
// The starts-with attribute operator is `^=` and the value must be quoted;
// `[href=^domain.com]` is not a valid selector.
// The attribute text is compared literally, so include the scheme
// (e.g. "https://domain.com") when the page uses absolute URLs.
$('a[href^="domain.com"]').each(function () {
  alert($(this).attr("href"));
});