In a browser, you could use the DOM property Node.textContent
. However, Node.js doesn't have textContent (since it doesn't have native access to the DOM), so you need external packages. You could install request
and cheerio
, using npm. cheerio
, suggested by Jon Church, is perhaps the easiest web scraping tool to use (there are also more complex ones, such as jsdom
)
With the power of cheerio
and request
in your hands, you could write
const request = require("request");
const cheerio = require("cheerio");
const fs = require("fs");
//taken from https://stackoverflow.com/a/19709846/10713877
/**
 * Tells whether a URL string is absolute.
 *
 * A string counts as absolute when it starts with "//", optionally preceded
 * by a scheme made of letters and a colon (e.g. "http:", "file:").
 * Matching is case-insensitive.
 *
 * @param {string} url - The URL or path to inspect.
 * @returns {boolean} true when the string starts with "[scheme:]//".
 */
function is_absolute(url)
{
    const absolutePattern = /^(?:[a-z]+:)?\/\//i;
    return absolutePattern.test(url);
}
/**
 * Tells whether a URL string should be treated as a local file.
 *
 * A string is local when it starts with "//" (optionally preceded by the
 * "file:" scheme, case-insensitive), or when is_absolute() reports that it
 * is not an absolute URL at all (i.e. it is a plain path).
 *
 * @param {string} url - The URL or path to inspect.
 * @returns {boolean} true when the string is considered a local file path.
 */
function is_local(url)
{
    const fileSchemePattern = /^(?:file:)?\/\//i;
    if (fileSchemePattern.test(url)) {
        return true;
    }
    // Anything without a "[scheme:]//" prefix is a plain path, hence local.
    return !is_absolute(url);
}
/**
 * Loads an HTML document from a local file or from a website and prints its
 * text content to the console.
 *
 * Local inputs (as decided by is_local) are read synchronously from disk;
 * a leading "file://" prefix is stripped first. Any other input is fetched
 * with the request module, and its body is processed only when the request
 * succeeds with HTTP status 200.
 *
 * @param {string} URL - A local path, a "file://" URL, or a remote URL.
 * @returns {undefined} Nothing; results and failures are logged to the console.
 * @throws Whatever fs.readFileSync throws when a local file cannot be read.
 */
function send_request(URL)
{
    if(is_local(URL))
    {
        // Strip the scheme case-insensitively, consistent with is_local, and
        // keep the path in a local constant instead of an implicit global.
        const file_path = URL.replace(/^file:\/\//i, "");
        //taken from https://stackoverflow.com/a/20665078/10713877
        const $ = cheerio.load(fs.readFileSync(file_path));
        //Do something
        console.log($.text());
        return;
    }

    const options = {
        url: URL,
        headers: {
            'User-Agent': 'Your-User-Agent'
        }
    };
    request(options, function(error, response, html) {
        if(error)
        {
            console.log(`Failure: ${error}`);
            return;
        }
        // A completed request with a non-200 status carries no error object,
        // so report the status code instead of printing "Failure: null".
        if(response.statusCode !== 200)
        {
            console.log(`Failure: unexpected status code ${response.statusCode}`);
            return;
        }
        console.log("Success");
        const $ = cheerio.load(html);
        //Do something
        console.log($.text());
    });
}
Let me explain the code. You pass a URL to the send_request
function. It checks whether the URL string refers to a local file (a plain path with no scheme, whether absolute or relative, or a path starting with file://
). If it does, the file is read from disk and loaded with the cheerio
module; otherwise, a request is first sent to the website using the request
module, and the returned HTML is then loaded with the cheerio
module. Regular expressions are used in is_absolute
and is_local
to make this distinction. You get the text using the text()
method provided by cheerio
. Under the //Do something
comments, you can do whatever you want with the text.
Some websites will show you your user agent string; copy and paste it in place of 'Your-User-Agent'
in the request headers.
The following calls will work:
//your local file
send_request("/absolute/path/to/your/local/index.html");
//a relative path has no leading slash; it is resolved against the current working directory
send_request("relative/path/to/your/local/index.html");
send_request("file:///absolute/path/to/your/local/index.html");
//website
send_request("https://stackoverflow.com/");
EDIT: I am on a Linux system.