I believe this answer may be helpful to anyone who has run into the same problem I did.
My solution was to download the HTML (using HttpWebRequest) and write it to a file (stored in Azure Blob Storage).
In my case, I wrote a function to correct all the relative paths in the HTML file, as shown below:
/// <summary>
/// Parses <paramref name="htmlContent"/> into an <c>HtmlDocument</c> and rewrites the
/// src/href attributes of img, link and script nodes so that protocol-relative URLs
/// ("//host/path") get the page's scheme and relative paths are anchored to
/// <paramref name="urlRoot"/>. URLs that already look absolute are left untouched.
/// </summary>
/// <param name="urlRoot">Absolute root URL of the page, e.g. "https://example.com".</param>
/// <param name="htmlContent">Raw HTML to fix up.</param>
/// <returns>The parsed document with corrected references.</returns>
private static HtmlDocument CorrectHTMLReferencies(string urlRoot, string htmlContent)
{
    HtmlDocument document = new HtmlDocument();
    document.LoadHtml(htmlContent);

    // Heuristic for "host.name/optional/path"-style URLs (i.e. already non-relative).
    Regex rx = new Regex(@"([\w-]+\.)+[\w-]+(\/[\w- .\/?%&=]*)?");

    // NOTE: SelectNodes returns null (not an empty collection) when nothing matches.
    var nodesIMG = document.DocumentNode.SelectNodes("//img");
    var nodesCSS = document.DocumentNode.SelectNodes("//link");
    var nodesJS = document.DocumentNode.SelectNodes("//script");

    // Preserve the page's scheme for protocol-relative links; default to http.
    string protocol = "http:";
    if (urlRoot.Contains(":"))
        protocol = urlRoot.Split(':')[0] + ":";

    void WatchURl(HtmlNodeCollection colNodes, string attr)
    {
        // Guard: SelectNodes yields null when the document has no such nodes,
        // which previously caused a NullReferenceException here.
        if (colNodes == null)
            return;

        foreach (HtmlNode node in colNodes)
        {
            // Case-insensitive attribute lookup without culture-sensitive ToLower().
            if (!node.Attributes.Any(a => string.Equals(a.Name, attr, StringComparison.OrdinalIgnoreCase)))
                continue;

            string link = node.Attributes[attr].Value;
            if (string.IsNullOrEmpty(link))
                continue; // nothing to rewrite

            if (rx.IsMatch(link))
            {
                // Protocol-relative URL: prepend the page's scheme.
                // StartsWith is safe for short strings, unlike Substring(0, 2).
                if (link.StartsWith("//", StringComparison.Ordinal))
                {
                    string novaUrl = protocol + link;
                    node.SetAttributeValue(attr, novaUrl);
                }
            }
            else
            {
                // Relative path: anchor it to the page root.
                node.SetAttributeValue(attr, urlRoot + link);
            }
        }
    }

    WatchURl(nodesIMG, "src");
    WatchURl(nodesCSS, "href");
    WatchURl(nodesJS, "src");
    return document;
}
Instead of downloading the entire website, I download only one file.
It works (for me).
;)