I have a Windows Desktop application that is used to do WebScraping on a website using WebBrowser.
I had to use WebBrowser because the website implements some Javascript function so that was the only way to get the html content of the pages.
The program has to parse about 1500 pages so I have implemented a task delay in order to avoid to overload the server ( and may be getting banned ).
The problem is that after 50-100 parsed pages, I get an out of memory error and the program gets closed.
This is the code:
private async void buttonProd_Click(object sender, EventArgs e)
{
const string C_Prod_UrlTemplate = "http://www.mysite.it";
var _searches = new List<Get_SiteSearchResult>();
using (ProdDataContext db = new ProdDataContext())
{
_searches = db.Get_SiteSearch("PROD").ToList();
foreach (var s in _searches)
{
WebBrowser wb1 = new WebBrowser();
wb1.ScriptErrorsSuppressed = true;
Uri uri = new Uri(String.Format(C_Prod_UrlTemplate,s.prod));
wb1.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(webBrowser_DocumentCompleted);
wb1.Url = uri;
await Task.Delay(90 * 1000);
}
}
}
private void webBrowser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
using (ProdDataContext db = new ProdDataContext())
{
WebBrowser wb = (WebBrowser)sender;
string s = wb.Document.Body.InnerHtml;
string fName = wb.CodSite + "_" + wb.PostId + ".txt";
File.WriteAllText(wb.FolderPath + @"LINKS\" + fName, s);
db.Set_LinkDownloaded(wb.CodSite, wb.PostId);
}
}
The error messa is generated on this command line in webBrowser_DocumentCompleted method:
string s = wb.Document.Body.InnerHtml;
Thanks to support