I'm writing a web scraping program, and I have a situation where one page contains 10 links; for every link I need to download the HTML text to scrape data from it, then move on to the next page and repeat the whole process. When I do this synchronously, one link takes 5-10 seconds to download (the site is slow even when I open a page in a browser). So I looked for an asynchronous way to implement this, and with it all 10 links together take 5-10 seconds to download. I have to loop through 100 pages, and it took about 30 minutes to process all the data.
I don't have much experience with Tasks in C#, so I wrote this code and it works, but I'm not sure whether it's good or whether a better solution exists.
using System;
using System.Collections.Generic;
using System.Net.Http;
using System.Threading.Tasks;

class Program
{
    public static List<Task<string>> taskList = new List<Task<string>>();
    public static List<Data> webData = new List<Data>();

    public static async Task<string> GetHtmlText(string link)
    {
        using (HttpClient client = new HttpClient())
        {
            return await client.GetStringAsync(link);
        }
    }

    public static void Main(string[] args)
    {
        for (int i = 0; i < 100; i++)
        {
            List<string> links = GetLinksFromPage(i); // returns 10 links from page // replaced with edit solution >>>

            taskList.Clear(); // don't re-process tasks left over from the previous page
            foreach (var link in links)
            {
                Task<string> task = Task.Run(() => GetHtmlText(link));
                taskList.Add(task);
            }
            Task.WaitAll(taskList.ToArray()); // replaced with edit solution <<<

            foreach (Task<string> task in taskList)
            {
                string html = task.Result; // task has completed, so Result doesn't block
                Data data = GetDataFromHtml(html);
                webData.Add(data);
            }
            ...
        }
    }
}
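One thing I'm also unsure about: I create a new HttpClient per request. From what I've read, HttpClient is intended to be instantiated once and reused (creating one per request can exhaust sockets under load), so a minimal sketch of that change would look like this (sharedClient is just a name I picked for the sketch):

    // Sketch: one shared HttpClient for the whole run instead of one per request.
    // GetStringAsync is safe to call from multiple tasks concurrently.
    private static readonly HttpClient sharedClient = new HttpClient();

    public static Task<string> GetHtmlText(string link)
    {
        return sharedClient.GetStringAsync(link);
    }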
EDIT: This made my day: setting ServicePointManager.DefaultConnectionLimit to 50.

    ServicePointManager.DefaultConnectionLimit = 50; // default is 2 per host on .NET Framework

    var concurrentBag = new ConcurrentBag<string>();
    var t = linksFromPage.Select(async link =>
    {
        var response = await GetLinkStringTaskAsync(link);
        concurrentBag.Add(response);
    });
    await Task.WhenAll(t);
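For completeness, here is a sketch of what the whole loop looks like with the edit applied: a shared HttpClient, the raised connection limit, and Task.WhenAll per page. GetLinksFromPage, GetDataFromHtml, and Data are the same placeholders as above, and async Main needs C# 7.1 or later.

    using System;
    using System.Collections.Concurrent;
    using System.Collections.Generic;
    using System.Linq;
    using System.Net;
    using System.Net.Http;
    using System.Threading.Tasks;

    class Program
    {
        private static readonly HttpClient client = new HttpClient();

        public static async Task Main(string[] args)
        {
            // Raise the per-host connection limit; on .NET Framework it defaults
            // to 2, which throttles concurrent downloads to the same site.
            ServicePointManager.DefaultConnectionLimit = 50;

            var webData = new List<Data>();
            for (int i = 0; i < 100; i++)
            {
                List<string> links = GetLinksFromPage(i); // placeholder from the question

                // Download all 10 links concurrently; ConcurrentBag is safe to
                // add to from the concurrent continuations.
                var htmlPages = new ConcurrentBag<string>();
                var downloads = links.Select(async link =>
                {
                    string html = await client.GetStringAsync(link);
                    htmlPages.Add(html);
                });
                await Task.WhenAll(downloads);

                foreach (string html in htmlPages)
                {
                    webData.Add(GetDataFromHtml(html)); // placeholder from the question
                }
            }
        }
    }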