I understand the implications of using an async lambda with Parallel.ForEach
which is why I'm not using it here. This then forces me to use .Result
for each of my Tasks that make Http requests. However, running this simple scraper through the performance profiler shows that .Result
has an elapsed exclusive time % of ~98% which is obviously due to the blocking nature of the call.
My question is: is there any possibility of optimizing this for it to still be async? I'm not sure that will help in this case since it may just take this long to retrieve the HTML/XML.
I'm running a 4 core processor with 8 logical cores (hence the MaxDegreesOfParallelism = 8
. Right now I'm looking at around 2.5 hours to download and parse ~51,000 HTML/XML pages of simple financial data.
I was leaning towards using XmlReader instead of Linq2XML to speed up the parsing, but it appears the bottleneck is at the .Result
call.
And although it should not matter here, the SEC limits scraping to 10 requests/sec.
public class SECScraper
{
public event EventHandler<ProgressChangedEventArgs> ProgressChangedEvent;
public SECScraper(HttpClient client, FinanceContext financeContext)
{
_client = client;
_financeContext = financeContext;
}
public void Download()
{
_numDownloaded = 0;
_interval = _financeContext.Companies.Count() / 100;
Parallel.ForEach(_financeContext.Companies, new ParallelOptions {MaxDegreeOfParallelism = 8},
company =>
{
RetrieveSECData(company.CIK);
});
}
protected virtual void OnProgressChanged(ProgressChangedEventArgs e)
{
ProgressChangedEvent?.Invoke(this, e);
}
private void RetrieveSECData(int cik)
{
// move this url elsewhere
var url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + cik +
"&type=10-q&dateb=&owner=include&count=100";
var srBody = ReadHTML(url).Result; // consider moving this to srPage
var srPage = new SearchResultsPage(srBody);
var reportLinks = srPage.GetAllReportLinks();
foreach (var link in reportLinks)
{
url = SEC_HOSTNAME + link;
var fdBody = ReadHTML(url).Result;
var fdPage = new FilingDetailsPage(fdBody);
var xbrlLink = fdPage.GetInstanceDocumentLink();
var xbrlBody = ReadHTML(SEC_HOSTNAME + xbrlLink).Result;
var xbrlDoc = new XBRLDocument(xbrlBody);
var epsData = xbrlDoc.GetAllEPSData();
//foreach (var eps in epsData)
// Console.WriteLine($"{eps.StartDate} to {eps.EndDate} -- {eps.EPS}");
}
IncrementNumDownloadedAndNotify();
}
private async Task<string> ReadHTML(string url)
{
using var response = await _client.GetAsync(url);
return await response.Content.ReadAsStringAsync();
}
}