
I have been put into a situation where I have to probe a relatively huge dataset through a choked output: an endpoint that returns a single record per id. Iterating this endpoint to scrape all the data sequentially is obviously not a practical option, so I tried to create multiple parallelized processes, each with its own range of ids to deal with. Here is what I managed to do, which is still not working perfectly. (Refer to the questions at the bottom.)

Code Preface ↴

In this demo I used http://numbersapi.com/. It is simple: you give it a number and the API returns a random fact about it from the facts associated with it, if there are any; otherwise it returns one of four randomized phrases containing the fact-free number.
API Examples:
Number that has at least one fact → http://numbersapi.com/1
Number that has no facts associated → http://numbersapi.com/123456789

Appendix

ThreadRange: the number of items to be processed by a single thread. For example, with Offset = 2 and ThreadRange = 10, the first thread covers ids 3 through 12.

Code ↴

using System;
using System.Collections.Concurrent;
using System.Diagnostics;
using System.IO;
using System.Net;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Security.Authentication;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

namespace CSharp
{
    class Program
    {
        private static Stopwatch CrawlerStopWatch = new Stopwatch();
        private static HttpClient HttpClientCrawler = null;
        private static HttpClientHandler HttpClientCrawlerHttpsHandler = new HttpClientHandler() { SslProtocols = SslProtocols.Tls12 | SslProtocols.Tls11 | SslProtocols.Tls };
        private static BlockingCollection<Thread> CrawlerThreads = new BlockingCollection<Thread>();
        private static BlockingCollection<String> CrawlerFailuresBlockingCollection = new BlockingCollection<String>();
        private static BlockingCollection<String> CrawlerHitsBlockingCollection = new BlockingCollection<String>();
        private static BlockingCollection<String> CrawlerMissesBlockingCollection = new BlockingCollection<String>();
        private static String BaseUrl = "http://numbersapi.com/";
        private static String FullUrl = String.Concat(BaseUrl, "{0}");
        private static long ThreadsCount = 1;
        private static long ThreadRange = 10;
        private static long Offset = 2;
        private static long ItemsToProcess = ThreadsCount * ThreadRange;
        private static int MaxUrlLength = String.Format(FullUrl, ItemsToProcess).Length;
        private static ReaderWriterLockSlim CrawlerReaderWriterLockSlim = new ReaderWriterLockSlim();
        private static String CrawlerResultantFileName = "z.CrawlerResult.txt";
        public static void Main(String[] args)
        {
            CrawlerStopWatch.Start();
            //### Managing HttpClient ###/
            ServicePointManager.DefaultConnectionLimit = 50;
            if (Regex.IsMatch(BaseUrl, @"^https.*$")) { HttpClientCrawler = new HttpClient(HttpClientCrawlerHttpsHandler); } else { HttpClientCrawler = new HttpClient(); }
            HttpClientCrawler.BaseAddress = new Uri(BaseUrl);
            HttpClientCrawler.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("text/html"));
            //### Managing HttpClient ###/
            InitiateCrawler(BaseUrl, ThreadsCount, ThreadRange, Offset);
        }
        //### Crawler Methods ###
        public static void InitiateCrawler(String BaseUrl, long ThreadsCount, long ThreadRange, long Offset = 0)
        {
            Console.WriteLine("###############################");
            Console.WriteLine($"Commenced crawling the <{BaseUrl}> endpoint and working on <{ItemsToProcess}> items by creating <{ThreadsCount}> threads which each working on <{ThreadRange}> items.");
            while (CrawlerThreads.Count < ThreadsCount)
            {
                long Start = Offset + ThreadRange * CrawlerThreads.Count + 1;
                long End = Offset + ThreadRange * (CrawlerThreads.Count + 1);
                Thread CrawlerThread = new Thread(() => Crawl(Start, End));
                CrawlerThreads.Add(CrawlerThread);
                CrawlerThread.Start();
            }
            Task FinalizeCrawlerTask = Task.Run(() => { foreach (Thread CrawlerThread in CrawlerThreads) { CrawlerThread.Join(); } FinalizeCrawler(); });
            FinalizeCrawlerTask.Wait();
        }
        public static void Crawl(long Start, long End)
        {
            long Current = Start;
            while (Current <= End)
            {
                String CurrentUrlParameters = String.Format("{0}", Current);
                String CurrentUrl = $"{HttpClientCrawler.BaseAddress.AbsoluteUri}{CurrentUrlParameters}";
                String CurrentPageContent = "";
                HttpResponseMessage HttpResponseMessage = HttpClientCrawler.GetAsync(CurrentUrlParameters).Result;
                if (HttpResponseMessage.IsSuccessStatusCode)
                {
                    CurrentPageContent = Encoding.UTF8.GetString(HttpResponseMessage.Content.ReadAsByteArrayAsync().Result);
                    if (isResultRelevant(CurrentPageContent)) { HandleCrawlerRelevantResult(CurrentUrl, CurrentPageContent); } else { HandleCrawlerIrrelevantResult(CurrentUrl, CurrentPageContent); }
                }
                else
                {
                    HandleCrawlerFailure(CurrentUrl, HttpResponseMessage);
                }
                Current++;
            }
        }
        public static void HandleCrawlerFailure(String Url, HttpResponseMessage HttpResponseMessage)
        {
            CrawlerFailuresBlockingCollection.Add(Url);
            int ProcessedItems = CrawlerHitsBlockingCollection.Count + CrawlerMissesBlockingCollection.Count + CrawlerFailuresBlockingCollection.Count;
            Console.WriteLine($"[Item #{ProcessedItems.ToString().PadRight(ItemsToProcess.ToString().Length)}] {Url.PadRight(MaxUrlLength)} returned {(int)HttpResponseMessage.StatusCode} Code | {HttpResponseMessage.ReasonPhrase}");
        }
        public static Boolean isResultRelevant(String Content)
        {
            Boolean IsRelevant = true;
            String[] RegularExpressionsArray = new string[]
            {
                @"^[\d]+ is a boring number\.$",
                @"^[\d]+ is an uninteresting number\.$",
                @"^[\d]+ is an unremarkable number\.$",
                @"^[\d]+ is a number for which we're missing a fact (submit one to numbersapi at google mail!)\.$",
            };
            foreach (String RegularExpression in RegularExpressionsArray) { if (Regex.IsMatch(Content, RegularExpression)) { IsRelevant = false; break; } }
            return IsRelevant;
        }
        public static void HandleCrawlerRelevantResult(String Url, String Content)
        {
            CrawlerResultantFileWriteLine(Url);
            CrawlerHitsBlockingCollection.Add(Url);
            int ProcessedItems = CrawlerHitsBlockingCollection.Count + CrawlerMissesBlockingCollection.Count + CrawlerFailuresBlockingCollection.Count;
            Console.WriteLine($"[Item #{ProcessedItems.ToString().PadRight(ItemsToProcess.ToString().Length)}] {Url.PadRight(MaxUrlLength)} is relevant");
        }
        public static void HandleCrawlerIrrelevantResult(String Url, String Content)
        {
            CrawlerMissesBlockingCollection.Add(Url);
            int ProcessedItems = CrawlerHitsBlockingCollection.Count + CrawlerMissesBlockingCollection.Count + CrawlerFailuresBlockingCollection.Count;
            Console.WriteLine($"[Item #{ProcessedItems.ToString().PadRight(ItemsToProcess.ToString().Length)}] {Url.PadRight(MaxUrlLength)} is irrelevant");
        }
        public static void FinalizeCrawler()
        {
            CrawlerStopWatch.Stop();
            TimeSpan TimeSpan = TimeSpan.FromMilliseconds(CrawlerStopWatch.ElapsedMilliseconds);
            String TimeLapseInformation = String.Format("{0:D2}h:{1:D2}m:{2:D2}s:{3:D3}ms",
                                    TimeSpan.Hours,
                                    TimeSpan.Minutes,
                                    TimeSpan.Seconds,
                                    TimeSpan.Milliseconds);
            Console.WriteLine($"Crawling finished in {TimeLapseInformation}.");
            Console.WriteLine($"<{CrawlerFailuresBlockingCollection.Count + CrawlerHitsBlockingCollection.Count + CrawlerMissesBlockingCollection.Count}> out of <{ItemsToProcess}> items have been crawled having <{CrawlerHitsBlockingCollection.Count}> relevant items, <{CrawlerMissesBlockingCollection.Count}> irrelevant items and <{CrawlerFailuresBlockingCollection.Count}> failures.");
            Console.WriteLine("###############################");
        }
        //### Crawler Methods ###
        //### Auxiliary Methods ###
        public static void CrawlerResultantFileWriteLine(String Line)
        {
            CrawlerReaderWriterLockSlim.EnterWriteLock();
            try { using (StreamWriter StreamWriter = File.AppendText(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, CrawlerResultantFileName))) { StreamWriter.WriteLine(Line); } }
            finally { CrawlerReaderWriterLockSlim.ExitWriteLock(); }
        }
        //### Auxiliary Methods ###
    }
}

Questions

(Q1.) Is the presented approach optimal?


(Q2.) In the FinalizeCrawler() function, when the collective number of items to process across all threads is too big, the count of processed items doesn't equal the number of items designated in the InitiateCrawler(UrlHandle, ThreadsCount, ThreadRange) call in Main, even though:
(CrawlerFailuresBlockingCollection.Count + CrawlerHits.Count + CrawlerMisses.Count) has to equal (ThreadsCount * ThreadRange)
Do some items get omitted? If so, why?

(A2.) As Joshua Robinson pointed out, I was using System.Collections.Generic.List, which is not safe to modify from multiple threads. I switched to System.Collections.Concurrent.BlockingCollection, tested it with a big number like 10000, and it worked.
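
For readers hitting the same issue, here is a minimal repro of that race (the class name and item counts are illustrative, not from the question): concurrent Add calls on a List<String> silently lose items or even throw, while a BlockingCollection<String> keeps the count exact.

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Threading.Tasks;

class ListRaceRepro
{
    static void Main()
    {
        List<String> unsafeList = new List<String>();
        BlockingCollection<String> safeCollection = new BlockingCollection<String>();

        // 100 parallel workers, each adding 1000 items: 100000 items expected.
        Parallel.For(0, 100, worker =>
        {
            for (int i = 0; i < 1000; i++)
            {
                // List<T>.Add is not thread safe: an internal resize racing
                // with another writer can drop items or throw.
                try { unsafeList.Add("item"); } catch { /* corrupted state */ }
                safeCollection.Add("item");
            }
        });

        Console.WriteLine($"List<String>:               {unsafeList.Count}");     // typically < 100000
        Console.WriteLine($"BlockingCollection<String>: {safeCollection.Count}"); // always 100000
    }
}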

(Q3.) How do I pinpoint the optimal ThreadRange and ThreadsCount combination regarding performance? Which one?
- (A) Min ThreadsCount with max ThreadRange
- (B) Low ThreadsCount with high ThreadRange
- (C) Median ThreadsCount with median ThreadRange
- (D) High ThreadsCount with low ThreadRange
- (E) Max ThreadsCount with min ThreadRange
To be more elaborate: if I wanted to process 10000 items from this endpoint, these are all the possible combinations satisfying the necessary predicate ThreadsCount * ThreadRange = 10000, so I have to choose only from among them.
Which is the most optimal? And why?
<1>     ThreadsCount * <10000> ThreadRange
<2>     ThreadsCount * <5000>  ThreadRange
<4>     ThreadsCount * <2500>  ThreadRange
<5>     ThreadsCount * <2000>  ThreadRange
<8>     ThreadsCount * <1250>  ThreadRange
<10>    ThreadsCount * <1000>  ThreadRange
<16>    ThreadsCount * <625>   ThreadRange
<20>    ThreadsCount * <500>   ThreadRange
<25>    ThreadsCount * <400>   ThreadRange
<40>    ThreadsCount * <250>   ThreadRange
<50>    ThreadsCount * <200>   ThreadRange
<80>    ThreadsCount * <125>   ThreadRange
<100>   ThreadsCount * <100>   ThreadRange // Exact median → the square root of the items to process, when it is an integer
<125>   ThreadsCount * <80>    ThreadRange
<200>   ThreadsCount * <50>    ThreadRange
<250>   ThreadsCount * <40>    ThreadRange
<400>   ThreadsCount * <25>    ThreadRange
<500>   ThreadsCount * <20>    ThreadRange
<625>   ThreadsCount * <16>    ThreadRange
<1000>  ThreadsCount * <10>    ThreadRange
<1250>  ThreadsCount * <8>     ThreadRange
<2000>  ThreadsCount * <5>     ThreadRange
<2500>  ThreadsCount * <4>     ThreadRange
<5000>  ThreadsCount * <2>     ThreadRange
<10000> ThreadsCount * <1>     ThreadRange
— Mina Gerges

Comments
  • Your count is off because `List` is not safe to modify from multiple threads. [See here](https://learn.microsoft.com/en-us/dotnet/api/system.collections.generic.list-1#thread-safety). You need a thread-safe collection like those in the [System.Collections.Concurrent namespace](https://learn.microsoft.com/en-us/dotnet/api/system.collections.concurrent), or you could use some sort of locking mechanism. – Joshua Robinson Apr 07 '20 at 22:18
  • `new System.Net.WebClient().DownloadData` This is the wrong way to download data in parallel. Read the duplicates from https://stackoverflow.com/q/61021862/34092 . – mjwills Apr 07 '20 at 23:57
  • This workflow looks ideal for using the [TPL Dataflow](https://learn.microsoft.com/en-us/dotnet/standard/parallel-programming/dataflow-task-parallel-library) library. [Here](https://stackoverflow.com/questions/60929044/c-sharp-parallel-foreach-memory-usage-keeps-growing/60930992#60930992) is an example of using this library in combination with the `HttpClient` class to download pages from the web, with a configurable level of concurrency. – Theodor Zoulias Apr 08 '20 at 01:28
  • @mjwills I edited and refactored the code using HttpClient and used `ServicePointManager.DefaultConnectionLimit = 50` as you pointed out in your referenced answer's questions. Is that what you meant? – Mina Gerges Apr 08 '20 at 12:22
  • @TheodorZoulias I refactored the code with HttpClient, but I am still not sure about the TPL Dataflow library, as I couldn't make sense of how to incorporate it into the workflow. I still have to study it. Are you sure it is going to offer a significant improvement? Maybe you can offer benchmarks, or the time elapsed processing 100000 ("100k") items using TPL Dataflow with the same example? – Mina Gerges Apr 08 '20 at 12:27
  • The TPL Dataflow approach has the advantage that instead of processing your input in batches, it will process it with a specified (constant) level of concurrency. On the other hand, with the batching approach you must wait for all operations of a batch to complete before proceeding with the next batch. This may be a problem if the operations have durations of high variability. Otherwise, if there is no variability, and all operations of a batch complete at exactly the same time, then the batching approach will perform as well as the TPL Dataflow approach. – Theodor Zoulias Apr 08 '20 at 13:36
  • I understand, but isn't the approach I am using already offering roughly the same thing, as it is processing batches in a number of **`concurrent`** threads? – Mina Gerges Apr 08 '20 at 13:42
  • The TPL Dataflow is using multiple concurrent workers too. But all these workers are running constantly. When a worker finishes with an item, it gets a new item and starts processing it immediately. I haven't delved deeply into the code you posted, but judging from the sentence *"ThreadRange: Items to be processed by a single thread"* it seems that you partition your workload before starting the processing. If this is the case, then the efficiency of your solution depends on how well balanced the initial static partitioning is. The TPL Dataflow employs a dynamic partitioning strategy instead. – Theodor Zoulias Apr 08 '20 at 15:16
  • @TheodorZoulias You said `The TPL Dataflow employs a dynamic partitioning strategy instead.` Do you mean it resolves my third sub-question (Q3, at the bottom of the question) automatically? For example, if I wanted to process 10000 items I have a lot of combinations to choose from, like 100 threads * 100 items or 250 threads * 40 items; does TPL choose the right combination on its own? – Mina Gerges Apr 08 '20 at 19:26
  • No, the TPL Dataflow just ensures a constant level of concurrency (or degree of parallelism if you prefer). It doesn't try to adjust the concurrency in order to achieve optimal performance based on heuristics. For example, assuming that the remote server performs optimally when it receives 30 concurrent requests, and becomes overburdened when it receives more, it's up to you to configure the Dataflow block with the optimal `MaxDegreeOfParallelism = 30` setting (assuming that you've figured it out from experimentation; a minimal sketch of this setup follows the comment thread). – Theodor Zoulias Apr 08 '20 at 19:39
  • Okay @TheodorZoulias, I guess TPL is just a more organized way of doing what I already devised, and I am using the above code maybe two times at most to probe some data from a choked endpoint, so the return on investment of learning TPL is not that high, as I am not going to host the code above anyway. But TPL Dataflow will definitely be my first go-to when I intend to host something similar. Cheers – Mina Gerges Apr 08 '20 at 20:12
  • Yeap, the [TPL Dataflow](https://learn.microsoft.com/en-us/dotnet/standard/parallel-programming/dataflow-task-parallel-library) library definitely has some learning curve, but not nearly as steep as other libraries like the Reactive Extensions or Akka.NET. You can reasonably expect that you will be able to use it successfully and be productive after studying it for a day or two. – Theodor Zoulias Apr 08 '20 at 20:42
  • @MinaGerges `Here is what i managed to do which is still not working perfectly.` What do you mean by these words? What problems do you currently have, and what solution are you looking for? Maybe you want to increase performance, or decrease the resources (`CPU`) consumed during crawling, or decrease the number of failures. – Iliar Turdushev Apr 10 '20 at 07:16
  • @IliarTurdushev Did you check the questions that I listed at the end of the question? – Mina Gerges Apr 10 '20 at 07:18
  • @MinaGerges Of course. But you ask if your solution is optimal. I think that to answer this question it is important to know what problems you face. Maybe another approach should be used to solve these problems. Or do you just want to improve your current approach and know what strategy to use when choosing the number of crawling threads? – Iliar Turdushev Apr 10 '20 at 07:30
  • Both. As I already presented, the objective is to probe data to form a dataset from a bottlenecked output that returns one record per id. In a nutshell, the code above creates a number of threads `ThreadsCount`, each operating on a number of ids `ThreadRange`. The first two questions ask whether someone could offer a better suggestion, like Theodor Zoulias did yesterday with TPL Dataflow. The last question asks how to determine the optimal combination of the two parameters `ThreadsCount` & `ThreadRange`, with an example to make that question more elaborate. – Mina Gerges Apr 10 '20 at 07:42
  • Why are you doing this when there is already a [batch API](http://numbersapi.com/#batching) of the format `http://numbersapi.com/number/type`? – Wyck Apr 16 '20 at 19:54
  • @Wyck There may be a batch API in the demo API (`numbersapi.com`), but the APIs I actually intend to work with surely don't have that format. – Mina Gerges Apr 16 '20 at 22:04
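
For comparison, here is a minimal sketch of the TPL Dataflow approach described in the comments above, assuming the same numbersapi endpoint; the id range and the MaxDegreeOfParallelism value are illustrative, and that single knob replaces the ThreadsCount/ThreadRange pair (requires the System.Threading.Tasks.Dataflow NuGet package).

using System;
using System.Net.Http;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;

class DataflowCrawler
{
    static async Task Main()
    {
        HttpClient httpClient = new HttpClient { BaseAddress = new Uri("http://numbersapi.com/") };

        // One pool of workers with a constant level of concurrency. No static
        // partitioning of ids into per-thread ranges is needed: whenever a
        // worker finishes an item it immediately picks up the next one.
        ActionBlock<long> crawlBlock = new ActionBlock<long>(async id =>
        {
            HttpResponseMessage response = await httpClient.GetAsync(id.ToString());
            string content = await response.Content.ReadAsStringAsync();
            Console.WriteLine(response.IsSuccessStatusCode
                ? $"{id}: {content}"
                : $"{id}: failed with {(int)response.StatusCode}");
        }, new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 30 }); // tune experimentally

        for (long id = 1; id <= 10000; id++) { crawlBlock.Post(id); }
        crawlBlock.Complete();
        await crawlBlock.Completion;
    }
}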

2 Answers


(Q1.) Is the presented approach optimal?

I have a suggestion for how to enhance the current solution. Currently all crawling threads process their http responses by themselves. Moreover, while processing their http responses they try to acquire an exclusive lock (ReaderWriterLockSlim.EnterWriteLock) to write results to the file.

I suggest making the crawling threads only send http requests and immediately add the http responses into some collection. Another thread (let's call it the processing thread) will process these http responses (extract the data, parse it, write it to the file, etc.). This approach gives us the following benefits:

  1. Crawling threads will not waste their time processing http responses. They will only execute the most expensive operations: sending http requests and receiving http responses.
  2. An exclusive lock will not be needed to process http responses. The processing thread will be able to process http responses without taking any locks.

If, in your real program, processing the http responses is more time-consuming than in the provided sample, then these benefits will decrease the overall execution time of the program.

Here are the changes you need to make in your program to implement this suggestion:

// Now we can declare these collections as simple Lists, not BlockingCollections.
private static List<Thread> CrawlerThreads = new List<Thread>();
private static List<String> CrawlerFailuresBlockingCollection = new List<String>();
private static List<String> CrawlerHitsBlockingCollection = new List<String>();
private static List<String> CrawlerMissesBlockingCollection = new List<String>();

// I found out that for my environment these values are optimal and give better performance.
private static long ThreadsCount = 200;
private static long ThreadRange = 50;

// Add this collection. Crawling threads will add http responses into it.
// And processing thread will process them.
private static BlockingCollection<HttpResponseMessage> ResponsesToProcess = new BlockingCollection<HttpResponseMessage>();

public static void Main(String[] args)
{
   ...
   // If we use 200 crawling threads then we should set
   // DefaultConnectionLimit = 200 to make it possible for all
   // crawling threads to make http requests simultaneously
   // without waiting for available connections.
   ServicePointManager.DefaultConnectionLimit = 200;
   ...
}

public static void InitiateCrawler(String BaseUrl, long ThreadsCount, long ThreadRange, long Offset = 0)
{
    Console.WriteLine("###############################");
    Console.WriteLine($"Commenced crawling the <{BaseUrl}> endpoint and working on <{ItemsToProcess}> items by creating <{ThreadsCount}> threads which each working on <{ThreadRange}> items.");

    while (CrawlerThreads.Count < ThreadsCount)
    {
        long Start = Offset + ThreadRange * CrawlerThreads.Count + 1;
        long End = Offset + ThreadRange * (CrawlerThreads.Count + 1);

        Thread CrawlerThread = new Thread(() => Crawl(Start, End));
        CrawlerThreads.Add(CrawlerThread);
        CrawlerThread.Start();
    }

    Task FinalizeCrawlerTask = Task.Run(() =>
    { 
        foreach (Thread CrawlerThread in CrawlerThreads) 
        { 
            CrawlerThread.Join(); 
        }
        // Notify the processing thread that there are no more
        // http responses to process.
        ResponsesToProcess.CompleteAdding();
    });

    // Processing thread.
    Task ProcessResponsesThread = Task.Run(() =>
    {
        foreach (var HttpResponseMessage in ResponsesToProcess.GetConsumingEnumerable())
        {
            string CurrentUrl = HttpResponseMessage.RequestMessage.RequestUri.AbsoluteUri;

            if (HttpResponseMessage.IsSuccessStatusCode)
            {
                string CurrentPageContent = Encoding.UTF8.GetString(HttpResponseMessage.Content.ReadAsByteArrayAsync().Result);
                if (isResultRelevant(CurrentPageContent))
                {
                    HandleCrawlerRelevantResult(CurrentUrl, CurrentPageContent);
                }
                else
                {
                    HandleCrawlerIrrelevantResult(CurrentUrl, CurrentPageContent);
                }
            }
            else
            {
                HandleCrawlerFailure(CurrentUrl, HttpResponseMessage);
            }
        }
    });

    FinalizeCrawlerTask.Wait();
    ProcessResponsesThread.Wait();

    // Now we print the results of the program here because we must
    // ensure that finalizer and processing threads have finished.
    FinalizeCrawler();
}

public static void Crawl(long Start, long End)
{
    long Current = Start;
    while (Current <= End)
    {
        String CurrentUrlParameters = String.Format("{0}", Current);
        HttpResponseMessage HttpResponseMessage = HttpClientCrawler.GetAsync(CurrentUrlParameters).Result;
        // Now the crawling thread only adds the http response into the collection
        // of http responses to process. It doesn't process the responses itself.
        ResponsesToProcess.Add(HttpResponseMessage);
        Current++;
    }
}

public static void CrawlerResultantFileWriteLine(String Line)
{
    // Now we don't need a lock. We can delete it.
    using (StreamWriter StreamWriter = File.AppendText(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, CrawlerResultantFileName)))
    {
        StreamWriter.WriteLine(Line);
    }
}

(Q3.) How do I pinpoint the optimal ThreadRange and ThreadsCount combination regarding performance? Which one?

I tested your sample and found out that there is no rule of thumb for how to choose ThreadsCount. Here are the results of the tests in my environment (PC):

  • ThreadsCount = 20, ExecutionTime = 07m:40s
  • ThreadsCount = 25, ExecutionTime = 07m:00s
  • ThreadsCount = 50, ExecutionTime = 03m:30s
  • ThreadsCount = 100, ExecutionTime = 01m:30s
  • ThreadsCount = 200, ExecutionTime = 01m:10s
  • Increasing ThreadsCount further does not improve ExecutionTime

I think that to find an optimal value of ThreadsCount you should test different values of this parameter in your environment and choose the best one.

The only thing I want to highlight is that it is important to set ServicePointManager.DefaultConnectionLimit equal to or larger than ThreadsCount. If ServicePointManager.DefaultConnectionLimit is less than ThreadsCount, then some of the threads will wait for available connections without doing useful work.
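
To make "test different values and choose the best" concrete, here is a sketch of such a sweep; RunCrawl is a hypothetical placeholder standing in for one complete crawl (for example, the InitiateCrawler call from the question), and the candidate values are illustrative.

using System;
using System.Diagnostics;
using System.Net;

class ThreadsCountSweep
{
    private const long ItemsToProcess = 10000;

    static void Main()
    {
        foreach (long threadsCount in new long[] { 20, 25, 50, 100, 200 })
        {
            long threadRange = ItemsToProcess / threadsCount;

            // Keep the connection limit at least as large as the thread count,
            // so no crawling thread idles waiting for an available connection.
            ServicePointManager.DefaultConnectionLimit = (int)threadsCount;

            Stopwatch stopwatch = Stopwatch.StartNew();
            RunCrawl(threadsCount, threadRange); // hypothetical: blocks until the crawl completes
            stopwatch.Stop();

            Console.WriteLine($"ThreadsCount={threadsCount,4} ThreadRange={threadRange,5} => {stopwatch.Elapsed}");
        }
    }

    private static void RunCrawl(long threadsCount, long threadRange)
    {
        // Placeholder: wire this up to a full crawl, e.g. InitiateCrawler(BaseUrl, threadsCount, threadRange).
    }
}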

— Iliar Turdushev
  • Good call having separate threads to handle the response content, as this approach won't force the other threads to wait for the file lock. I didn't think equalizing `ServicePointManager.DefaultConnectionLimit` with `ThreadsCount` was a good idea at first; it turns out I was wrong on that one. Anyway, these are the logs for your code while processing 10000 items, 50 items for each of the 200 threads: `Crawling finished in 00h:01m:20s:840ms.` `<10000> out of <10000> items have been crawled having <2711> relevant items, <6993> irrelevant items and <296> failures.` 3x faster than my code – Mina Gerges Apr 18 '20 at 01:13
  • By the way, I am curious why your code commits fewer failures while crawling; with the same test I had roughly 700~800 and yours had 200~300. Here is a [Code Snippet](https://dotnetfiddle.net/gOx13B) of your code integrated with mine. – Mina Gerges Apr 18 '20 at 01:18
  • `i am curious why your code commits less failures while crawling like i had with the same test` How many times did you run the test? I ran the test with the approach I suggested several times and got a different number of failures every time: 544, 1, 123, 645. I think the number of failures depends on how the numbers api behaves. Interestingly, for a smaller number of crawling threads I got more failures. For example, for 50 threads I ran the test two times and got 834 and 1026 failures. So I think this is not a problem of the approach (yours or the suggested one) but of the numbers api. – Iliar Turdushev Apr 19 '20 at 15:05
  • I ran both programs five times; mine was giving 700~800 and yours 200~300. And yes, this api fails with a 5xx code when it is overloaded, but I wanted to make sure there weren't other reasons. – Mina Gerges Apr 19 '20 at 18:40

This is what I would have written to scrape a bunch of requests. It's plenty efficient, and compared to what you wrote, I also think this approach is much simpler. It relies on Tasks to take advantage of CPU parallelism (this example is not CPU bound; it is bound by the responsiveness of the remote server), and you can manually set the number of simultaneous connections to use. (I coded this with 6, which is a typical number of connections a modern web browser would allow to each domain.)

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;

namespace ScrapeExample
{
    class Program
    {
        private const int NumberOfSimultaneousConnections = 6;
        private const int MaxResponseContentBufferSize = 4096;
        private const string uriTemplate = "http://numbersapi.com/{0}";
        private const int FirstIndexToRequest = 1;
        private const int LastIndexToRequest = 100;

        static void Main(string[] args)
        {
            HttpClientHandler hch = new HttpClientHandler() { Proxy = null, UseProxy = false };
            HttpClient[] clients = Enumerable.Range(0, NumberOfSimultaneousConnections).Select(i =>
                new HttpClient(hch) { MaxResponseContentBufferSize = MaxResponseContentBufferSize }
            ).ToArray();

            List<Task<string>> tasks = new List<Task<string>>();
            for (int i = FirstIndexToRequest; i <= LastIndexToRequest; ++i) {
                string uri = string.Format(uriTemplate, i);
                tasks.Add(ProcessURLAsync(uri, clients[i % NumberOfSimultaneousConnections]));
            }
            Task.WaitAll(tasks.ToArray());
            string[] results = tasks.Select(t => t.Result).ToArray();
            Console.WriteLine(string.Join(Environment.NewLine, results));
        }

        private async static Task<string> ProcessURLAsync(string uri, HttpClient client)
        {
            HttpResponseMessage response = await client.GetAsync(uri);
            response.EnsureSuccessStatusCode();
            byte[] content = await response.Content.ReadAsByteArrayAsync();
            return Encoding.UTF8.GetString(content);
        }
    }
}

IMPORTANT:

In practice, I find that numbersapi.com will easily get overloaded after a few hundred requests, and it will start returning 502 error codes (Bad Gateway / Service Temporarily Overloaded). So this design is missing something that would retry each item in the event of an error; I just wrote EnsureSuccessStatusCode, which throws an exception on error. But this is already outperforming the remote API that you are using as an example.
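
For what it's worth, here is a hedged sketch of one way to fill that gap, assuming the GET requests are safe to repeat: a retry wrapper around the ProcessURLAsync method above (ProcessURLWithRetryAsync is a hypothetical name, and the attempt limit and backoff schedule are illustrative, not tuned).

// Drop-in replacement for the ProcessURLAsync calls inside Main above.
private async static Task<string> ProcessURLWithRetryAsync(string uri, HttpClient client, int maxAttempts = 4)
{
    for (int attempt = 1; ; attempt++)
    {
        try
        {
            // ProcessURLAsync calls EnsureSuccessStatusCode, so a 502 from the
            // overloaded server surfaces here as an HttpRequestException.
            return await ProcessURLAsync(uri, client);
        }
        catch (HttpRequestException) when (attempt < maxAttempts)
        {
            // Back off exponentially (1s, 2s, 4s, ...) to give the server time to recover.
            await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, attempt - 1)));
        }
    }
}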

— Wyck
  • Your take on this can actually be faster than the way I approached it, but it needs some modifications, because GetAsync throws System.Net.Http.HttpRequestException, leaving lots of items unprocessed. Here is a [Code Snippet](https://dotnetfiddle.net/kffNux) that has your code integrated with mine. It is a good take though; I learned a thing or two from it. – Mina Gerges Apr 18 '20 at 02:13