2

I am working on an asp.net MVC-5 web application, and based on some articles i read that i should not use Parallel methods inside web servers and inside .net web applications espically. now in my case i have around 1,500 WebClient() calls that i need to issue inside a foreach, and then deserialize the return json object from the WebClient() calls. my original code before using Parallel.Foreach was as follow, which took around 15 minutes to complete:-

    public async Task <List<Details2>> Get()
            {       

              try
                {

                    using (WebClient wc = new WebClient()) 
                    {
                        string url = currentURL + "resources?AUTHTOKEN=" + pmtoken;
                        var json = await wc.DownloadStringTaskAsync(url);
                        resourcesinfo = JsonConvert.DeserializeObject<ResourcesInfo>(json);

                    }


                    ForEach( var c in resourcesinfo.operation.Details)
                   {

                        ResourceAccountListInfo resourceAccountListInfo = new ResourceAccountListInfo();
                        using (WebClient wc = new WebClient()) 
                        {

                            string url = currentURL + "resources/" + c.RESOURCEID + "/accounts?AUTHTOKEN=" + pmtoken;
                            string tempurl = url.Trim();



                            var json =  await wc.DownloadStringTaskAsync(tempurl);
                            resourceAccountListInfo = JsonConvert.DeserializeObject<ResourceAccountListInfo>(json);


                        }

                   if (resourceAccountListInfo.operation.Details.CUSTOMFIELD.Count > 0)
                    {
                        List<CUSTOMFIELD> customfield = resourceAccountListInfo.operation.Details.CUSTOMFIELD.Where(a =>
                                 a.CUSTOMFIELDLABEL.ToLower() == "name"
                                ).ToList();
                        if (customfield.Count == 1)
                        {
                            PMresourcesOnly.Add(resourceAccountListInfo.operation.Details);

                        }

                    }

                   }//end of foreach             

                    return PMresourcesOnly.ToList();

                }
                catch (Exception e)
                {
                }
                return new List<Details2>();
            }

now i did the following modifications :-

  • i replace foreach with Parallel.ForEach
  • since i should not use async methods inside Parallel.ForEach so i chnage the DownloadStringTaskAsync to DownloadString inside the Parallel.Foreach :-

    public async Task <List<Details2>> Get()
            {
    
    
    
                try
                {
    
                    using (WebClient wc = new WebClient()) 
                    {
                        string url = currentURL + "resources?AUTHTOKEN=" + pmtoken;
                        var json = await wc.DownloadStringTaskAsync(url);
                        resourcesinfo = JsonConvert.DeserializeObject<ResourcesInfo>(json);
    
                    }
    
    
                    Parallel.ForEach(resourcesinfo.operation.Details, new ParallelOptions { MaxDegreeOfParallelism = 7 }, (c) =>
                    {
    
                        ResourceAccountListInfo resourceAccountListInfo = new ResourceAccountListInfo();
                        using (WebClient wc = new WebClient()) 
                        {
    
                            string url = currentURL + "resources/" + c.RESOURCEID + "/accounts?AUTHTOKEN=" + pmtoken;
                            string tempurl = url.Trim();
    
    
    
                            var json =  wc.DownloadString(tempurl);
                            resourceAccountListInfo = JsonConvert.DeserializeObject<ResourceAccountListInfo>(json);
    
    
                        }
    
                    if (resourceAccountListInfo.operation.Details.CUSTOMFIELD.Count > 0)
                    {
                        List<CUSTOMFIELD> customfield = resourceAccountListInfo.operation.Details.CUSTOMFIELD.Where(a =>
                                 a.CUSTOMFIELDLABEL.ToLower() == "name"
                                ).ToList();
                        if (customfield.Count == 1)
                        {
                            PMresourcesOnly.Add(resourceAccountListInfo.operation.Details);
    
                        }
    
                    }
    
    
    
                    });//end of foreach
    
    
    
    
                return PMresourcesOnly.ToList();
    
                }
                catch (Exception e)
                {
                }
                return new List<Details2>();
            }
    

now when i use the Parallel.Foreach the execution time was reduced from 15 minutes to around 7 minutes. but i am a bit confused if my second method is valid , so can anyone adivce on these questions (or any question):-

  1. is using Parallel.Foreach with Webclient() a valid approach to follow ? or i should avoid using Parallel methods inside .net and web applications?

  2. when using Parallel.Foreach could i face any problem such as that the return PMresourcesOnly.ToList(); is return to the client while there are still some wc.DownloadString(tempurl); that did not complete?

  3. if i want to compare the 2 methods (Parallel.Foreach & Foreach) will the result be the same ?

  4. on some online articles they use Task.Factory.StartNew(() instead of using Parallel.foreach so what are the main differences between them ?

EDIT I tried defining the SemaphoreSlim as follow:-

public async Task <List<Details2>> Get()
{
SemaphoreSlim throttler = new SemaphoreSlim(initialCount: 15);       
  try
  {
//code goes here

var tasks = resourcesinfo.operation.Details.Select(c => TryDownloadResourceAsync(c.RESOURCEID,throttler)).ToList();
}

///---

private async Task<Details2> TryDownloadResourceAsync(string resourceId, SemaphoreSlim throttler)
        {
            await throttler.WaitAsync();
try
            {
                using (WebClient wc = new WebClient()) //get the tag , to check if there is a server with the same name & tag..
                {}
             }
 finally
            {
                throttler.Release();
            }
John John
  • 1
  • 72
  • 238
  • 501
  • 1
    Parallel.ForEach is the right approach. It is easier to use in your case. You can also use Task.Factory.StartNew but you must add also "Wait until all finish". That's why Parallel.ForEach is easier to use. I use also Parallel.ForEach with WebClient. If you use Parallel, then beware of synchronizing your code. Don't forget to use lock. –  Jun 29 '16 at 00:23
  • @Stanley can you advice more on "If you use Parallel, then beware of synchronizing your code. Don't forget to use lock." as in my case i am usinf sync methods inside paralle.foreach ,, but where i should place the lock and why i should use lock ? – John John Jun 29 '16 at 00:26
  • PMresourcesOnly.Add(resourceAccountListInfo.operation.Details); must be locked. You are adding to a collection from multiple threads. See this: https://msdn.microsoft.com/en-us/library/c5kehkcz.aspx –  Jun 29 '16 at 00:28
  • @Stanley so what is the problem if i am adding to it a collection from multiple threads ??? not sure what is the idea of using lock? – John John Jun 29 '16 at 00:37
  • Read this first: https://msdn.microsoft.com/en-us/library/mt679037.aspx –  Jun 29 '16 at 00:38
  • If you are entering critical section without synchronizing, then the result is "unpredictable data corruption" –  Jun 29 '16 at 00:40
  • @Stanley but based on my test the final result of my 2 methods (PMresourcesOnly) is the same, but in different order, but i did not face any data corruption even i am not using lock.. so not sure why i need to use lock – John John Jun 29 '16 at 00:43
  • Please read the link I sent you. It says "unpredictable data corruption". Multiple threads always consider to synchronize. If PMresourcesOnly is Thread Safe, then you don't need lock. Otherwise lock is very important to make it Thread Safe. –  Jun 29 '16 at 00:58
  • @Stanley so what determince if PMresourcesOnly is thread safe or not ? i did not get your point ? and what do you mean by Multiple threads always consider to synchronize?? – John John Jun 29 '16 at 01:00
  • I cannot describe it here. It will be very very long. Please learn first about "Multithreading". –  Jun 29 '16 at 01:03

2 Answers2

11

is using Parallel.Foreach with Webclient() a valid approach to follow ? or i should avoid using Parallel methods inside .net and web applications?

No, you absolutely should avoid using parallel methods inside ASP.NET apps.

on some online articles they use Task.Factory.StartNew(() instead of using Parallel.foreach so what are the main differences between them ?

Parallel is for data parallism (running the same CPU-bound code over a collection of data items). StartNew is for dynamic task parallelism (running the same or different CPU-bound code over a collection of items that changes as you process it).

Neither approach is appropriate here, since the work you have to do is I/O-bound, not CPU-bound.

What you actually want is concurrency (doing multiple things at a time), not parallelism. Instead of using parallel concurrency (doing multiple things at a time by using multiple threads), what you want is asynchronous concurrency (doing multiple things at a time using no threads).

Asynchronous concurrency is possible in code via await Task.WhenAll, as such:

private async Task<string> TryDownloadResourceAsync(string resourceId)
{
  ResourceAccountListInfo resourceAccountListInfo = new ResourceAccountListInfo();
  using (WebClient wc = new WebClient()) 
  {
    string url = currentURL + "resources/" + resourceId + "/accounts?AUTHTOKEN=" + pmtoken;
    string tempurl = url.Trim();

    var json =  await wc.DownloadStringTaskAsync(tempurl);
    resourceAccountListInfo = JsonConvert.DeserializeObject<ResourceAccountListInfo>(json);
  }

  if (resourceAccountListInfo.operation.Details.CUSTOMFIELD.Count > 0)
  {
    List<CUSTOMFIELD> customfield = resourceAccountListInfo.operation.Details.CUSTOMFIELD.Where(a =>
        a.CUSTOMFIELDLABEL.ToLower() == "name"
    ).ToList();
    if (customfield.Count == 1)
    {
      return resourceAccountListInfo.operation.Details;
    }
  }
  return null;
}

public async Task <List<Details2>> Get()
{       
  try
  {
    using (WebClient wc = new WebClient()) 
    {
      string url = currentURL + "resources?AUTHTOKEN=" + pmtoken;
      var json = await wc.DownloadStringTaskAsync(url);
      resourcesinfo = JsonConvert.DeserializeObject<ResourcesInfo>(json);
    }

    var tasks = resourcesinfo.operation.Details.Select(c => TryDownloadResourceAsync(c.RESOURCEID)).ToList();
    var results = await Task.WhenAll(tasks).Select(x => x != null);
    return results.ToList();
  }
  catch (Exception e)
  {
  }
  return new List<Details2>(); // Please, please don't do this in production.
}

As a final note, you may want to look into HttpClient, which was designed for asynchronous operations and has the nice property that you only need one of them for any number of simultaneous calls.

Stephen Cleary
  • 437,863
  • 77
  • 675
  • 810
  • 1
    thanks for your valuable reply.. i though i am in the right track of using Paralle.Foreach since it reduce the execution time by almost 50% compared to just using foreach .. but now you mentioned that using Parallel is not the correct approach to follow, and instead i need to use Task.WhenAll .. so can you adivce on these points please . 1) currently the "resourcesinfo.operation.Details.Select(c => TryDownloadResourceAsync(c.RESOURCEID)).ToList();" will call the TryDownloadResourceAsyncaround 1,500 time,, – John John Jun 29 '16 at 11:57
  • so how i can define the max number of concurrent threads as i do inside the paralle.foreach (MaxDegreeOfParallelism)? is this possible. second question 2) i still have some problem in understanding the differences between using Parallel.foreach and using Task.whenAll.. now in parallel processing multiple webclient will be initiate and executed at the same time (max of 7 in my case) instead of one webclient call which will improve the execution time.. but how will Task.WhenAll work , it will execute multiple tasks at the same time is this correct ? so it is a parallel processing ? – John John Jun 29 '16 at 12:00
  • third questio 3) is your approach of using Task.WhenAll similarity to using TPL DataFlow, ??? – John John Jun 29 '16 at 12:01
  • fourth question. i am afraid that using your code i will end up having 1,500 active tasks are being executed at the same time? while when i use parallel.foreach i can control this value using MaxDegreeOfParallelism .. so will i face any problem if the "await Task.WhenAll " will mainly initiate 1,500 tasks at the same time ? – John John Jun 29 '16 at 12:25
  • 2
    @johnG: 1) One way to throttle asynchronous concurrency is by using `SemaphoreSlim`; there are several answers on SO that show how. 2) Asynchronous tasks do not "execute", so they [do not use up a thread](http://blog.stephencleary.com/2013/11/there-is-no-thread.html). 3) Not really; TPL Dataflow gives you a mesh, which you *can* use as a producer/consumer queue. Asynchronous concurrency starts them all and then (asynchronously) waits for them all to complete, so there's no queueing. You can do throttling, which looks sort of like a queue if you tilt your head and squint a bit. – Stephen Cleary Jun 29 '16 at 12:42
  • 2
    @johnG: 4) The number of tasks is immaterial; they take up very few resources (a bit of memory, that's it). This is because tasks do not "execute". You can throttle (see (1)), which will allow you to limit the number of *web requests* that go out simultaneously (for 1500 requests, it's likely the server may timeout on some of them). But you don't throttle the number of tasks, because there's no benefit to doing so. As a final note, you may be interested in [my book](http://stephencleary.com/book/), which covers parallel, dataflow, and async, including throttling for each. – Stephen Cleary Jun 29 '16 at 12:44
  • so you mean there is not a huge advantage of using SemaphoreSlim in my case ?as the Webcleint will not start executing unless there are available threads on iis to handle it ? second point regarding the server timeout now i tested my original Paralle.foreach which have MaxDegreeOfParallelism = 7 . and i did not face any time out ,,, so will this differ when using Task.WhenAll? – John John Jun 29 '16 at 13:00
  • i tried running the Task.WhenAll on 1500 requests and i get exceptions, i am sure this is related to the face that 1500 webclient requests are being initiated at the same time ,maybe the 3rd party api will not be able to handle them !!, while when i use Parallel.Foreach with MaxdegreeOfParallelisin =7 i did not get any exception at the same time it improved the execution time... so not sure what i need to do now? i like the Task.whenAll since it allow me to run async methods unlike Parallel.Foreach ,,so to overcome the Task.whenAll problem i need to use SemaphoreSlim is this correct ? – John John Jun 29 '16 at 13:22
  • 1
    @johnG: Yes, if you want to throttle the number of requests, you should use `SemaphoreSlim`. – Stephen Cleary Jun 29 '16 at 14:21
  • But can you adivce how i can define a SemaphoreSlim in my case ? i find this link http://stackoverflow.com/questions/10806951/how-to-limit-the-amount-of-concurrent-async-i-o-operations but not sure how i can apply this approach to the code you provided ? – John John Jun 29 '16 at 14:37
  • @johnG: Try the template in [this answer](http://stackoverflow.com/questions/33174548/linq-calling-task-run-accessing-wrong-member-deferred-execution-issue/33174665#33174665). – Stephen Cleary Jun 29 '16 at 14:45
  • i am not sure how the SemaphoreSlim will work in the link you provided as we need to pass the same instance between the 2 methods other wise a new SemaphoreSlim will be created,, now can you check my Edit to my original question to see how i define the SemaphoreSlim in the code you provided ? – John John Jun 29 '16 at 15:08
  • @johnG: Why not just declare the `SemaphoreSlim` as a private member in your class? – Stephen Cleary Jun 29 '16 at 15:09
  • you are 100% correct, i miss read your example.. so now one final note before accepting your answer,, do i need to use lock inside your code ? i do not think it is valid correct ? – John John Jun 29 '16 at 15:16
  • No lock is necessary. – Stephen Cleary Jun 29 '16 at 18:03
  • so i am not sure i got your point correctly,,, you mean in the sample code you provided ,,, you did not use LOCK ,, but you are saying that lock is necessary .. so i am not sure where to place the LOCK inside the code and why ? – John John Jun 30 '16 at 13:59
  • 1
    @johnG: No, I'm saying that lock is *not* necessary. – Stephen Cleary Jun 30 '16 at 14:11
  • ok thanks for your great help,, your appraoch worked fine in my case – John John Jun 30 '16 at 14:34
0

Take a look at:
object syncObj = new object();
lock(syncObj)

try
{
    using (WebClient wc = new WebClient()) 
    {
        string url = currentURL + "resources?AUTHTOKEN=" + pmtoken;
        var json = await wc.DownloadStringTaskAsync(url);
        resourcesinfo = JsonConvert.DeserializeObject<ResourcesInfo>(json);
    }

    object syncObj = new object();  // create sync object
    Parallel.ForEach(resourcesinfo.operation.Details, new ParallelOptions { MaxDegreeOfParallelism = 7 }, (c) =>
    {
        ResourceAccountListInfo resourceAccountListInfo = new ResourceAccountListInfo();
        using (WebClient wc = new WebClient()) 
        {
            string url = currentURL + "resources/" + c.RESOURCEID + "/accounts?AUTHTOKEN=" + pmtoken;
            string tempurl = url.Trim();

            var json =  wc.DownloadString(tempurl);
            resourceAccountListInfo = JsonConvert.DeserializeObject<ResourceAccountListInfo>(json);
        }

        lock(syncObj)  // lock using sync object
        {
            PMresourcesOnly.Add(resourceAccountListInfo.operation.Details);
        }
    });//end of foreach

    return PMresourcesOnly.ToList();
}
catch (Exception e)
{
}
  • still i am not sure why i need to use lock ? as based on my test my above 2 methods return the same number of objects but in different order,, so i did not face any corrupted data , even i am not using lock ... so can you advice on the purpose of using lock ? – John John Jun 29 '16 at 00:59
  • Again, read this first: msdn.microsoft.com/en-us/library/mt679037.aspx –  Jun 29 '16 at 01:03
  • now based on the other answer, it is suggested not to use Parallel.foreach , and instead i should use Task.whenAll as it is more suitable to .net and web application,, can you adivce on this please? – John John Jun 29 '16 at 13:34
  • I'm still prefer Parallel.ForEach...:=) –  Jun 29 '16 at 13:40
  • can you provide why you still prefer Paralle.Foreach over Task.WhenAll – John John Jun 29 '16 at 14:15
  • Because I use Parallel.Foreach a lot and never use Task.WhenAll, only Task.Run. I choose always the easiest one, not the best one. –  Jun 29 '16 at 14:17