1

I want to perform integral calculations using rectangles and trapezoids method using multiple threads to get faster results. Unfortunately, in my case, executing multi-threaded code turns out to be slower than standard sequential code. Using multiple threads is much slower than a single thread - after all, shouldn't it be the other way around? It feels like the more threads, the slower the code executes.

In addition, I noticed that the more threads, the less precise the result of the integral is. This is especially noticeable when calculating the integral using the trapezoid method.

Here's my code: https://dotnetfiddle.net/jEPURO

using System;
using System.Diagnostics;
using System.Threading;
using System.Threading.Tasks;

namespace ParallelProgramming.ConsoleApp
{
    class Program
    {
        public static string IntegrationMethod { get; set; }
        public static double IntervalBegin { get; set; }
        public static double IntervalEnd { get; set; }
        public static int NPrecisionValue { get; set; }
        public static bool IsParallel { get; set; }
        public static int ThreadValue { get; set; }
        public static Stopwatch Stopwatch { get; set; }
        public static double Result { get; set; }

        static void Main(string[] args)
        {
            Console.WriteLine("Function                                  | Elapsed Time     | Estimated Integral");
            Console.WriteLine("-----------------------------------------------------------------");

            IntervalBegin = 5;
            IntervalEnd = -2;
            NPrecisionValue = 100000000;

            //RectangularIntegration – Sequential
            NumericalIntegrationMethods integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.RectangularIntegration(IntervalBegin, IntervalEnd, NPrecisionValue);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.RectangularIntegration)} – Sequential | {Stopwatch.Elapsed} | {Result}");

            //RectangularIntegrationParallel - 1 thread
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.RectangularIntegrationParallel(IntervalBegin, IntervalEnd, NPrecisionValue, 1);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.RectangularIntegrationParallel)} – 1 Thread | {Stopwatch.Elapsed} | {Result}");

            //RectangularIntegrationParallel - 2 threads
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.RectangularIntegrationParallel(IntervalBegin, IntervalEnd, NPrecisionValue, 2);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.RectangularIntegrationParallel)} – 2 Threads | {Stopwatch.Elapsed} | {Result}");

            //RectangularIntegrationParallel - 3 threads
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.RectangularIntegrationParallel(IntervalBegin, IntervalEnd, NPrecisionValue, 3);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.RectangularIntegrationParallel)} – 3 Threads | {Stopwatch.Elapsed} | {Result}");

            //RectangularIntegrationParallel - 4 threads
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.RectangularIntegrationParallel(IntervalBegin, IntervalEnd, NPrecisionValue, 4);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.RectangularIntegrationParallel)} – 4 Threads | {Stopwatch.Elapsed} | {Result}");

            //TrapezoidalIntegration - Sequential 
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.TrapezoidalIntegration(IntervalBegin, IntervalEnd, NPrecisionValue);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.TrapezoidalIntegration)} – Sequential | {Stopwatch.Elapsed} | {Result}");

            //TrapezoidalIntegrationParallel – 1 Thread
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.TrapezoidalIntegrationParallel(IntervalBegin, IntervalEnd, NPrecisionValue, 1);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.TrapezoidalIntegrationParallel)} – 1 Thread | {Stopwatch.Elapsed} | {Result}");

            //TrapezoidalIntegrationParallel – 2 Threads
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.TrapezoidalIntegrationParallel(IntervalBegin, IntervalEnd, NPrecisionValue, 2);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.TrapezoidalIntegrationParallel)} – 2 Threads | {Stopwatch.Elapsed} | {Result}");
            
            //TrapezoidalIntegrationParallel – 3 Threads
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.TrapezoidalIntegrationParallel(IntervalBegin, IntervalEnd, NPrecisionValue, 3);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.TrapezoidalIntegrationParallel)} – 3 Threads | {Stopwatch.Elapsed} | {Result}");

            //TrapezoidalIntegrationParallel – 4 Threads
            integral = new();
            Stopwatch = Stopwatch.StartNew();
            Result = integral.TrapezoidalIntegrationParallel(IntervalBegin, IntervalEnd, NPrecisionValue, 4);
            Stopwatch.Stop();

            Console.WriteLine($"{nameof(integral.TrapezoidalIntegrationParallel)} – 4 Threads | {Stopwatch.Elapsed} | {Result}");

            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
    }
    public class NumericalIntegrationMethods
    {
        double Function(double x)
        {
            return x * x + 2 * x;
        }

        public double RectangularIntegration(double xp, double xk, int n)
        {
            double dx, integral = 0;
            dx = (xk - xp) / n;

            for (int i = 1; i <= n; i++)
            {
                integral += dx * Function(xp + i * dx);
                //Console.WriteLine("Sekwencyjnie - iteracja {0} wątek ID: {1}", i, Thread.CurrentThread.ManagedThreadId);
            }

            return integral;
        }

        public double TrapezoidalIntegration(double xp, double xk, int n)
        {
            double dx, integral = 0;
            dx = (xk - xp) / n;

            for (int i = 1; i <= n; i++)
            {
                integral += Function(xp + i * dx);
                //Console.WriteLine("Sekwencyjnie - iteracja {0} wątek ID: {1}", i, Thread.CurrentThread.ManagedThreadId);
            }

            integral += (Function(xp) + Function(xk)) / 2;
            integral *= dx;

            return integral;
        }

        public double RectangularIntegrationParallel(double xp, double xk, int n, int maxThreads)
        {
            double dx, integral = 0;
            dx = (xk - xp) / n;

            Parallel.For(1, n + 1, new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, i =>
            {
                integral += dx * Function(xp + i * dx);
                //Console.WriteLine("Równolegle - iteracja {0} wątek ID: {1}", i, Thread.CurrentThread.ManagedThreadId);
            });

            return integral;
        }

        public double TrapezoidalIntegrationParallel(double xp, double xk, int n, int maxThreads)
        {
            double dx, integral = 0;
            dx = (xk - xp) / n;

            Parallel.For(1, n + 1, new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, i =>
            {
                integral += Function(xp + i * dx);
                //Console.WriteLine("Równolegle - iteracja {0} wątek ID: {1}", i, Thread.CurrentThread.ManagedThreadId);
            });

            integral += (Function(xp) + Function(xk)) / 2;
            integral *= dx;

            return integral;
        }
    }
}


And here's output:

Function                                  | Elapsed Time     | Estimated Integral
-----------------------------------------------------------------
RectangularIntegration – Sequential | 00:00:00.9284260 | -65.33333210831276
RectangularIntegrationParallel – 1 Thread | 00:00:01.7040507 | -65.33333210831276
RectangularIntegrationParallel – 2 Threads | 00:00:01.7191484 | -65.33333210831276
RectangularIntegrationParallel – 3 Threads | 00:00:01.6888398 | -57.73164823448317
RectangularIntegrationParallel – 4 Threads | 00:00:01.5530828 | -65.33333210831276
TrapezoidalIntegration – Sequential | 00:00:00.7278303 | -65.33333333332568
TrapezoidalIntegrationParallel – 1 Thread | 00:00:01.4265208 | -65.33333333332568
TrapezoidalIntegrationParallel – 2 Threads | 00:00:02.3009881 | -33.110522448239216
TrapezoidalIntegrationParallel – 3 Threads | 00:00:01.6062253 | -57.02137898750542
TrapezoidalIntegrationParallel – 4 Threads | 00:00:01.9967140 | -18.120285251376426

Why is this happening? What am I doing wrong? After all, the more threads used, the faster the results should be. 4 threads should be faster than 3, and 3 threads should be faster than 2 and so on. How can I get faster results using more threads?

Sisimośki
  • 143
  • 1
  • 10
  • 1
    Honestly, I wouldn't trust these numbers at all - there's no warmup for JIT and thread-pool growth. If you care about perf, benchmarkdotnet (freely available on NuGet) is highly recommended (it is pretty easy to get started with, there's examples on the GitHub page). I'd also be interested to see whether SIMD can be used here (via spans and vectors) – Marc Gravell Mar 21 '21 at 19:31
  • If you are really concerned about high performance, then C# is probably not the right language to use. – Neil Mar 21 '21 at 19:34
  • @Neil what programming language would you suggest for making high performance applications? – Theodor Zoulias Mar 21 '21 at 19:39
  • Depends on your algorithm, but C++/AMP so that it runs massively parallel on your GPU would be my choice. – Neil Mar 21 '21 at 19:40
  • @Neil I agree, low-level programming languages would be a better choice, but I wanted to see how .NET deals with that kind of tasks. I still don't believe - even for higher-level programming languages - that these results are correct. I expected that these results would be slower than say C++, but I don't expected that multithread would be slower than single thread. – Sisimośki Mar 21 '21 at 19:41
  • As @MarcGravell suggests, rather than just running each test once, run it 100 or 1000s times and then take the average. There are many reasons why one particular run might be affected by other OS/drive/memory issues. Take a look at Benchmark.NET that might help. – Neil Mar 21 '21 at 19:44
  • @MarcGravell I've run multiple times and each time sequential and 1-thread code execution was faster than multithread. – Sisimośki Mar 21 '21 at 19:46
  • Loading the app into memory 1000 times is very different from loading the application once and running the actual algorithm 1000 times. – Neil Mar 21 '21 at 19:48
  • 1
    @Neil the C++ vs C# thing is not as obvious as that - for example large chunks of the CLR internals have recently been moved from C++ to C# and seen *increases* in performance. It is contextual, of course, but the current JIT and other capabilities: mean it isn't a simple race to bet on. – Marc Gravell Mar 21 '21 at 19:49
  • Sure, but if you can go from a few CPU threads running simultaneously to 1000s of GPU threads running simultaneously, then the difference will be noticeable. Obviously it depends on the algorithm, memory requirements etc. – Neil Mar 21 '21 at 19:51
  • 1
    @Neil I don't think that the OP will be helped much by switching from the `Stopwatch` to the Benchmark.NET for their measurements. The `Stopwatch` is good enough when you want a quick and rough estimation of which method is faster, and in this case it's clear that the single-threaded version is faster than the multithreaded version. – Theodor Zoulias Mar 21 '21 at 19:54
  • Related: [Disappointing performance with Parallel.For](https://stackoverflow.com/questions/10846550/disappointing-performance-with-parallel-for), and also [Parallel.ForEach Slower than ForEach](https://stackoverflow.com/questions/6036120/parallel-foreach-slower-than-foreach). – Theodor Zoulias Mar 21 '21 at 19:57
  • 2
    Unless Parallel.For works some magic that I'm not aware of, it looks like your parallel implementation might generate a TON of contention around the "integral" variable. Each loop is very simple and short, but they all compete read/writes of this shared variable. I would try to sum up everything in non-shared variables, and then sum those up after the individual iterations are done. Also, IF parallel for spawns up new threads (not sure if that's the case), that would incur a solid amount of overhead. And, as pointed out, make sure everything is warmed up etc. – Bogey Mar 21 '21 at 20:03
  • 3
    ps: Is what you're doing even thread-safe? "+=" is not an atomic operation, unless there's some further synchronization magic going on behind the scenes, you might end up with invalid results – Bogey Mar 21 '21 at 20:05
  • 3
    @Bogey there is no magic. The OP's parallel code is not thread-safe, and produces incorrect results. – Theodor Zoulias Mar 21 '21 at 20:13
  • @TheodorZoulias how can I make thread-safe code? What do I need to change? – Sisimośki Mar 21 '21 at 21:03
  • 1
    Check this https://learn.microsoft.com/en-us/dotnet/standard/parallel-programming/how-to-speed-up-small-loop-bodies?redirectedfrom=MSDN – Genusatplay Mar 21 '21 at 21:17
  • 1
    Completely not thread-safe so it's unsurprising you're seeing those results. You are effectively making `integral` a global variable and accessing it without any memory-barrier. A memory-barrier will slow it down even further, but at least you get correct results. I think you need a way for all threads to execute independently of each other and combine the results afterwards, otherwise you're wasting your time (and I think C/C++ would have similar results) – Charlieface Mar 21 '21 at 21:24

1 Answers1

5

Here is a way to parallelize the calculation in a way that allows each thread to work independently, with minimal interference from the other threads, using thread-local state (the accumulator argument). In general the less each thread has to step on each other's toes, the more efficient your parallel code becomes.

public double RectangularIntegrationParallel(double xp, double xk, int n, int maxThreads)
{
    double dx, integral = 0;
    dx = (xk - xp) / n;
    var locker = new object();

    Parallel.ForEach(Partitioner.Create(0, n + 1), new ParallelOptions
    {
        MaxDegreeOfParallelism = maxThreads
    }, () => 0.0D, (range, state, accumulator) =>
    {
        for (int i = range.Item1; i < range.Item2; i++)
        {
            accumulator += dx * Function(xp + i * dx);
        }
        return accumulator;
    }, accumulator =>
    {
        lock (locker) { integral += accumulator; }
    });
    return integral;
}

The intention of using the Partitioner.Create method is to chunkify the workload. Instead of invoking the lambda for each tiny loop of the calculation, you can use the Partitioner to split the total range of the calculation into sub-ranges, and invoke the lambda once for each sub-range. Lambda invocations cannot be inlined by the compiler, so in general you would like to avoid calling a lambda millions of times per second.

The Parallel.ForEach overload that is used in this example has this signature:

public static ParallelLoopResult ForEach<TSource, TLocal>(
    Partitioner<TSource> source,
    ParallelOptions parallelOptions,
    Func<TLocal> localInit,
    Func<TSource, ParallelLoopState, TLocal, TLocal> body,
    Action<TLocal> localFinally);

An alternative to the Parallel class is to use PLINQ. In general PLINQ produces more concise and easier to understand code, usually at the cost of some additional overhead.

public double RectangularIntegrationParallel(double xp, double xk, int n, int maxThreads)
{
    double dx = (xk - xp) / n;

    return Partitioner.Create(0, n + 1)
        .AsParallel()
        .WithDegreeOfParallelism(maxThreads)
        .Select(range =>
        {
            double integral = 0.0;
            for (int i = range.Item1; i < range.Item2; i++)
            {
                integral += dx * Function(xp + i * dx);
            }
            return integral;
        })
        .Sum();
}
Theodor Zoulias
  • 34,835
  • 7
  • 69
  • 104