0

I'm having an issue with the following code. The code works with no errors but I'm receiving different output values when using a parallel for loop vs a regular for loop. I need to get the parallel for loop working properly because I run this code thousands of times. Does anyone know why my parallel for loop is returning different outputs?

// Gate object used by the Parallel.For body below; only the press1 accumulation
// genuinely needs it — the other lock regions guard iteration-local state.
private object _lock = new object();

// Computes leave-one-out PRESS two ways (parallel and sequential) and derives a
// predicted R-squared from each, for comparison.
// NOTE(review): declared to return double but contains no return statement, and it
// references `listMRInfo` and `vectorArray`, which are not defined in this snippet —
// this will not compile as posted.
public double CalculatePredictedRSquared()
    {
        // press1 is accumulated by the parallel loop, press2 by the sequential
        // loop; press is unused here.
        double press = 0, tss = 0, press2 = 0, press1 = 0;
        // Synthetic data: output = 0..399, input1 = 0..399, input2 = 200..599.
        Vector<double> output = CreateVector.Dense(Enumerable.Range(0, 400).Select(i => Convert.ToDouble(i)).ToArray());
        List<double> input1 = new List<double>(Enumerable.Range(0, 400).Select(i => Convert.ToDouble(i)));
        List<double> input2 = new List<double>(Enumerable.Range(200, 400).Select(i => Convert.ToDouble(i)));

            // Parallel leave-one-out loop: for each i, refit without row i and
            // predict row i.
            Parallel.For(0, output.Count, i =>
            {
                // NOTE(review): ConcurrentBag<T> is documented as an UNORDERED
                // collection — enumerating vectorArrayBag is not guaranteed to
                // preserve output's element order, so ElementAt(i) and the
                // Where((v, j) => j != i) filter below may select different
                // elements than the sequential loop does. This alone can explain
                // the diverging results.
                ConcurrentBag<MultipleRegressionInfo> listMRInfoBag = new ConcurrentBag<MultipleRegressionInfo>(listMRInfo);
                ConcurrentBag<double> vectorArrayBag = new ConcurrentBag<double>(output);
                ConcurrentBag<double[]> matrixList = new ConcurrentBag<double[]>();

                // NOTE(review): matrixList is a fresh, iteration-local bag — no
                // other thread can see it, so this lock guards nothing shared.
                lock (_lock)
                {
                    matrixList.Add(input1.Where((v, k) => k != i).ToArray());
                    matrixList.Add(input2.Where((v, k) => k != i).ToArray());
                }

                // NOTE(review): column order of this matrix depends on bag
                // enumeration order, which is unspecified — the coefficient at
                // items[0] may belong to input2 rather than input1.
                var matrixArray2 = CreateMatrix.DenseOfColumnArrays(matrixList);
                var actualResult = vectorArrayBag.ElementAt(i);
                var newVectorArray = CreateVector.Dense(vectorArrayBag.Where((v, j) => j != i).ToArray());
                var items = FindBestMRSolution(matrixArray2, newVectorArray);
                double estimate1 = 0;

                if (items != null)
                {
                    // estimate1 is a local; this lock is unnecessary.
                    lock (_lock)
                    {
                        // Inline y-intercept computation: mean(y) minus the sum of
                        // each column mean times its coefficient.
                        var y = 0d;
                        var independentCount = matrixArray2.RowCount;
                        var dependentCount = newVectorArray.Count;

                        if (independentCount == dependentCount)
                        {
                            var populationCount = independentCount;
                            y = newVectorArray.Average();

                            for (int l = 0; l < matrixArray2.ColumnCount; l++)
                            {
                                var avg = matrixArray2.Column(l).Average();
                                y -= avg * items[l];
                            }
                        }

                        // Prediction = coefficient-weighted regressors at index i.
                        for (int m = 0; m < 2; m++)
                        {
                            var coefficient = items[m];

                            if (m == 0)
                            {
                                estimate1 += input1.ElementAt(i) * coefficient;
                            }
                            else
                            {
                                estimate1 += input2.ElementAt(i) * coefficient;
                            }
                        }

                        // The parallel path ADDS the intercept here; the
                        // sequential loop below computes y but never adds it —
                        // a second source of divergence between the two paths.
                        estimate1 += y;
                    }
                }
                else
                {
                    // Local assignment — lock is unnecessary here too.
                    lock (_lock)
                    {
                        estimate1 = 0;
                    }
                }

                // This is the only place a lock is actually required: press1 is
                // shared mutable state accumulated across threads.
                lock (_lock)
                {
                    press1 += Math.Pow(actualResult - estimate1, 2);
                }
            });

            // Sequential reference implementation of the same leave-one-out loop.
            for (int i = 0; i < output.Count; i++)
            {
                List<double[]> matrixList = new List<double[]>();
                matrixList.Add(input1.Where((v, k) => k != i).ToArray());
                matrixList.Add(input2.Where((v, k) => k != i).ToArray());
                var matrixArray = CreateMatrix.DenseOfColumnArrays(matrixList);
                var actualResult = output.ElementAt(i);
                var newVectorArray = CreateVector.Dense(output.Where((v, j) => j != i).ToArray());
                var items = FindBestMRSolution(matrixArray, newVectorArray);
                double estimate = 0;

                if (items != null)
                {
                    // NOTE(review): y is computed but never used — the estimate
                    // here omits the intercept that the parallel path included.
                    var y = CalculateYIntercept(matrixArray, newVectorArray, items);
                    for (int m = 0; m < 2; m++)
                    {
                        var coefficient = items[m];

                        if (m == 0)
                        {
                            estimate += input1.ElementAt(i) * coefficient;
                        }
                        else
                        {
                            estimate += input2.ElementAt(i) * coefficient;
                        }
                    }
                }
                else
                {
                    estimate = 0;
                }

                press2 += Math.Pow(actualResult - estimate, 2);
            }

            // NOTE(review): `vectorArray` is not defined anywhere in this
            // snippet — presumably `output` was intended.
            tss = CalculateTotalSumOfSquares(vectorArray.ToList());
            var test1 = 1 - (press1 / tss);
            var test2 = 1 - (press2 / tss);
}

// Solves the least-squares system via QR decomposition.
// Returns null when the solve produced any non-finite coefficient.
public Vector<double> CalculateWithQR(Matrix<double> x, Vector<double> y)
    {
        var result = MultipleRegression.QR(x, y);

        // A NaN or infinite coefficient means the decomposition failed
        // numerically — treat the whole solution as unusable.
        foreach (var coefficient in result)
        {
            if (Double.IsNaN(coefficient) || Double.IsInfinity(coefficient))
            {
                return null;
            }
        }

        return result;
    }

    // Solves the least-squares system via the normal equations.
    // Returns null when the solve produced any non-finite coefficient.
    public Vector<double> CalculateWithNormal(Matrix<double> x, Vector<double> y)
    {
        var result = MultipleRegression.NormalEquations(x, y);

        // Reject numerically failed solutions (NaN / infinity in any slot).
        foreach (var coefficient in result)
        {
            if (Double.IsNaN(coefficient) || Double.IsInfinity(coefficient))
            {
                return null;
            }
        }

        return result;
    }

    // Solves the least-squares system via singular value decomposition.
    // Returns null when the solve produced any non-finite coefficient.
    public Vector<double> CalculateWithSVD(Matrix<double> x, Vector<double> y)
    {
        var result = MultipleRegression.Svd(x, y);

        // Reject numerically failed solutions (NaN / infinity in any slot).
        foreach (var coefficient in result)
        {
            if (Double.IsNaN(coefficient) || Double.IsInfinity(coefficient))
            {
                return null;
            }
        }

        return result;
    }

    // Tries the regression solvers in order (normal equations, then SVD, then
    // QR) and returns the first usable solution; each Calculate* method returns
    // null when its solve produced non-finite coefficients. Returns null if all
    // three fail.
    public Vector<double> FindBestMRSolution(Matrix<double> x, Vector<double> y)
    {
        return CalculateWithNormal(x, y)
            ?? CalculateWithSVD(x, y)
            ?? CalculateWithQR(x, y);
    }

// Total sum of squares: the sum of squared deviations of each value from the
// mean of the list. Returns 0 for a null or empty input (the original returned
// 0 for an empty list because its loop body never ran).
public double CalculateTotalSumOfSquares(List<double> dependentVariables)
    {
        if (dependentVariables == null || dependentVariables.Count == 0)
        {
            return 0;
        }

        // Hoist the mean out of the loop: it is loop-invariant, and calling
        // Average() once per element made the original accidentally O(n^2).
        double mean = dependentVariables.Average();
        double tts = 0;

        foreach (double value in dependentVariables)
        {
            double deviation = value - mean;
            tts += deviation * deviation;
        }

        return tts;
    }

Actual Output (Updated results):

test1 = 137431.12889999992 (parallel for loop)

test2 = 7.3770258447689254E-… (regular for loop; the exponent was truncated in transcription)

Epilogue: How to setup an MCVE-compliant testing

A fair way to make this fully reproducible is to prepare the MCVE code together with its A/B/C/... data sets inside a ready-to-run [IDE & testing sandbox, hyperlinked here][1], so that community members can click a re-run button and focus on root-cause analysis rather than on decoding and re-engineering heaps of incomplete code.

If this runs for the O/P, it will run for other Community Members, whom the O/P has asked for an answer or help.

Try it online!

My new version of the code:

// Computes predicted R-squared over synthetic data: 1 - PRESS / TSS, where
// PRESS is the sum of squared leave-one-out prediction errors (one DoIt call
// per observation) and TSS is the total sum of squares of the outputs.
public double CalculatePredictedRSquared()
    {
        // Synthetic inputs: output = 0..399, input1 = 0..399, input2 = 200..599.
        Vector<double> output = CreateVector.Dense(Enumerable.Range(0, 400).Select(i => Convert.ToDouble(i)).ToArray());
        List<double> input1 = new List<double>(Enumerable.Range(0, 400).Select(i => Convert.ToDouble(i)));
        List<double> input2 = new List<double>(Enumerable.Range(200, 400).Select(i => Convert.ToDouble(i)));
        double tss = CalculateTotalSumOfSquares(output.ToList());

        // Accumulate PRESS sequentially, one leave-one-out term per index.
        double press = 0;
        for (int i = 0; i < output.Count; i++)
        {
            press += DoIt(i, output, input1, input2);
        }

        return 1 - (press / tss);
    }

    // Computes one leave-one-out PRESS term: fits a two-variable regression on
    // every observation except index i, predicts observation i, and returns the
    // squared prediction error. Returns (actual - 0)^2 when no solver produced
    // a usable solution.
    public double DoIt(int i, Vector<double> output, List<double> input1, List<double> input2)
    {
        List<double[]> matrixList = new List<double[]>
                {
                    input1.Where((v, k) => k != i).ToArray(),
                    input2.Where((v, k) => k != i).ToArray()
                };
        var matrixArray = CreateMatrix.DenseOfColumnArrays(matrixList);
        var actualResult = output.ElementAt(i);
        var newVectorArray = CreateVector.Dense(output.Where((v, j) => j != i).ToArray());
        var items = FindBestMRSolution(matrixArray, newVectorArray);
        double estimate = 0;

        if (items != null)
        {
            // Prediction = sum of (coefficient * regressor value at i) ...
            for (int m = 0; m < 2; m++)
            {
                var coefficient = items[m];

                if (m == 0)
                {
                    estimate += input1.ElementAt(i) * coefficient;
                }
                else
                {
                    estimate += input2.ElementAt(i) * coefficient;
                }
            }

            // ... plus the intercept. BUG FIX: the original computed the
            // intercept but never added it, so every prediction was offset;
            // the earlier parallel variant of this code did include it.
            estimate += CalculateYIntercept(matrixArray, newVectorArray, items);
        }

        return Math.Pow(actualResult - estimate, 2);
    }
DarthVegan
  • 1,719
  • 7
  • 25
  • 42
  • 3
    I see external calls in the parallel code, like `FindBestMRSolution`, are you sure those things are thread-safe in that they can be safely used in this manner/context? – Lasse V. Karlsen Nov 12 '17 at 20:46
  • 5
    In any case, please post a [mcve]. – Lasse V. Karlsen Nov 12 '17 at 20:46
  • Would you mind to also post the test1 / test2 data, that were processed, so as to **complete the MCVE-compliant formulation also with the set setup + data**? Otherwise the values -2.34, resp. -4.72 have no value for re-testing. How about Parallel updates without atomic operations? **An irreproducible code** ( not a re-run ready test ) **is of negative value here, at StackOverflow ...** – user3666197 Nov 12 '17 at 20:47
  • @LasseVågsætherKarlsen I edited my question to show the code for the external calls. Because I have so many custom classes and custom calculations, the only way to post a complete example would be to share my entire project so I'm not sure what to do at this point :( – DarthVegan Nov 12 '17 at 21:00
  • 4
    _"the only way to post a complete example would be ..."_ - No, learn to reduce the problem to less than ~30 lines. Start with `File|New Project`. – H H Nov 12 '17 at 21:16
  • 3
    If you cannot narrow it down there's not much anyone here can do to help you. The typical reason why a parallel piece of code produces a different result than a sequential version of the same is that you've messed up shared state, so that multiple parallel threads change and use the same values. Also, have you verified which of the two values are correct? – Lasse V. Karlsen Nov 12 '17 at 21:18
  • @user3666197 I'm not sure how to add the references to math.net and for Ienumerable so I edited my above code to use some junk data for the inputs and outputs so that the code can be tested. This should work for everyone I believe because I'm not using any custom classes – DarthVegan Nov 13 '17 at 04:21
  • Great to start moving in the right direction. Where is the ( *updated* ) state-full-IDE-SandBox URL hyperlink to click and re-run it? – user3666197 Nov 13 '17 at 04:26
  • @user3666197 I updated the link to show what I changed everything to but I'm not sure how to include the dll for math.net numerics to get it to compile correctly – DarthVegan Nov 13 '17 at 04:36
  • @LasseVågsætherKarlsen The sequential version is producing the correct result and I added a link to my code where it compiles – DarthVegan Nov 13 '17 at 21:07
  • 2
    It is immediately obvious that `matrixList` is used both inside and outside of locks. This looks very, very wrong. Explain please why a concurrent bag *requires* a lock in some cases, and may be accessed *outside* the lock in others. Why is it not simply a list, accessed unlocked, as in the non-parallel case? – Eric Lippert Nov 14 '17 at 21:24
  • 2
    Correctness aside: *how do you expect to get an advantage from parallelism when almost the entire body of the parallel method is serialized on a lock*? – Eric Lippert Nov 14 '17 at 21:26
  • @EricLippert It is simple to explain. I'm a complete beginner with parallel/multithreading and so you are saying I don't need to use locks after all? – DarthVegan Nov 14 '17 at 21:34
  • 3
    I'm not saying anything other than *this code at first glance looks badly broken*. If you are a beginner at parallelism then *start with an easier problem than this and work your way up*. If you are unable to explain things like "why did I use a concurrent bag?" and "what's my locking strategy intended to accomplish?" then you will not be successful writing this program. Multithreaded programming is *hard*: there are multiple threads of control, and *all your instincts from single threaded programming are wrong*. – Eric Lippert Nov 14 '17 at 21:44
  • @EricLippert I understand why I'm using concurrent bags but I only inserted the locks as a test to see why I was getting different results. – DarthVegan Nov 14 '17 at 21:46
  • OK then: why are you using concurrent bags? Under what circumstances is any bag accessed by multiple threads? – Eric Lippert Nov 14 '17 at 21:47
  • @EricLippert Taking the locks away doesn't change the returned data and I was trying to find the cause for the differences. Concurrent bags are thread safe collections used for multithreading. I'm using them because they were made for multithreading – DarthVegan Nov 14 '17 at 21:53
  • But... you're not accessing them on multiple threads, right? – Eric Lippert Nov 14 '17 at 21:58
  • @EricLippert I'm accessing them inside the parallel for loop which uses different threads so yes – DarthVegan Nov 14 '17 at 21:59
  • 1
    **Is any one instance of a bag ever accessed on two threads at once**? I can't see how one would be, but perhaps I am missing something. – Eric Lippert Nov 14 '17 at 21:59
  • @EricLippert I don't know then – DarthVegan Nov 14 '17 at 22:00
  • 6
    Seriously: start with a much simpler problem. You're trying to fix things by randomly making stuff threadsafe and randomly introducing locks; you will fail if you keep doing so. Writing correct multithreaded programs requires *a detailed understanding of exactly how both memory and control flow works in C#*. – Eric Lippert Nov 14 '17 at 22:03
  • @EricLippert I can't. I am taking a class and I have to write a program that uses variable selection to find the set of variables that produces the highest predicted rsquared value and I have over 50,000 combinations to process and each combination has thousands of data points to check which means multithreading is my only option – DarthVegan Nov 14 '17 at 22:05
  • Are you **sure** that the non-parallel algorithm produces the correct result? Because it doesn't look right to me. The non-parallel version of the algorithm never resets `estimate` between iterations of the loop unless `items` is `null`, but the parallel version correctly does reset `estimate1` to zero every parallel iteration of the loop. Your parallel and non-parallel versions do not do the same thing. – Eric Lippert Nov 14 '17 at 22:09
  • 2
    You certainly *can* start with a simpler problem and work your way up. **There is always a simpler problem that you know how to solve**. Start with a *simple* multithreaded problem that you *know* you can code correctly, and that you can write a program that *asserts* its correctness. Then *gradually modify that correct program into another correct program*. – Eric Lippert Nov 14 '17 at 22:10
  • 1
    The fundamental problem with your program is not that you've made a mess of the parallelism. The fundamental problem is that you've written a loop body that attempts to do far, far too much work. **Extract the loop body into a method that you can test**, and then write tests for it. You'll find that your program becomes much easier to debug. – Eric Lippert Nov 14 '17 at 22:13
  • @EricLippert I didn't catch the error where I wasn't resetting estimate for each loop and I fixed that but I'm still getting very different results. – DarthVegan Nov 14 '17 at 22:16
  • 4
    No, hold on. **You said that the value returned by the non-parallel loop was known to be correct**. Were you *wrong*? You can't possibly be successful trying to debug a program whose correct output you don't even know. **Approach debugging as an engineering discipline**. You need to start by *refactoring this program into pure methods that can be tested*, and then *testing the heck out of them*. **Do not attempt to parallelize a wrong algorithm**. You said before that the "right answer" was *trillions of times larger than the new "right answer"*. You need to *understand the math*. – Eric Lippert Nov 14 '17 at 22:19
  • 1
    A "pure" method is a method that (1) modifies no state, and (2) always returns the same value when given the same inputs. Mathematical functions are ideal candidates for pure functions. Pure functions are good because they are easy to debug, easy to test, and can be safely parallelized because they do not modify state. – Eric Lippert Nov 14 '17 at 22:20
  • @EricLippert Yes I was wrong but just so you know the full details, my original method that I created broke down the predicted rsquared calculation and I had calls to external methods but to make it easier for you guys to read it, I put it all back into one method. The predicted rsquared looks accurate now but I'm focusing on the parallel portion to see where I went wrong – DarthVegan Nov 14 '17 at 22:26
  • 1
    @user3610374: You've handed us a 200 line program and asked why it does not work. My request to you is to produce a 20 line program that exhibits the same problem. For example, replace `for (int m = 0; m < 2; m++){var coefficient = items[m];if (m == 0){estimate1 += input1.ElementAt(i) * coefficient;}else{estimate1 += input2.ElementAt(i) * coefficient;}}` with `estimate1 += (input2.ElementAt(i) + input1.ElementAt(i)) * items[m];` (and then try deleting that line and seeing if parallel implementations still produce different results than serial implementations). – Brian Nov 14 '17 at 22:39
  • 1
    @user3610374: Assuming your computer has 4 cores, the best speedup you're going to get from parallelism is 4X (well, technically that's not true due to hyper-threading and the like, but it's true _enough_). Excepting where the class is directly teaching parallelism, It is rare for homework problems to have both sufficient computing power and sufficient data for a problem to not be tractable unless parallelism is added to the mix. Normally, either the problem is intractable with parallelism (find a different algorithm entirely) or the problem is tractable without parallelism. – Brian Nov 14 '17 at 22:45
  • I notice that you recompute the average of `dependentVariables` every time through the loop, instead of once outside the loop. Look for places where you are doing computations too many times and eliminate them. `CalculateTotalSumOfSquares` could be rewritten as `var m = dependentVariables.Average(); return dependentVariables.Select(x => (x - m) * (x - m)).Sum();` – Eric Lippert Nov 14 '17 at 23:39
  • 1
    Finally, read this: https://ericlippert.com/2014/03/05/how-to-debug-small-programs/ Follow good engineering practices and you will be able to find your own bugs rather than asking on SO. – Eric Lippert Nov 14 '17 at 23:41
  • @EricLippert I'm not sure what you mean when you say that I'm recomputing the average every time through the loop. I only calculate the total sum of squares once before the loop – DarthVegan Nov 14 '17 at 23:54
  • **Read the code again**. You have a call to `dependentVariables.Average` in an inner loop. That is *very expensive*. The average does not change throughout the loop, but it is O(n) to compute, so that's an O(n squared) operation. – Eric Lippert Nov 15 '17 at 00:31
  • 1
    You're not approaching this *seriously*. Programming is an *engineering discipline*. Proceed *carefully*, and make sure that *every step along the way you have a justification for why the code is correct and fast*. The time to stop making careless errors and changing code at random in the hopes that it will work is *now*. If you double the amount of time you spend up front writing your code to ensure it is correct and testable, you will spend ten times less time debugging your careless mistakes. – Eric Lippert Nov 15 '17 at 00:36
  • 1
    Don't throw parallelism at a problem until you are *sure* that you have already worked out all the careless recomputations; your computation of the sum of squares of differences is *hundreds* of times slower than it should be, and you're trying to make it *tens* of times faster with parallelism. Again, **concentrate your effort in the right places to make it efficient the first time**. Parallelism will not turn a fundamentally slow algorithm into a fast one. – Eric Lippert Nov 15 '17 at 00:41
  • @EricLippert I misread what you originally wrote and I was looking at the wrong place but I did notice what you were talking about and I fixed that method. I fixed the problem with the sum of squares by taking the average line out and changing it from a loop to a linq statement unless I'm missing something else – DarthVegan Nov 15 '17 at 00:58
  • @Brian With all due respect, Brian you are wrong. There is no such code, that could achieve 4x speedup if going from a pure-[SERIAL] to a "just"-[CONCURRENT] or a true-[PARALLEL] process scheduling on 4 CPUs -- **read both the original Amdahl's Law formulation and it's cricitism** due to both the add-on overheads and process-atomicity of further indivisible transactions fro even infinite number of processors ... **on >>>** https://stackoverflow.com/tags/parallelism-amdahl/info – user3666197 Nov 15 '17 at 05:00
  • 4
    @user3666197: You are missing Brian's point entirely, which is that there is a huge set of problems that are both highly parallelizable, **and** where adding a reasonable amount of parallelism doesn't help. If going parallel takes your algorithm from running in 2 minutes to running in 30 seconds, **who cares**? You could have burned those extra 90 seconds no problem. If going parallel takes your algorithm from running in 20 years to running in 5 years, **who cares?** You don't have 5 years to wait. Only a very small number of problems are *exactly slow enough* to make it worthwhile. – Eric Lippert Nov 15 '17 at 14:17
  • Honestly speaking, I did not try to review any other Brian's insights except his first sentence "***Assuming your computer has 4 cores, the best speedup you're going to get from parallelism is 4X***" which **does not hold** for whatever {slow|fast|any}-problem. On your remarks, I do advocate on proper designs for problems, where waiting +128 hours on a few TB ( ill-designed ) matrix-processing is principally not acceptable, while a smart-process may deliver results in reasonable time right by the art of professional process-scheduling + latency masking, so **there are businesses who do care** – user3666197 Nov 15 '17 at 14:28
  • That's at least twice now that you've responded to a comment without reading it fully or understanding it. You're asking for help and people are being generous; you could respect that by thinking carefully about what they have to say. – Eric Lippert Nov 15 '17 at 14:47
  • 1
    @user3666197: You're missing my point entirely, but I'll address your specific criticism nonetheless. I said that 4X was the *best* you could do. That 4X is a theoretical maximum rather than a truly reachable maximum is largely irrelevant. Obviously in cases where overheads are high, you will do much worse than 4X. But I said 4X was the best, not a guaranteed results. In cases were parallelism overheads are negligible, you can easily reach 3.9X. I'm rounding up, but you can't refute an argument by saying the claims were unfairly _weakened_. – Brian Nov 16 '17 at 14:56
  • @Brian Let's revise facts: I try to best of my imagination, how could one present a **speedup "can easily reach ( speedup ) 3.9X**". In order to go quantitative, kindly review the math of the process scheduling - **Section Epilogue** ( +URL there ) in **https://stackoverflow.com/a/46124635** There an extreme case of [PAR] having an ultimately extreme value of **p == 1** and yet even a microscopic amount of overheads **o** visibly lower the ceiling of the achievable speedup. Next, move **p** just to 0.99 and check the devastated values achievable just from 1% [SERIAL] fraction of the process. – user3666197 Nov 16 '17 at 15:29
  • Can you show `CreateMatrix.DenseOfColumnArrays`? If this is doing any kind of matrix multiplication or anything, this'll break. – EJoshuaS - Stand with Ukraine Nov 16 '17 at 16:13

2 Answers2

12

This whole thing is a dog's breakfast; you should abandon that attempt at parallelism entirely.

Start over. Here's what I want you to do. I want you to write a method DoIt that returns double and takes a int i and whatever other state is required to do a single iteration of the loop.

You will then rewrite your method as follows:

public double CalculatePredictedRSquared()
{
    Vector<double> output = whatever;
    // Whatever other state you need here
    IEnumerable<int> range = Enumerable.Range(0, output.Count);
    var query = range.Select(i => DoIt(i, whatever_other_state));
    return query.Sum();
}

Got it? DoIt is the thing that is inside your loop right now. It must take in i, and output and whatever other vectors you need to pass into it. It must only compute a double -- in this case, the square of the estimate error -- and return that double.

It must be pure: It must not read or write any non-local variable, it must not call any non-pure method, and it must give exactly the same results when given the same inputs, every time. Pure methods are the easiest methods to write, to read, to understand, to test and to parallelize; always try to write pure methods when doing math computations.

Write test cases for DoIt, and test the heck out of it. It's a pure method; you should be able to write lots of test cases. Similarly test any of the pure methods called by DoIt.

Once you are satisfied that DoIt is both correct and pure, then the magic happens. Just change it to:

range.AsParallel().Select...

Then compare the parallel and non-parallel versions. They should produce the same result; if not, then something was impure. Figure out what it was.

Then, verify that the parallel version was faster. If not, then you have failed to do enough work in DoIt to justify parallelism; see https://en.wikipedia.org/wiki/Amdahl%27s_law for details.

Eric Lippert
  • 647,829
  • 179
  • 1,238
  • 2,067
  • I just updated my question to show what I wrote as you suggested. Does it look correct? – DarthVegan Nov 14 '17 at 23:32
  • 6
    @user3610374: You tell me! Did you write some tests? Does it produce the right results? – Eric Lippert Nov 14 '17 at 23:34
  • Yes I wrote some tests and I seem to be getting the right results – DarthVegan Nov 15 '17 at 01:04
  • @user3610374 While Eric has mentioned the original ( *overhead-naive, atomic-process-agnostic* ) Amdahl's Law on Wikipedia, let me direct you to **rather read both the original Amdahl's Law formulation and it's cricitism due to both the add-on overheads and process-atomicity** of further indivisible transactions from even an infinite number of processors ... **on >>> https://stackoverflow.com/tags/parallelism-amdahl/info** original formulation is often mis-led if applied with a blind belief neither overheads nor indivisible-transactions exist, but they do and do matter. Anyway, stay tuned, men – user3666197 Nov 15 '17 at 14:39
0

A few things:

lock (_lock)
{
   matrixList.Add(input1.Where((v, k) => k != i).ToArray());
   matrixList.Add(input2.Where((v, k) => k != i).ToArray());
}

You're adding items to a collection that is already thread-safe by design, so no need to lock. While List is not thread-safe, it should be OK to read from it concurrently. From the documentation:

It is safe to perform multiple read operations on a `List<T>`, but issues can occur if the collection is modified while it's being read. To ensure thread safety, lock the collection during a read or write operation. To enable a collection to be accessed by multiple threads for reading and writing, you must implement your own synchronization.

Also note that matrixList is stored in a local variable; in this case, the collection can't be called from multiple threads because the entire body of the delegate is guaranteed to run on the same thread - it won't be the case that half of the body of the Parallel.For loop will be run on thread A and that the other half will be run on thread B, for example.

Similarly, there's no reason to lock while making changes to estimate1 because it can't possibly be modified from other threads.

Disclaimer: there's no guarantee as to the degree of parallelism of the Parallel.For loop overall. There's not even a guarantee that it will run in parallel at all.

press1 and press2, however, are not local variables, so you do need to synchronize these somehow. (It would be better if you found some way to avoid locking every time, though, because that'll at least partially kill the point of multithreading).

Perhaps most critically, ConcurrentBag is an unordered collection. You don't show all of the operations you're doing on your matrices, but if you're doing matrix multiplication anywhere this could easily cause wrong results. There is no guarantee that matrix multiplication will commute. While A * B = B * A for integers, this is not true in general for matrices. It's quite possible that your logic is subtly dependent on operations occurring in a particular order (and they won't because ConcurrentBag is unordered).