0

So I have a task to calculate Euler's number using multiple threads, using this formula: sum( ((3k)^2 + 1) / ((3k)!) ), for k = 0...infinity.

import java.math.BigDecimal;
import java.math.BigInteger;
import java.io.FileWriter;
import java.io.IOException;
import java.math.RoundingMode;

class ECalculator {
  private BigDecimal sum;
  private BigDecimal[] series;
  private int length;
  
  public ECalculator(int threadCount) {
    this.length = threadCount;
    this.sum = new BigDecimal(0);
    this.series = new BigDecimal[threadCount];
    for (int i = 0; i < this.length; i++) {
      this.series[i] = BigDecimal.ZERO;
    }
  }
  
  public synchronized void addToSum(BigDecimal element) {
    this.sum = this.sum.add(element);
  }
  
  public void addToSeries(int id, BigDecimal element) {
    if (id - 1 < length) {
      this.series[id - 1] = this.series[id - 1].add(element);      
    }
  }
  
  public synchronized BigDecimal getSum() {
    return this.sum;
  }
  
  public BigDecimal getSeriesSum() {
    BigDecimal result = BigDecimal.ZERO;
    for (int i = 0; i < this.length; i++) {
      result = result.add(this.series[i]);
    }
    return result;
  }
}

class ERunnable implements Runnable {
  private final int id;
  private final int threadCount;
  private final int threadRemainder;
  private final int elements;
  private final boolean quietFlag;
  private ECalculator eCalc;

  public ERunnable(int threadCount, int threadRemainder, int id, int elements, boolean quietFlag, ECalculator eCalc) {
    this.id = id;
    this.threadCount = threadCount;
    this.threadRemainder = threadRemainder;
    this.elements = elements;
    this.quietFlag = quietFlag;
    this.eCalc = eCalc;
  }

  @Override
  public void run() {
    if (!quietFlag) {
      System.out.println(String.format("Thread-%d started.", this.id));      
    }
    long start = System.currentTimeMillis();
    int k = this.threadRemainder;
    int iteration = 0;
    BigInteger currentFactorial = BigInteger.valueOf(intFactorial(3 * k));
    
    while (iteration < this.elements) {
      if (iteration != 0) {
        for (int i = 3 * (k - threadCount) + 1; i <= 3 * k; i++) {
          currentFactorial = currentFactorial.multiply(BigInteger.valueOf(i));
        }
      }
      
      this.eCalc.addToSeries(this.id, new BigDecimal(Math.pow(3 * k, 2) + 1).divide(new BigDecimal(currentFactorial), 100, RoundingMode.HALF_UP));
      
      iteration += 1;
      k += this.threadCount;
    }
    
    long stop = System.currentTimeMillis();
    if (!quietFlag) {
      System.out.println(String.format("Thread-%d stopped.", this.id));
      System.out.println(String.format("Thread %d execution time: %d milliseconds", this.id, stop - start));      
    }
  }
  
  public int intFactorial(int n) {
    int result = 1;
    for (int i = 1; i <= n; i++) {
      result *= i;
    }
    return result;
  }
}

public class TaskRunner {
  public static final String DEFAULT_FILE_NAME = "result.txt";
  public static void main(String[] args) throws InterruptedException {

    int threadCount = 2;
    int precision = 10000;
    int elementsPerTask = precision / threadCount;
    int remainingElements = precision % threadCount;
    boolean quietFlag = false;
    
    calculate(threadCount, elementsPerTask, remainingElements, quietFlag, DEFAULT_FILE_NAME);
  }
  
  public static void writeResult(String filename, String result) {
    try {
      FileWriter writer = new FileWriter(filename);
      writer.write(result);
      writer.close();
    } catch (IOException e) {
      System.out.println("An error occurred.");
      e.printStackTrace();
    }
  }
  
  public static void calculate(int threadCount, int elementsPerTask, int remainingElements, boolean quietFlag, String outputFile) throws InterruptedException {
    long start = System.currentTimeMillis();
    Thread[] threads = new Thread[threadCount];
    ECalculator eCalc = new ECalculator(threadCount);
    
    for (int i = 0; i < threadCount; i++) {
      if (i == 0) {
        threads[i] = new Thread(new ERunnable(threadCount, i, i + 1, elementsPerTask + remainingElements, quietFlag, eCalc));
      } else {
        threads[i] = new Thread(new ERunnable(threadCount, i, i + 1, elementsPerTask, quietFlag, eCalc));        
      }
      threads[i].start();
    }
    
    for (int i = 0; i < threadCount; i++) {
      threads[i].join();
    }
    
    String result = eCalc.getSeriesSum().toString();
    
    if (!quietFlag) {
      System.out.println("E = " + result);      
    }
    
    writeResult(outputFile, result);
    
    long stop = System.currentTimeMillis();
    System.out.println("Calculated in: " + (stop - start) + " milliseconds" );
  }
}

I stripped out the prints, etc. in the code that have no effect. My problem is that the more threads I use the slower it gets. Currently the fastest run I have is for 1 thread. I am sure the factorial calculation is causing some issues. I tried using a thread pool but still got the same times.

  1. How can I make it so that running it with more threads, up until some point, will speed up the calculation process?
  2. How would one go about calculating this big factorials?
  3. The precision parameter that is passed is the amount of elements in the sum that are used. Can I set the BigDecimal scale to be somehow dependent on that precision so I don't hard code it?

EDIT I updated the code block to be in 1 file only and runnable without external libs.

EDIT 2 I found out that the factorial code messes up with the time. If I let the threads ramp up to some high precision without calculating factorials the time goes down with increasing threads. Yet I cannot implement the factorial calculating in any way while keeping the time decreasing.

EDIT 3 Adjusting code to address answers.

    private static BigDecimal partialCalculator(int start, int threadCount, int id) {
        BigDecimal nBD = BigDecimal.valueOf(start);
        
        BigDecimal result = nBD.multiply(nBD).multiply(BigDecimal.valueOf(9)).add(BigDecimal.valueOf(1));
        
        for (int i = start; i > 0; i -= threadCount) {
            BigDecimal iBD = BigDecimal.valueOf(i);
            BigDecimal iBD1 = BigDecimal.valueOf(i - 1);
            BigDecimal iBD3 = BigDecimal.valueOf(3).multiply(iBD);
            
            BigDecimal prevNumerator = iBD1.multiply(iBD1).multiply(BigDecimal.valueOf(9)).add(BigDecimal.valueOf(1));
            
            // 3 * i * (3 * i - 1) * (3 * i - 2);
            BigDecimal divisor = iBD3.multiply(iBD3.subtract(BigDecimal.valueOf(1))).multiply(iBD3.subtract(BigDecimal.valueOf(2)));
            result = result.divide(divisor, 10000, RoundingMode.HALF_EVEN)
                                     .add(prevNumerator);
        }
        return result;
    }
    
    public static void main(String[] args) {
        int threadCount = 3;
        int precision = 6;
        
        ExecutorService executorService = Executors.newFixedThreadPool(threadCount);
        ArrayList<Future<BigDecimal> > futures = new ArrayList<Future<BigDecimal> >();
        for (int i = 0; i < threadCount; i++) {
            int start = precision - i;
            System.out.println(start);
            final int id = i + 1;
            futures.add(executorService.submit(() -> partialCalculator(start, threadCount, id)));
        }
        BigDecimal result = BigDecimal.ZERO;
        try {
            for (int i = 0; i < threadCount; i++) {
                result = result.add(futures.get(i).get());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        
        executorService.shutdown();
        System.out.println(result);
    }

Seems to be working properly for 1 thread but messes up the calculation for multiple.

Ivelin Dinev
  • 51
  • 2
  • 8
  • The best way to speed up this process would be to write it in another language (C or C++). Also see [Java Multithreading Performance](https://stackoverflow.com/questions/15292626/does-multi-threading-improve-performance-how) and [Calculate Pi with BigDecimal](https://stackoverflow.com/questions/25964754/calculating-pi-with-bigdecimal) – fuggerjaki61 Jun 21 '20 at 15:10
  • @fuggerjaki61 What I meant is that it is not supposed to become slower with more threads up until the CPU core count if each one has enough work to do. Am I mistaken? Please correct me if I am wrong. – Ivelin Dinev Jun 21 '20 at 15:19
  • How are you splitting the task among the threads? – Joni Jun 21 '20 at 15:47
  • @Joni each task has a threadRemainder and calculates only those elements from the series whose index % threadCount == threadRemainder. – Ivelin Dinev Jun 21 '20 at 16:09
  • As one would think the threads performance increases linear by each thread. But passing the arguments and communication between the threads takes its time. Over a specific number of threads the performance doesn't increase significantly. The only way to find it out is to test it. Example: 1 Thread - 1000ms; 2 Threads - 500ms; 3 Threads - 300ms; 4 Threads - 250ms – fuggerjaki61 Jun 21 '20 at 17:20
  • I agree, thats partly what I said. I cannot understand your example though. Did you run and get these results? You are not addressing my questions properly. – Ivelin Dinev Jun 21 '20 at 17:23

2 Answers2

1

There is a lot of contention between the threads: they all compete to get a lock on the ECalculator object after every little bit of computation, because of this method:

  public synchronized void addToSum(BigDecimal element) {
    this.sum = this.sum.add(element);
  }

In general having threads compete for frequent access to a common resource leads to bad performance, because you're asking for the operating system to intervene and tell the program which thread can continue. I haven't tested your code to confirm that this is the issue because it's not self-contained.

To fix this, have the threads accumulate their results separately, and merge results after the threads have finished. That is, create a sum variable in ERunnable, and then change the methods:

// ERunnable.run:
this.sum = this.sum.add(new BigDecimal(Math.pow(3 * k, 2) + 1).divide(new BigDecimal(factorial(3 * k)), 100, RoundingMode.HALF_UP));

// TaskRunner.calculate:
for (int i = 0; i < threadCount; i++) {
  threads[i].join();
  eCalc.addToSum(/* recover the sum computed by thread */);
}

By the way would be easier if you used the higher level java.util.concurrent API instead of creating thread objects yourself. You could wrap the computation in a Callable which can return a result.


Q2 How would one go about calculating this big factorials?

Usually you don't. Instead, you reformulate the problem so that it does not involve the direct computation of factorials. One technique is Horner's method.

Q3 The precision parameter that is passed is the amount of elements in the sum that are used. Can I set the BigDecimal scale to be somehow dependent on that precision so I don't hard code it?

Sure why not. You can work out the error bound from the number of elements (it's proportional to the last term in the series) and set the BigDecimal scale to that.

Joni
  • 108,737
  • 14
  • 143
  • 193
  • Please see my edit to the post. I did something like you said. I accumulate results from threads and do not have them to race for saving every element. Although I still do not get improvements. I also do not believe the answer you provided would improve the time. Have you ran it and could confirm? – Ivelin Dinev Jun 21 '20 at 17:36
  • I do not understand what you mean by self-contained. Would you like me to update it with the latest improvements from my side? – Ivelin Dinev Jun 21 '20 at 17:41
  • Also, how would one work out the error bound from the amount of elements. I do not see any link between the error and the last term, not how to calculate the error not having anything to compare it with. Please excuse my newbie questions. – Ivelin Dinev Jun 21 '20 at 17:51
  • Self-contained means without using external dependencies. You seem to be using some Apache libraries. It would also help to reduce the size of the code to the absolute minimum, which also means putting everything in one class. Error analysis is a deep subject, I don't know how deep you want to get. There's a very short paragraph in [wikipedia](https://en.wikipedia.org/wiki/Series_(mathematics)#Evaluation_of_truncation_errors). I think in this case you can prove that for a sufficiently large k the error is bound by the last term you include in the series. – Joni Jun 21 '20 at 21:20
  • I see, well the apache libraries are just there to handle command line arguments. Those values can be just put in instead of the variables. But still let me update the post to be with 1 file only. – Ivelin Dinev Jun 21 '20 at 21:25
  • I updated my code to be in 1 file and ran it in some online playground to confirm it works. The variables threadCount and precision have to be modified instead of using cmd args. – Ivelin Dinev Jun 21 '20 at 21:36
1

After a review of the updated code, I've made the following observations:

First of all, the program runs for a fraction of a second. That means that this is a micro benchmark. Several key features in Java make micro benchmarks difficult to implement reliably. See How do I write a correct micro-benchmark in Java? For example, if the program doesn't run enough repetitions, the "just in time" compiler doesn't have time to kick in to compile it to native code, and you end up benchmarking the intepreter. It seems possible that in your case the JIT compiler takes longer to kick in when there are multiple threads,

As an example, to make your program do more work, I changed the BigDecimal precision from 100 to 10,000 and added a loop around the main method. The execution times were measured as follows:

1 thread:

Calculated in: 2803 milliseconds
Calculated in: 1116 milliseconds
Calculated in: 1040 milliseconds
Calculated in: 1066 milliseconds
Calculated in: 1036 milliseconds

2 threads:

Calculated in: 2354 milliseconds
Calculated in: 856 milliseconds
Calculated in: 624 milliseconds
Calculated in: 659 milliseconds
Calculated in: 664 milliseconds

4 threads:

Calculated in: 1961 milliseconds
Calculated in: 797 milliseconds
Calculated in: 623 milliseconds
Calculated in: 536 milliseconds
Calculated in: 497 milliseconds

The second observation is that there is a significant part of the workload that does not benefit from multiple threads: every thread is computing every factorial. This means the speed-up cannot be linear - as described by Amdahl's law.

So how can we get the result without computing factorials? One way is with Horner's method. As an example, consider the simpler series sum(1/k!) which also conveges to e but a little slower than yours.

Let's say you want to compute sum(1/k!) up to k = 100. With Horner's method you start from the end and extract common factors:

sum(1/k!, k=0..n) = 1/100! + 1/99! + 1/98! + ... + 1/1! + 1/0!
= ((... (((1/100 + 1)/99 + 1)/98 + ...)/2 + 1)/1 + 1

See how you start with 1, divide by 100 and add 1, divide by 99 and add 1, divide by 98 and add 1, and so on? That makes a very simple program:

private static BigDecimal serialHornerMethod() {
    BigDecimal accumulator = BigDecimal.ONE;
    for (int k = 10000; k > 0; k--) {
        BigDecimal divisor = new BigDecimal(k);
        accumulator = accumulator.divide(divisor, 10000, RoundingMode.HALF_EVEN)
                                 .add(BigDecimal.ONE);
    }
    return accumulator;
}

Ok that's a serial method, how do you make it use parallel? Here's an example for two threads: First split the series into even and odd terms:

1/100! + 1/99! + 1/98! + 1/97! + ... + 1/1! + 1/0! =
(1/100! + 1/98! + ...  + 1/0!) + (1/99! + 1/97! + ... + 1/1!)

Then apply Horner's method to both the even and odd terms:

1/100! + 1/98! + 1/96! + ...  + 1/2! + 1/0! = 
((((1/(100*99) + 1)/(98*97) + 1)/(96*95) + ...)/(2*1) + 1

and:

1/99! + 1/97! + 1/95! + ... + 1/3! + 1/1! =
((((1/(99*98) + 1)/(97*96) + 1)/(95*94) + ...)/(3*2) + 1

This is just as easy to implement as the serial method, and you get pretty close to linear speedup going from 1 to 2 threads:

    private static BigDecimal partialHornerMethod(int start) {
        BigDecimal accumulator = BigDecimal.ONE;
        for (int i = start; i > 0; i -= 2) {
            int f = i * (i + 1);
            BigDecimal divisor = new BigDecimal(f);
            accumulator = accumulator.divide(divisor, 10000, RoundingMode.HALF_EVEN)
                                     .add(BigDecimal.ONE);
        }
        return accumulator;
    }

// Usage:

ExecutorService executorService = Executors.newFixedThreadPool(2);
Future<BigDecimal> submit = executorService.submit(() -> partialHornerMethod(10000));
Future<BigDecimal> submit1 = executorService.submit(() -> partialHornerMethod(9999));
BigDecimal result = submit1.get().add(submit.get());
Joni
  • 108,737
  • 14
  • 143
  • 193
  • I see, this is very insightful. Is Horner's method applicable for the original series of the post. I was left with the idea that this method was only for polynomial root finding. – Ivelin Dinev Jun 23 '20 at 14:27
  • So I got the code working with the original series but only for 1 thread. What I do not understand is why `int f = i * (i + 1);` this line of code is not `int f = i * (i - 1);` Since you are starting from the back side. – Ivelin Dinev Jun 23 '20 at 17:33
  • That's a good point, I think `i*(i+1)` actually causes it to add one term too many by accident, so instead of doing 10000 terms it's doing 10001. You could fix that by starting the iteration from `start-1`. The problem with `i*(i-1)` is that with i=1 you get 0. You can make `i*(i-1)` work but it will require adjusting the iteration bounds. – Joni Jun 23 '20 at 21:00