6

Introduction:

Using two identical mergesort algorithms, I tested the execution speed of C++ (using Visual Studios C++ 2010 express) vs Java (using NetBeans 7.0). I conjectured that the C++ execution would be at least slightly faster, but testing revealed that the C++ execution was 4 - 10 times slower than the Java execution. I believe that I have set all the speed optimisations for C++, and I am publishing as a release rather than as a debug. Why is this speed discrepancy occurring?

Code:

Java:

public class PerformanceTest1
{
 /**
  * Sorts the array using a merge sort algorithm
  * @param array The array to be sorted
  * @return The sorted array
  */
 public static void sort(double[] array)
 {
      if(array.length > 1)
      {
           int centre;
           double[] left;
           double[] right;
           int arrayPointer = 0;
           int leftPointer = 0;
           int rightPointer = 0;

           centre = (int)Math.floor((array.length) / 2.0);

           left = new double[centre];
           right = new double[array.length - centre];

           System.arraycopy(array,0,left,0,left.length);
           System.arraycopy(array,centre,right,0,right.length);

           sort(left);
           sort(right);

           while((leftPointer < left.length) && (rightPointer < right.length))
           {
                if(left[leftPointer] <= right[rightPointer])
                {
                     array[arrayPointer] = left[leftPointer];
                     leftPointer += 1;
                }
                else
                {
                     array[arrayPointer] = right[rightPointer];
                     rightPointer += 1;
                }
                arrayPointer += 1;
           }
           if(leftPointer < left.length)
           {
                System.arraycopy(left,leftPointer,array,arrayPointer,array.length - arrayPointer);
           }
           else if(rightPointer < right.length)
           {
                System.arraycopy(right,rightPointer,array,arrayPointer,array.length - arrayPointer);
           }
      }
 }

 public static void main(String args[])
 {
      //Number of elements to sort
      int arraySize = 1000000;

      //Create the variables for timing
      double start;
      double end;
      double duration;

      //Build array
      double[] data = new double[arraySize];
      for(int i = 0;i < data.length;i += 1)
      {
           data[i] = Math.round(Math.random() * 10000);
      }

      //Run performance test
      start = System.nanoTime();
      sort(data);
      end = System.nanoTime();

      //Output performance results
      duration = (end - start) / 1E9;
      System.out.println("Duration: " + duration);
 }
}

C++:

#include <iostream>
#include <windows.h>
using namespace std;

//Mergesort
void sort1(double *data,int size)
{
if(size > 1)
{
    int centre;
    double *left;
    int leftSize;
    double *right;
    int rightSize;
    int dataPointer = 0;
    int leftPointer = 0;
    int rightPointer = 0;

    centre = (int)floor((size) / 2.0);
    leftSize = centre;
    left = new double[leftSize];
    for(int i = 0;i < leftSize;i += 1)
    {
        left[i] = data[i];
    }
    rightSize = size - leftSize;
    right = new double[rightSize];
    for(int i = leftSize;i < size;i += 1)
    {
        right[i - leftSize] = data[i];
    }

    sort1(left,leftSize);
    sort1(right,rightSize);

    while((leftPointer < leftSize) && (rightPointer < rightSize))
    {
        if(left[leftPointer] <= right[rightPointer])
        {
            data[dataPointer] = left[leftPointer];
            leftPointer += 1;
        }
        else
        {
            data[dataPointer] = right[rightPointer];
            rightPointer += 1;
        }
        dataPointer += 1;
    }
    if(leftPointer < leftSize)
    {
        for(int i = dataPointer;i < size;i += 1)
        {
            data[i] = left[leftPointer++];
        }
    }
    else if(rightPointer < rightSize)
    {
        for(int i = dataPointer;i < size;i += 1)
        {
            data[i] = right[rightPointer++];
        }
    }
            delete left;
            delete right;
}
}

void main()
{
//Number of elements to sort
int arraySize = 1000000;

//Create the variables for timing
LARGE_INTEGER start; //Starting time
LARGE_INTEGER end; //Ending time
LARGE_INTEGER freq; //Rate of time update
double duration; //end - start
QueryPerformanceFrequency(&freq); //Determinine the frequency of the performance counter (high precision system timer)

//Build array
double *temp2 = new double[arraySize];
QueryPerformanceCounter(&start);
srand((int)start.QuadPart);
for(int i = 0;i < arraySize;i += 1)
{
    double randVal = rand() % 10000;
    temp2[i] = randVal;
}

//Run performance test
QueryPerformanceCounter(&start);
sort1(temp2,arraySize);
QueryPerformanceCounter(&end);
    delete temp2;

//Output performance test results
duration = (double)(end.QuadPart - start.QuadPart) / (double)(freq.QuadPart);
cout << "Duration: " << duration << endl;

//Dramatic pause
system("pause");
}

Observations:

For 10000 elements, the C++ execution takes roughly 4 times the amount of time as the Java execution. For 100000 elements, the ratio is about 7:1. For 10000000 elements, the ratio is about 10:1. For over 10000000, the Java execution completes, but the C++ execution stalls, and I have to manually kill the process.

SpeedIsGood
  • 187
  • 1
  • 10
  • 43
    I see 3 `new[]` but **no** `delete[]` in your C++ code. You're running out of memory. Learn how to use *both* languages correctly before you want to compare them. – Xeo Jun 21 '11 at 16:07
  • How did you measure the time in each case? How do you relate them? – Nawaz Jun 21 '11 at 16:08
  • 16
    Argh! A `void main`! My eyes! They burn! – Etienne de Martel Jun 21 '11 at 16:09
  • 3
    yah, bad memory management is the answer. – CrazyDart Jun 21 '11 at 16:10
  • 1
    I agree with Xeo. For even better speed, avoid new/delete and switch to an inplace merge. – Peter G. Jun 21 '11 at 16:11
  • 1
    You don't say how you're compiling your C++. If you're compiling your C++ with no optimizations, that's not a fair comparison because Java optimizes. So for visual studio, make sure you're compiling in Release mode. – Chris Eberle Jun 21 '11 at 16:14
  • 1
    That's C code disguised in C++ syntax. @Xeo is right, you do need to learn C++ first. For starters, use `std::sort` and come back reporting. – sbi Jun 21 '11 at 16:16
  • 13
    Not really fare to compare mediocre Java against badly written C++ – Martin York Jun 21 '11 at 16:21
  • @Xeo: That was an error in my post, but I do have memory management. Thanks for pointing it out, though. – SpeedIsGood Jun 21 '11 at 16:33
  • You use `System.arraycopy` in Java but write the loop out longhand in C++ instead of using `memmove()` or `memcpy()`. That gives Java some benefit. – Jonathan Leffler Jun 21 '11 at 16:34
  • @Jonathan Leffler: Good point, I will do that. Thanks. – SpeedIsGood Jun 21 '11 at 16:35
  • @Chris: I mentioned that I was using the speed optimisation (/O2), but I am not very familiar with optimisation, so I was hoping to be enlightened on what more I could do to make it faster. – SpeedIsGood Jun 21 '11 at 16:37
  • @Martin: Yes, that is why I want to learn more c++. – SpeedIsGood Jun 21 '11 at 16:40
  • I still see no `delete[]`. :) – Xeo Jun 21 '11 at 16:53
  • @Peter G.: That's a good idea. Thanks. – SpeedIsGood Jun 21 '11 at 16:53
  • @Xeo: Yes, this seems to be a problem. I used `delete` rather than `delete[]`, as pointed out by Mark B. I will rectify this and get back to you. Thanks. – SpeedIsGood Jun 21 '11 at 16:55
  • Optimizations don't help too much if you're in debug mode. The extra symbols and boundary checks slow things down a lot. Also your memory leak doesn't help. The /O2 is a start, but make sure to put it into release mode. – Chris Eberle Jun 21 '11 at 17:06
  • @Chris The extra symbols don't cost anything, and you can turn both them and the bounds checking off regardless of the mode. More realistically, the "modes" are only default starting points; you create your own modes as you go alone (and they aren't limited to two). – James Kanze Jun 21 '11 at 17:15
  • I run your code and got faster results with C++. I guess you made a mistake and ran the program under debugger: even in release mode it slows down the program a lot! Look at my answer below for the results. – AlefSin Jun 21 '11 at 17:18
  • @James: ok, good point. I'm just after a quick way to get the OP to the point where the code can be tested fairly. There are many ways to skin this cat. – Chris Eberle Jun 21 '11 at 17:22
  • @Chris: Thanks, that seems to be the answer. @AlefSin: That worked! Thanks! – SpeedIsGood Jun 21 '11 at 18:52

8 Answers8

16

I think there might be a mistake in the way you ran the program. When you hit F5 in Visual C++ Express, the program is running under debugger and it will be a LOT slower. In other versions of Visual C++ 2010 (e.g. Ultimate that I use), try hitting CTRL+F5 (i.e. Start without Debugging) or try running the executable file itself (in the Express) and you see the difference.

I run your program with only one modification on my machine (added delete[] left; delete[] right; to get rid of memory leak; otherwise it would ran out of memory in 32 bits mode!). I have an i7 950. To be fair, I also passed the same array to the Arrays.sort() in Java and to the std::sort in C++. I used an array size of 10,000,000.

Here are the results (time in seconds):

Java code:            7.13
Java Arrays.sort:     0.93

32 bits
C++ code:             3.57
C++ std::sort         0.81

64 bits
C++ code:             2.77
C++ std::sort         0.76

So the C++ code is much faster and even the standard library, which is highly tuned for in both Java and C++, tends to show slight advantage for C++.

Edit: I just realized in your original test, you run the C++ code in the debug mode. You should switch to the Release mode AND run it outside the debugger (as I explained in my post) to get a fair result.

AlefSin
  • 1,086
  • 9
  • 20
  • Holy crap! Ctrl + F5 increased the execution speed by over an order of magnitude! Thanks! – SpeedIsGood Jun 21 '11 at 18:49
  • 3
    @SpeedIsGood: I'm glad it helped. Notice the large gap between your code and the library code (both in Java and C++). It means there are lots of things you may do to improve the performance in both languages. I don't know about java Arrays.sort but the std::sort is written entirely in C++ and it is also generic! Some of the ways to improve it have been suggested in other comments. – AlefSin Jun 21 '11 at 19:42
10

I don't program C++ professionally (or even unprofessionally:) but I notice that you are allocating a double on the heap (double *temp2 = new double[arraySize];). This is expensive compared to Java initialisation but more importantly, it constitutes a memory leak since you never delete it, this could explain why your C++ implementation stalls, it's basically run out of memory.

PhilDin
  • 2,802
  • 4
  • 23
  • 38
5

To start with did you try using std::sort (or std::stable_sort which is typically mergesort) to get a baseline performance in C++?

I can't comment on the Java code but for the C++ code:

  • Unlike Java, new in C++ requires manual intervention to free the memory. Every recursion you'll be leaking memory. I would suggest using std::vector instead as it manages all the memory for you AND the iterator, iterator constructor will even do the copy (and possibly optimized better than your for loop`). This is almost certainly the cause of your performance difference.
  • You use arraycopy in Java but don't use the library facility (std::copy) in C++ although again this wouldn't matter if you used vector.
  • Nit: Declare and initialize your variable on the same line, at the point you first need them, not all at the top of the function.
  • If you're allowed to use parts of the standard library, std::merge could replace your merge algorithm.

EDIT: If you really are using say delete left; to cleanup memory that's probably your problem. The correct syntax would be delete [] left; to deallocate an array.

Mark B
  • 95,107
  • 10
  • 109
  • 188
  • 1) I actually seem to have omitted that from my post. It is there in my original code. I just updated my post with the deletes. 2) I will try that. Thanks. 3) Sure. 4) I am not actually doing a merge; I am just testing the execution times of c++ vs Java. – SpeedIsGood Jun 21 '11 at 16:41
  • @SpeedIsGood But see my edit regarding array deletion syntax (not to mention the data copy, etc). – Mark B Jun 21 '11 at 16:43
  • There is an out-of-boundary issue: data[data.size()] – Heng Li Jul 10 '20 at 17:45
4

Your version was leaking so much memory that the timing were meaningless.

I am sure the time was spent thrashing the memory allocator.
Rewrite it to use standard C++ objects for memory management std::vector and see what happens.

Personally I would still expect the Java version to win (just). Because the JIT allows machine specific optimizations and while the C++ can do machine specific optimizations generally it will only do generic architecture optimizations (unless you provide the exact architecture flags).

  • Note: Don't forget to compile with optimizations turned on.

Just cleaning up your C++:
I have not tried to make a good merge sort (just re-written) in a C++ style

void sort1(std::vector<double>& data)
{
    if(data.size() > 1)
    {
        std::size_t         centre    = data.size() / 2;
        std::size_t         lftSize   = centre;
        std::size_t         rhtSize   = data.size() - lftSize;

        // Why are we allocating new arrays here??
        // Is the whole point of the merge sort to do it in place?
        // I forget bbut I think you need to go look at a knuth book.
        //
        std::vector<double> lft(data.begin(),           data.begin() + lftSize);
        std::vector<double> rht(data.begin() + lftSize, data.end());

        sort1(lft);
        sort1(rht);
        std::size_t dataPointer   = 0;
        std::size_t lftPointer    = 0;
        std::size_t rhtPointer    = 0;

        while((lftPointer < lftSize) && (rhtPointer < rhtSize))
        {                                                                               
            data[dataPointer++] = (lft[lftPointer] <= rht[rhtPointer])
                                    ?  lft[lftPointer++]
                                    :  rht[rhtPointer++];
        }
        std::copy(lft.begin() + lftPointer, lft.end(), &data[dataPointer]);
        std::copy(rht.begin() + rhtPointer, rht.end(), &data[dataPointer]);
    }
}

Thinking about merge sort. I would try this:
I have not tested it, so it may not work correctly. Bu it is an attempt to not keep allocating huge amounts of memory to do the sort. instead it uses a single temp area and copies the result back when the sort is done.

void mergeSort(double* begin, double* end, double* tmp)
{
    if (end - begin <= 1)
    {   return;
    }

    std::size_t size    = end - begin;
    double*     middle  = begin +  (size / 2);

    mergeSort(begin, middle, tmp);
    mergeSort(middle, end, tmp);

    double* lft    = begin;
    double* rht    = middle;
    double* dst    = tmp;
    while((lft < middle) && (rht < end))
    {
        *dst++  = (*lft < *rht)
                        ? *lft++
                        : *rht++;
    }
    std::size_t count   = dst - tmp;
    memcpy(begin,          tmp, sizeof(double) * count);
    memcpy(begin + count,  lft, sizeof(double) * (middle - lft));
    memcpy(begin + count,  rht, sizeof(double) * (end    - rht));
}

void sort2(std::vector<double>& data)
{
    double*     left    = &data[0];
    double*     right   = &data[data.size()];

    std::vector<double> tmp(data.size());

    mergeSort(left,right, &tmp[0]);
}
Community
  • 1
  • 1
Martin York
  • 257,169
  • 86
  • 333
  • 562
  • Thanks, I'll test this out. However, I am not very familiar with optimisations (or most of c++, as you can guess). I think I have enabled the `/O2` optimisation in Visual Studios c++. What more can I do to optimise my code? – SpeedIsGood Jun 21 '11 at 17:01
  • I have a question regarding vectors. Being a total c++ newbie, I sought to consult a number of sources regarding execution speed, and one of them informed me that since c++ apparently cannot handle genericity as well as Java, code using vectors will not execute as fast as code using double pointers. Is this true? Thanks. – SpeedIsGood Jun 21 '11 at 19:20
  • 3
    @SpeedlsGood: in the release mode with any modern compiler, the speed to access data through a double* or std::vector will be exactly the same. Notice that Java Generics and C++ Template are very different. Templates offer much more than Generics (e.g. meta-programming and expression templates) which makes them ideal for building high performance numerical code. – AlefSin Jun 21 '11 at 19:40
  • @AlefSin: Thanks, that's really useful to know. Vectors are really useful. – SpeedIsGood Jun 21 '11 at 21:03
2

A couple of things.

Java is highly optimized and after the code has executed once the JIT compiler then executes the code as native.

Your System.arraycopy in Java is going to execute much faster than simply copying each element one at a time. try replacing this copy with a memcpy and you will see that it is much faster.

EDIT: Look at this post: C++ performance vs. Java/C#

Community
  • 1
  • 1
Romain Hippeau
  • 24,113
  • 5
  • 60
  • 79
  • 3
    Several people have mentioned using `memcpy` or something similar; if the array is of a fundamental type, I would expect a good C++ compiler to optimize the loop into something at least as good. – James Kanze Jun 21 '11 at 17:17
  • The linked thread is slightly misleading since although JITters can *theoretically* produce faster code than static compilers this simply isn’t the case today, at least not across representable problem sets. In any case, it *isn’t* the cause for the performance in this particular case, not even partially. – Konrad Rudolph Jun 22 '11 at 19:35
  • @Konrad: JIT compilers can produce faster code than static compilers if the target architecture is not precisely known at compilation or if it does not do alias analysis well (e.g. HotSpot vs GNU GCC on LU task from SciMark2). – J D Sep 05 '11 at 20:37
  • @Jon If the architecture isn’t known then you need to compile your C++ code on site anyway, and thus can take advantage of profile-guided optimisation. This is completely on par with the optimisations performed by HotSpot, with the difference that no JIT has to run in the background so if anything it will always be *faster* (assuming the profile data is representative). Almost any information you read on the ’net about JITting is outdated because it doesn’t take profile-guided optimisation into account since that’s only been “recently” available in Microsoft’s C++ compiler. – Konrad Rudolph Sep 05 '11 at 20:50
  • @Konrad: "If the architecture isn’t known then you need to compile your C++ code on site anyway". In practice, the architecture is usually not *precisely* known so C/C++ are compiled to a lowest common denominator like generic x86 and not optimized for AMD Barcelona or Intel Nehalem. In contrast, JITs optimize for the current machine. – J D Sep 06 '11 at 10:39
  • @Jon The lowest common denominator is fine for most purposes. But *if* you need the speed to such an extend that the HotSpot optimisation makes a difference, then you’d surely compile for each target platform separately. – Konrad Rudolph Sep 06 '11 at 10:47
1

It is hard to tell from just looking at your code, but I would hazard a guess that the reason is in the handling recursion rather than actual computations. Try using some sorting algorithm that relies on the iteration instead of the recursion and share the results of the performance comparison.

Olaf
  • 6,249
  • 1
  • 19
  • 37
0

I don't know why Java is so much faster here.

I compared it with the built in Arrays.sort() and it was 4x faster again. (It doesn't create any objects).

Usually if there is a test where Java is much faster its because Java is much better at removing code which doesn't do anything.

Perhaps you could use memcpy rather than a loop at the end.

Peter Lawrey
  • 525,659
  • 79
  • 751
  • 1,130
0

Try to make a global vector as a buffer, and try not to allocate a lot of memory. This will run faster than your code, because if uses some tricks(uses only one buffer and the memory is allocated when the program starts, so the memory will not be fragmented):

#include <cstdio>
#define N 500001

int a[N];
int x[N];
int n;

void merge (int a[], int l, int r)
{
    int m = (l + r) / 2;
    int i, j, k = l - 1;
    for (i = l, j = m + 1; i <= m && j <= r;)
        if (a[i] < a[j])
            x[++k] = a[i++];
        else
            x[++k] = a[j++];
    for (; i <= m; ++i)
        x[++k] = a[i];
    for (; j <= r; ++j)
        x[++k] = a[j];
    for (i = l; i <= r; ++i)
        a[i] = x[i];
}

void mergeSort (int a[], int l, int r)
{
    if (l >= r)
        return;
    int m = (l + r) / 2;
    mergeSort (a, l, m);
    mergeSort (a, m + 1, r);
    merge (a, l, r);
}

int main ()
{
    int i;
    freopen ("algsort.in", "r", stdin);
    freopen ("algsort.out", "w", stdout);
    scanf ("%d\n", &n);
    for (i = 1; i <= n; ++i)
        scanf ("%d ", &a[i]);
    mergeSort (a, 1, n);
    for (i = 1; i <= n; ++i)
        printf ("%d ", a[i]);
    return 0;
}
SerbanLupu
  • 427
  • 2
  • 5
  • 9