3

I was making a simple homework in which I had to develop a software in C to find the two nearest points between 100 of them.

When I finished, I was curious to see how much time it would take to run it with a lot more points and with full VC++ optimizations enabled. I tried with 10000 and it took about 8~9 seconds. Then I was curious to see how much time C# and Java would take to do the same thing. As expected, C# took a little longer, 9~10 seconds; Java, however, took only ~400 milliseconds! Why does this happen?!

This is my code in C, C# and Java:

C:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <Windows.h>

long perfFrequency = 0;

typedef struct
{
    double X;
    double Y;
} Point;

double distance(Point p1, Point p2)
{
    return sqrt(pow(p1.X - p2.X, 2) + pow(p1.Y - p2.Y, 2));
}

double smallerDistance(Point *points, int size, Point *smallerA, Point  *smallerB)
{
    int i, j;
    double smaller = distance(points[0], points[1]);

    for (i = 0; i < size; i++)
    {
        for (j = i + 1; j < size; j++)
        {
            double dist = distance(points[i], points[j]);
            if (dist < smaller)
            {
                smaller= dist;
                *smallerA = points[i];
                *smallerB = points[j];
            }
        }
    }
    return smaller;
}

void main()
{
    // read size and points from file.
    int size;
    Point *points= (Point *)malloc(size * sizeof(Point));

    // just to make sure everything is ready before the benchmark begins    
    system("pause");

    Point smallerA, smallerB;
    if (!QueryPerformanceFrequency((LARGE_INTEGER *)&perfFrequency))
        printf("Couldn't query performance frequency.");

    long long start, end;   
    double smaller;
    QueryPerformanceCounter((LARGE_INTEGER *)&start);

    smaller= smallerDistance(points, size, &smallerA, &smallerB);

    QueryPerformanceCounter((LARGE_INTEGER *)&end);

    printf("The smaller distance is: %lf. The coordinates of the most close points are: (%lf, %lf) and (%lf, %lf). Time taken: %lfms\n",
        smaller, smallerA.X, smallerA.Y, smallerB.X, smallerB.Y, (end - start) * 1000.0 / perfFrequency);

}

C#:

using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Diagnostics;

namespace StructuredTest
{
    struct Point
    {
        public double X;
        public double Y;
    }

    class Program
    {
        static double Distance(Point p1, Point p2)
        {
            return Math.Sqrt(Math.Pow(p1.X - p2.X, 2) + Math.Pow(p1.Y - p2.Y, 2));
        }

        static double SmallerDistance(Point[] points, int size, out Point smallerA, out Point smallerB)
        {
            int i, j;
            double smaller = Distance(points[0], points[1]);
            smallerA = default(Point);
            smallerB = default(Point);

            for (i = 0; i < size; i++)
            {
                for (j = i + 1; j < size; j++)
                {
                    double dist = Distance(points[i], points[j]);
                    if (dist < smaller)
                    {
                        smaller = dist;
                        smallerA = points[i];
                        smallerB = points[j];
                    }
                }
            }

            return smaller;
        }

        static void Main(string[] args)
        {
            // read size and points from file 
            int size = int.Parse(file[0]);
            Point[] points= new Point[size];                   

            // make sure everything is ready
            Console.WriteLine("Press any key to continue...");
            Console.ReadKey(true);

            Point smallerA, smallerB;
            double smaller;

            Stopwatch sw = new Stopwatch();
            sw.Restart();

            smaller = SmallerDistance(points, size, out smallerA, out smallerB);

            sw.Stop();

            Console.WriteLine($"The smaller distance is: {smaller}. The coordinates of the most close points are: ({smallerA.X}, {smallerA.Y}) and " +
                $"({smallerB.X}, {smallerB.Y}). Time taken: {sw.ElapsedMilliseconds}ms.");

        }
    }
}

Java:

package structuredtest;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

class Point {

    public Point(double X, double Y) {
        this.X = X;
        this.Y = Y;
    }

    double X;
    double Y;
}

class Result {

    double distance;
    Point p1;
    Point p2;
}

public class StructuredTest {

    static double distance(Point p1, Point p2) {
        return Math.sqrt(Math.pow(p1.X - p2.X, 2) + Math.pow(p1.Y - p2.Y, 2));
    }

    static Result smallerDistance(Point[] points, int size) {
        int i, j;
        double smaller = distance(points[0], points[1]);
        Result r = new Result();

        for (i = 0; i < size; i++) {
            for (j = i + 1; j < size; j++) {
                double dist = distance(points[i], points[j]);
                if (dist < smaller) {
                    smaller = dist;
                    r.p1 = points[i];
                    r.p2 = points[j];
                }
            }
        }

        r.distance = smaller;
        return r;
    }

    public static void main(String[] args) throws IOException {
        // read size and points from file
        int size = Integer.parseInt(file[0]);
        Point[] points = new Point[size];

        // make sure everything is ready    
        System.out.println("Press any key to continue...");
        System.in.read();

        double start = System.nanoTime(), end;

        Result r = smallerDistance(points, size);

        end = System.nanoTime();

        System.out.println("The smaller distance is: " + r.distance + ". The most close points are: ("
                + r.p1.X + "," + r.p1.Y + ") and " + r.p2.X + "," + r.p2.Y + "). Time taken: " + (end - start) / 1000000 + "ms.");

    }

}

If java had beaten both C and C# by a small margin I wouldn't be surprised, but 20 times faster?!

The file is in the following format:

3 // number of points in the file. Note that there no comments in the actual file
(3.7098722472288, 4.49056397953787) // point (X,Y)
(8.90232811621332, 9.67982769279173)
(5.68254334818822, 1.71918922506136)
(6.22585901842366, 9.51660500242835)

Funny thing: At first, the file with 10000 points that I mentioned earlier, which I was using to benchmark, was actually just a 100 times copy paste of another file with 100 random points. Like this:

(Point 1)
(Point 2)
(Point 3)
(Point 1)
(Point 2)
(Point 3)

I thought that there was no need of generating 10000 random points because as the code has to run through all of the numbers anyway, it would make little difference (only more assignments). But then I decided to generate 10000 random points to see how they would react: both C and C# still ran in about the same time (~50ms increase); Java, on the other hand, had a increase of ~500ms.

Also, I think it is worth noting that java takes about 11 seconds when running inside NetBeans (even in "Run" mode, not "Debug").

And I also tried compiling as C++ instead of C, but it made no difference.

I'm using VS 2015 for both C and C#.

These are the settings for each language:

C:

x64
Optimization: Maximize Speed (/O2)
Intrinsic Functions: Yes (/Oi)
Favor Size or Speed: Favor fast code (/Ot)
Enable Fiber-Safe Optimizations: Yes (/GT)
Security Check: Disable Security Check (/GS-)
Floating point model: Fast (/fp:fast)
Everything else: default

C#:

x64
Release Mode
Optimize Code: Enabled
Check for arithmetic overflow: Disabled
.NET 4.5.2 

Java:

JRE/JDK 1.8
Default settings (if there are any)

EDIT:

Okay, I re-did the tests, following the suggestions:

First off, I used the Result class/struct in both C and C#. The reason I used it in java but not in C/C# is because java can't pass by reference. Second, I now repeat the test in the main() function. And thanks @Tony D for catching that bug! :)

I won't post the code because the changes are minor: simply implement exactly the java version in the other tests, that's what I did.

This time I tested with only 7000 Points (not 10000) and with only 30 iterations, because it is taking a long time to test and its quite late here.

The results didn't change much: C# took 5228ms in average, C 4424ms and Java 223ms. Java still wins by being 20 or more times faster.

Then I tried removing the calls to Math.Pow (simply changing for ((p1.X - p2.X) * (p1.X - p2.X)) + ((p1.Y - p2.Y) * (p1.Y - p2.Y))), then everything changed. The new results:

Java: 220ms average

C#: 195ms average

C: 195ms average

If I only checked that before :p

As I commented, I thought about doing that, but then decided that it would be better to test each compiler's ability to inline functions and optimize this kind of simple calls. However, when I obtained those strange results, I should've gone back and done this, but I got so nervous that I forgot about doing it.

Anyway, to be really honest, I'm surprised that the Java compiler was able to completely optimize that line of code while C#'s and C++'s weren't. Although I know about the corner case checks and the internal calls on C#, I find it really interesting that the Java compiler was able to notice that no corner-case checks were necessary in that code whatsoever.

andre_ss6
  • 1,195
  • 1
  • 13
  • 34
  • in the C++ version, try making the smallerA and smallerB holders be pointers, rather than making copies of the current smallest point, then copy at the end. Or follow the method used by java to assign to a local result variable and return the value. I suspect you're spending a lot of time copying data in the C++ and C# version, and not in the java version – Russ Schultz Oct 30 '15 at 03:56
  • 1
    Perhaps Java is optimising the `Math.pow(p1.X - p2.X, 2)` call to a multiplication: you could explicitly change it to a multiplication in the C version and see if that accounts for the performance difference. `QueryPerformanceCounter` is also famously unreliable, depending on your exact OS version and hardware, though if your machine is otherwise idle and it doesn't flip the program across cores you shouldn't be affected by the bugs. – Tony Delroy Oct 30 '15 at 03:56
  • 1
    Why does the java version have a `Result` class, but the C# and C++ versions do not? Are you sure you are comparing apples to apples? Is the output from these 3 programs equivalent? – Ron Beyer Oct 30 '15 at 03:57
  • 2
    Is this after the obligatory 40 sec warmup? – Martin James Oct 30 '15 at 03:58
  • @TonyD I thought about that when making the benchmark, but then I decided that the compiler's abilities to inline functions should also be evaluated. Should have done it anyway after obtaining these absurd results. And about the `QueryPerformanceCount`, I don't think it is a reliability problem: the C version is obviously slower, and I even checked against the clock to see if it was really taking 9 seconds. – andre_ss6 Oct 30 '15 at 03:58
  • Java version look incomplete, `int size = Integer.parseInt(file[0]);` file not defined, may be in java version you not read or init input data? – fghj Oct 30 '15 at 04:00
  • @RussSchultz I've tried that in the C# version, didn't make any difference. I'll try it again though, both in C# and C. RonBeyer Because Java can't pass by reference; I'll change it anyway though, I got so nervous with the result that I forgot to make everything absolutely equal. I'm sorry, I'll fix it right now and post the results. And yes, all of the three outputs the same thing. – andre_ss6 Oct 30 '15 at 04:00
  • @user1034749 I've deleted everything about file reading, as that occurs only on application start. The file is never touched after the timer starts (or the timestamp is acquired, in the case of C and java). – andre_ss6 Oct 30 '15 at 04:02
  • Also, for the C# version, run the test more than once to allow JIT to properly optimize and emit machine code. For all the tests, I'd take an average of 100 runs (you are saying it takes max 10 seconds, so this would take about 17 minutes to run). – Ron Beyer Oct 30 '15 at 04:04
  • @RonBeyer Yes, I'm aware of that and I ran all three multiple times. – andre_ss6 Oct 30 '15 at 04:06
  • I mean put a loop in main, not run the program multiple times, the main methods you have above don't reflect that, and JIT doesn't save across application runs in C# (not sure about Java). – Ron Beyer Oct 30 '15 at 04:09
  • 2
    I've copied the program into VC++2012 - for 1000 points initialised with random numbers, it takes ~6.8ms on this PC. For 10k points, ~738ms. Release mode build with no care taken over other optimisation settings. (You have a bug: if not other points are closer together than points[0] and points[1], you never set `*smallerA` and `*smallerB`). Avoiding pow in favour of manual multiplication: ~700ms. – Tony Delroy Oct 30 '15 at 04:22
  • 2
    In fiddling with things, that which made the largest difference was changing `sqrt(pow(p1.X - p2.X, 2) + pow(p1.Y - p2.Y, 2))` to `sqrt((p1.X - p2.X)*(p1.X - p2.X) + (p1.Y - p2.Y)*(p1.Y - p2.Y));` made it take 1/10 the time. (10000 random points, 4500mS down to 322mS) this is with 2015 community edition – Russ Schultz Oct 30 '15 at 04:24
  • I did a similar test recently with sequential/threaded sorts. Comparing C++ (g++ -O3), VC++(Full opts), and Java. For insertion sorts <= 300k, Java was ~2x faster across all categories of various # threads. About 2x slower with quick sort. Including time to merge threads. – ChiefTwoPencils Oct 30 '15 at 04:27
  • It's illegal C. It may compile, it's all undefined behavior. The speed is just as undefined. Try [this](https://godbolt.org/z/1zn4Wb) instead. Should be pretty fast. Far from the fastest. This is keeping the data model you picked, and using managed memory. – viraltaco_ Sep 01 '20 at 19:43

1 Answers1

10

As explained here and checked here, in pure C there is no overload with integer power, like this one:

double pow(double base, int exponent );

It means that when you call pow in C, it is processed in a way similar to:

double pow(double base, double exponent) {
    return exp(log(base) * exponent);
}

Also there should be some check for the case of negative base and integer power, which is handled in special way. It is not a good idea to add conditions like if (exponent == 1.0) and if (exponent == 2.0) here, because it would slow down mathematical code that really uses pow for proper purposes. As a result, squaring becomes slower in twelve times or something like that.

In principle, the only reasonable way to optimize pow(x, 2) to x * x is to make compiler recognize such antipatterns and generate special code for them. It just happened so that your C and C# compilers with your settings are not able to do it, while Java compiler can do it. Note that it has nothing to do with inlining capabilities: just inlining exp(log(a) * b) would not make this code any faster.

As a conclusion, I'd like to note that no programmer good in writing mathematical code would ever write pow(x, 2) in his code.

Community
  • 1
  • 1
stgatilov
  • 5,333
  • 31
  • 54
  • 1
    Also, note that using `x*x` allows it [to be used in a constexpr](http://stackoverflow.com/a/25170731/1708801) in C++11 which is not true to `cmath` libraries. – Shafik Yaghmour Oct 30 '15 at 09:25