3

Why is the first piece of code much slower than the second?

public long Gen(int mod)
{
    long a, b = 0x7fffffffffffffff - mod;

    do
    {
        a = (long)(Gen() >> 1);
    } while (a > b);

    return a % mod;
}

public long Gen(int mod)
{
    long a, b = 0x7fffffffffffffff - mod;

    do
    {
        a = (long)(Gen() >> 1);
    } while (a > b);

    return a % 12345;
}

The gen function is a 64 bit unsigned PRNG (see below).

The problem is that the first piece of code is so much slower, that using a variable to calculate the modulus basically adds 3x the amount of time it takes to calculate just the random numbers! To add to the mystery, when you remove the loop, and calculate the modulus using a variable, the speed is similar to the second piece of code.

Something weird is going on around here, because you can't tell me that a modulus using a variable is several times as slow as this:

public ulong Gen()
{
    counter = (counter + 1) & 3;

    if (counter == 0)
    {
        state[0]++;

        ulong x0 = state[0];
        ulong x1 = state[1];
        ulong x2 = state[2];
        ulong x3 = state[3];

        for (int i = 0; i < 2; i++)
        {
            x0 += x1; x1 ^= ((x0 << 32) | (x0 >> (64 - 32)));
            x1 += x0; x0 ^= ((x1 << 32) | (x1 >> (64 - 32)));
            x2 += x3; x3 ^= ((x2 << 32) | (x2 >> (64 - 32)));
            x3 += x2; x2 ^= ((x3 << 32) | (x3 >> (64 - 32)));

            x0 += x2; x2 ^= ((x0 << 27) | (x0 >> (64 - 27)));
            x2 += x0; x0 ^= ((x2 << 27) | (x2 >> (64 - 27)));
            x1 += x3; x3 ^= ((x1 << 27) | (x1 >> (64 - 27)));
            x3 += x1; x1 ^= ((x3 << 27) | (x3 >> (64 - 27)));

            x0 += x3; x3 ^= ((x0 << 11) | (x0 >> (64 - 11)));
            x3 += x0; x0 ^= ((x3 << 11) | (x3 >> (64 - 11)));
            x1 += x2; x2 ^= ((x1 << 11) | (x1 >> (64 - 11)));
            x2 += x1; x1 ^= ((x2 << 11) | (x2 >> (64 - 11)));
        }

        block[0] = x0;
        block[1] = x1;
        block[2] = x2;
        block[3] = x3;
    }

    return block[counter];
}

Minimal reproducible version as requested:

using System;
using System.Diagnostics;

class Program
{
    static void Main(string[] args)
    {
        Stopwatch sw = new Stopwatch();
        Arx rng = new Arx();
        long a = 0;

        // constant = fast

        sw.Start();
        for (int i = 0; i < 10000000; i++)
        {
            a += rng.GenConstant(123);
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
        Console.WriteLine("{0:x16}", a);
        sw.Reset();

        // no loop = fast

        sw.Start();
        for (int i = 0; i < 10000000; i++)
        {
            a += rng.GenNoLoop(123);
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
        Console.WriteLine("{0:x16}", a);
        sw.Reset();

        // modulus variable = slow

        sw.Start();
        for (int i = 0; i < 10000000; i++)
        {
            a += rng.GenVariable(123);
        }
        sw.Stop();
        Console.WriteLine(sw.ElapsedMilliseconds);
        Console.WriteLine("{0:x16}", a);
        sw.Reset();
    }
}

class Arx
{
    static public ulong[] state = new ulong[4];
    static public ulong[] outBlock = new ulong[4];

    static int counter = -1;

    public Arx(ulong seed = 0)
    {
        if (seed == 0)
            state[1] = (ulong)DateTime.UtcNow.Ticks;

        else
            state[1] = seed;
    }

    public ulong Gen()
    {
        counter = (counter + 1) & 3;

        if (counter == 0)
        {
            state[0]++;

            ulong x0 = state[0];
            ulong x1 = state[1];
            ulong x2 = state[2];
            ulong x3 = state[3];

            for (int i = 0; i < 2; i++)
            {
                x0 += x1; x1 ^= ((x0 << 32) | (x0 >> (64 - 32)));
                x1 += x0; x0 ^= ((x1 << 32) | (x1 >> (64 - 32)));
                x2 += x3; x3 ^= ((x2 << 32) | (x2 >> (64 - 32)));
                x3 += x2; x2 ^= ((x3 << 32) | (x3 >> (64 - 32)));

                x0 += x2; x2 ^= ((x0 << 27) | (x0 >> (64 - 27)));
                x2 += x0; x0 ^= ((x2 << 27) | (x2 >> (64 - 27)));
                x1 += x3; x3 ^= ((x1 << 27) | (x1 >> (64 - 27)));
                x3 += x1; x1 ^= ((x3 << 27) | (x3 >> (64 - 27)));

                x0 += x3; x3 ^= ((x0 << 11) | (x0 >> (64 - 11)));
                x3 += x0; x0 ^= ((x3 << 11) | (x3 >> (64 - 11)));
                x1 += x2; x2 ^= ((x1 << 11) | (x1 >> (64 - 11)));
                x2 += x1; x1 ^= ((x2 << 11) | (x2 >> (64 - 11)));
            }

            outBlock[0] = x0;
            outBlock[1] = x1;
            outBlock[2] = x2;
            outBlock[3] = x3;
        }

        return outBlock[counter];
    }

    public long GenConstant(int mod)
    {
        long a, b = 0x7fffffffffffffff - mod;

        do
        {
            a = (long)(Gen() >> 1);
        } while (a > b);

        return a % 12345;
    }

    public long GenVariable(int mod)
    {
        long a, b = 0x7fffffffffffffff - mod;

        do
        {
            a = (long)(Gen() >> 1);
        } while (a > b);

        return a % mod;
    }

    public long GenNoLoop(int mod)
    {
        long a = (long)(Gen() >> 1);

        return a % mod;
    }
}
Thorham
  • 447
  • 3
  • 11
  • 2
    Can you show your measurement code please? – Enigmativity Jun 22 '19 at 01:14
  • does the cpu have to execute more instructions each clock cycle for loading a variable to calculate the modulus versus a constant? – JamieT Jun 22 '19 at 01:25
  • @Jamie Taylor Sangerman - Probably not relevant here, because when the loop is removed the speed becomes similar to the code with the constant. – Thorham Jun 22 '19 at 01:28
  • @Thorham indeed strange – JamieT Jun 22 '19 at 01:30
  • 3
    64bit modulo-by-variable is quite slow (modulo-by-constant is optimized), but that loop mystery is mysterious – harold Jun 22 '19 at 01:31
  • @Thorham - Can you measure both functions within the same test code? You should run multiple iterations, interleaving between both functions, and then compute a average of each? Then, after that, swap the order in which you call both functions and run it again. – Enigmativity Jun 22 '19 at 01:36
  • Where does `Arx256` come from? – Enigmativity Jun 22 '19 at 01:37
  • @Thorham - And what's the `Gen()` function? You haven't provided that. You need to provide a [mcve] for us to be able to help. – Enigmativity Jun 22 '19 at 01:38
  • @Enigmativity - It's my own design. Of course it's not cryptographic. – Thorham Jun 22 '19 at 01:57
  • 1
    @Enigmativity - Minimal version added. – Thorham Jun 22 '19 at 01:57
  • @Thorham - Do be careful using `sw.ElapsedMilliseconds` - it's the millisecond component of the `TimeSpan`, not the total milliseconds. If, for example, your `TimeSpan` was `1.001` **seconds** then `sw.ElapsedMilliseconds` would be only `1` and not `1001`. To get the full milliseconds always use `sw.Elapsed.TotalMilliseconds`. – Enigmativity Jun 22 '19 at 05:43
  • @Enigmativity - Strange, for me sw.ElapsedMilliseconds works fine for more than a second. Did they change the property perhaps? – Thorham Jun 22 '19 at 08:59
  • Division is an extremely slow operation. Division and modulo by a compile-time constant is always faster because they can be converted into a multiplication which is much faster. See [Why does GCC use multiplication by a strange number in implementing integer division?](https://stackoverflow.com/q/41183935/995714) – phuclv Jun 22 '19 at 11:29
  • @phuclv - Indeed. I really didn't expect that. On the old Motorola 68030 a 32 bit unsigned division with 32 bit remainder is 78 cycles. You'd expect current and recent cpus to be able to do that in at least ten times fewer cycles, but no, my Amd A4 3400 needs from 24 to 87 cycles for 64 bit divisions. – Thorham Jun 22 '19 at 13:31
  • division is highly serial. You can't do it in parallel like other arithmetic operations, so it's almost impossible to make them fast. Even on modern Intel or AMD CPUs they may take more than a hundred cycles – phuclv Jun 22 '19 at 13:55
  • @phuclv - Luckily, as it turns out, you don't divisions to generate random numbers in a range. Found that out today. – Thorham Jun 22 '19 at 14:02

2 Answers2

1

This is an optimizer issue.

First no doubt that using a variable is slower than using a constant because loading a variable needs more time.

But when you remove the loop part, the methods become simple and the optimizer makes them inline. And notice when the method is inline the expression a % mod from rng.GenNoLoop(123) can be recognized as constant. So they are identical now.

To restore the unoptimized state, you need pass a real variable to GenNoLoop.

static int mod = 123;

static void Main(string[] args)
{
    rng.GenNoLoop(mod);
}

Another choice is force the method no inline

[MethodImpl(MethodImplOptions.NoInlining)]
public long GenNoLoop(int mod)
shingo
  • 18,436
  • 5
  • 23
  • 42
  • 2
    Loading a variable isn't much of a problem, it's the 64bit `idiv` that takes a lot of time (it is not used for modulo-by-constant) – harold Jun 22 '19 at 05:12
  • @shingo - You're right. Tried it by providing a random modulo with System.Random, and the no loop version is now slow as well. – Thorham Jun 22 '19 at 06:09
  • @harold - That doesn't explain why it takes as much time as it does, though. The number of milliseconds is almost five times as much as the whole Gen() function! It's only 103 ms for Gen() and 492 ms for GenVariable(). Seems VERY strange that those divisions are that slow. – Thorham Jun 22 '19 at 06:15
  • @@harold - Actually, you're right. The divisions are indeed slow. Checked the cycle times for divisions on my Amd A4 3400, and for 64 bit they range from 24 to 87 cycles. – Thorham Jun 22 '19 at 06:30
0

I used this code to test the speed of the two methods:

void Main()
{
    Stopwatch sw = new Stopwatch();
    var ts1 = TimeSpan.Zero;
    var ts2 = TimeSpan.Zero;

    Arx rng = new Arx();

    for (var x = 0; x < 1000; x++)
    {
        long a = 0;
        sw.Start();
        for (int i = 0; i < 100000; i++)
        {
            a += rng.GenVariable(123);
        }
        sw.Stop();
        ts1 += sw.Elapsed;
        sw.Reset();

        a = 0;
        sw.Start();
        for (int i = 0; i < 100000; i++)
        {
            a += rng.GenConstant(123);
        }
        sw.Stop();
        ts2 += sw.Elapsed;
        sw.Reset();     
    }

    ts1.TotalMilliseconds.Dump();
    ts2.TotalMilliseconds.Dump();
}

I got 2890.5391 & 2805.8824 milliseconds out respectively. The variable version is only 3% slower. There's not a big difference.

Enigmativity
  • 113,464
  • 11
  • 89
  • 172
  • Make susre you're building release builds for x64 with optimizations on. If I use debug build I get 1117, 1094 and 1128 ms for the code I posted. With release build for x64 and optimizations I get 156, 126, 467 ms. Huge difference. – Thorham Jun 22 '19 at 06:03
  • @Thorham - You're right. On 32-bit it doesn't matter much. On 64-bit the release version is twice as fast for the constant over variable implementations. We need Eric Lippert. – Enigmativity Jun 22 '19 at 06:38