Odd performance issue with nested for loops

Question

Below is the full source code you can just copy paste into Visual Studio for easy repro.

#include <Windows.h>

#include <algorithm>
#include <vector>
#include <iostream>
#include <sstream>

LARGE_INTEGER gFreq;

struct CProfileData;

// Yes, we map the pointer itself not the string, for performance reasons
std::vector<CProfileData*> gProfileData;

// simulate a draw buffer access to avoid CBlock::Draw being optimized away
float gDrawBuffer = 0;

struct CTimer
{
    CTimer()
    {
        Reset();
    }

    size_t GetElapsedMicro()
    {
        LARGE_INTEGER now;
        ::QueryPerformanceCounter(&now);
        return (1000000 * (now.QuadPart - m_timer.QuadPart)) / gFreq.QuadPart;
    }

    inline void Reset()
    {
        ::QueryPerformanceCounter(&m_timer);
    }

    LARGE_INTEGER m_timer;
};

struct CProfileData
{
    CProfileData() : m_hitCount(0), m_totalTime(0), m_minTime(-1),
        m_maxTime(0), m_name(NULL)
    {
        gProfileData.push_back(this);
    }

    size_t m_totalTime;
    size_t m_minTime;
    size_t m_maxTime;
    size_t m_hitCount;
    const char * m_name;
};

class CSimpleProfiler
{
public:
    CSimpleProfiler(const char * aLocationName, CProfileData * aData)
        : m_location(aLocationName), m_data(aData)
    {
        ::QueryPerformanceCounter(&m_clock);
    }

    ~CSimpleProfiler()
    {
        CProfileData & data = *m_data;
        data.m_name = m_location;
        ++data.m_hitCount;


        LARGE_INTEGER now;
        ::QueryPerformanceCounter(&now);

        size_t elapsed = (1000000 * (now.QuadPart - m_clock.QuadPart)) / gFreq.QuadPart;
        data.m_totalTime += elapsed;

        elapsed < data.m_minTime ? data.m_minTime = elapsed : true;
        elapsed > data.m_maxTime ? data.m_maxTime = elapsed : true;
    }

    static void PrintAll()
    {
        std::stringstream str;
        str.width(20);
        str << "Location";
        str.width(15);
        str << "Total time";
        str.width(15);
        str << "Average time";
        str.width(15);
        str << "Hit count";
        str.width(15);
        str << "Min";
        str.width(15);
        str << "Max" << std::endl;

        ::OutputDebugStringA(str.str().c_str());

        for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
        {
            CProfileData & data = **i;
            std::stringstream str;
            str.width(20);
            str << data.m_name;
            str.width(15);
            str << data.m_totalTime;
            str.width(15);
            str << data.m_totalTime / (float)data.m_hitCount;
            str.width(15);
            str << data.m_hitCount;
            str.width(15);
            str << data.m_minTime;
            str.width(15);
            str << data.m_maxTime << std::endl;

            ::OutputDebugStringA(str.str().c_str());
        }
    }

    static void Clear()
    {
        for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
        {
            (*i)->m_totalTime = 0;
            (*i)->m_minTime = 0;
            (*i)->m_maxTime = 0;
            (*i)->m_hitCount = 0;
        }
    }

private:
    LARGE_INTEGER m_clock;
    const char * m_location;
    CProfileData * m_data;
};


#define PROFILING_ENABLED

#ifdef PROFILING_ENABLED
#define SIMPLE_PROFILE \
    static CProfileData pdata ## __LINE__; \
    CSimpleProfiler p ## __LINE__(__FUNCTION__, & pdata ## __LINE__)

#define SIMPLE_PROFILE_WITH_NAME(Name) \
    static CProfileData pdata ## __LINE__; \
    CSimpleProfiler p ## __LINE__(Name, & pdata ## __LINE__)
#else
#define SIMPLE_PROFILE __noop
#define SIMPLE_PROFILE_WITH_NAME(Name) __noop
#endif


void InvalidateL1Cache()
{
    const int size = 256 * 1024; 
    static char *c = (char *)malloc(size);
    for (int i = 0; i < 0x0fff; i++)
        for (int j = 0; j < size; j++)
            c[j] = i*j;
}

int _tmain(int argc, _TCHAR* argv[])
{
    ::QueryPerformanceFrequency(&gFreq);

    LARGE_INTEGER pc;
    ::QueryPerformanceCounter(&pc);

    struct CBlock
    {
        float x;
        float y;

        void Draw(float aBlend)
        {   
            for (size_t i = 0; i < 100; ++i )
                gDrawBuffer += aBlend;
        }
    };


    typedef std::vector<std::vector<CBlock>> Layer;
    typedef std::vector<Layer> Layers;
    Layers mBlocks;

    // populate with dummy data;
    mBlocks.push_back(Layer());
    Layer & layer = mBlocks.back();
    layer.resize(109);

    srand(0); // for reprodicibility (determinism)
    for (auto i = layer.begin(); i != layer.end(); ++i)
    {
        i->resize(25 + rand() % 10 - 5);
    }

    // end populating dummy data

    while (1)
    {
        CSimpleProfiler::Clear();

        float aBlend = 1.f / (rand() % 100);
        {
            for (auto i = mBlocks.begin(); i != mBlocks.end(); ++i)
            {
                for (auto j = i->begin(); j != i->end(); ++j)
                {

                    CTimer t;

                    {
                        SIMPLE_PROFILE_WITH_NAME("Main_Draw_3");
                        for (auto blockIt = j->begin(); blockIt != j->end();)
                        {
                            CBlock * b = nullptr;
                            {
                                b = &*blockIt;
                            }
                        {
                            b->Draw(aBlend);
                        }
                        {
                            ++blockIt;
                        }
                        }
                    }

                    if (t.GetElapsedMicro() > 1000)
                    {
                        ::OutputDebugStringA("SLOWDOWN!\n");
                        CSimpleProfiler::PrintAll();
                    }
                }
            }
        }
    }

    return 0;
}

I get the following profiling from time to time, expressed in microseconds:

SLOWDOWN!
            Location     Total time   Average time      Hit count            Min            Max
         Main_Draw_3           2047        36.5536             56              0           1040

This spikes from time to time. Normally, it takes 100 microseconds for Main_Draw_3 block to finish, but it spikes to 1000 (the Max column) from time to time. What causes this?

I'm aware cache misses could play a role, but is it really that in this case?... What is happening here and how can I mitigate this?

More info:

compiler VS 2013, compiled with Maximize Speed (/O2)

Absolutely reproducible performance measurement sample. Bravo! Rarely seen a better one :-P ... — πάντα ῥεῖ, Oct 14 '14 at 21:23
@πάνταῥεῖ not sure what you mean, but thanks, I guess? — Ed Rowlett-Barbu, Oct 14 '14 at 21:25
_"but thanks, I guess?"_ Sarcasm, is one of the less appreciated things usually, though one of the less understood as well unfortunately. — πάντα ῥεῖ, Oct 14 '14 at 21:27
Take a look at the HUGE variance in Main_Draw_4, that is off not 3. We don't know what Draw_4 is doing; and depending on what he does performance could be all over the place. — UpAndAdam, Oct 14 '14 at 21:29
@UpAndAdam if you look carefully, `Main_Draw_4` measures the draw AND the `increment`. Just the draw is illustrated first in the table and it only accounts for 142 microseconds — Ed Rowlett-Barbu, Oct 14 '14 at 21:31
Do you need to be using vectors? Maybe using plain 3D arrays would provide more consistent performance? — Lee Jacobs, Oct 14 '14 at 21:32
Honestly, I have no idea what your simple profile macro does.. so maybe that's incorrect; but he's the one with the variance, not draw3 — UpAndAdam, Oct 14 '14 at 21:32
@Zadirion - Posts that basically say "why is my code slow/fast?" should be accompanied with compiler and options used to build the test. You didn't state what compiler you're using, whether you are timing an optimized build, etc. — PaulMcKenzie, Oct 14 '14 at 21:33
@LeeJacobs What might be better is if he allocates ONE continuous blob of memory that contains ALL of the data instead of arrays of arrays or vectors of vectors. you can set up the pointers to present the same, but the prefetching would likely be both better and more consistent. — UpAndAdam, Oct 14 '14 at 21:35
@UpAndAdam definitely no resizes. The code just iterates over the structure. I will post the compiler and settings in the question. — Ed Rowlett-Barbu, Oct 14 '14 at 21:35
What other processes are on the box? Have you pinned this task to a specific core without interuption from others? If you get swapped out all your numbers will go to crap. — UpAndAdam, Oct 14 '14 at 21:37
@UpAndAdam: I think you meant context switch, which has to do with sharing the CPU, and not "swapped", which has to do with sharing RAM. — Ben Voigt, Oct 14 '14 at 21:39
@UpAndAdam: Also, a 1D vector is contiguous like a 1D array, but nested vectors are not contiguous while nested arrays are. — Ben Voigt, Oct 14 '14 at 21:40
@BenVoigt Swapped out as in the swap out is likely to occur if another task comes in and trashes all of memory. If you have no control then 'anything can happen'. When I care about performance I isolate my cpu's from the OS's. — UpAndAdam, Oct 14 '14 at 21:42
@BenVoigt How is a nested array guaranteed to be contiguous? — UpAndAdam, Oct 14 '14 at 21:43
@UpAndAdam thanks Adam. How do I prevent the OS from context switching my process with another task like this? This is really hurting the stability of my framerates (it's a game engine) — Ed Rowlett-Barbu, Oct 14 '14 at 21:44
Honestly, I don't know on windows :-) On linux where I work I always isolate CPU's from the OS, and use taskset and system calls to programmatically bind to a given core. Given that you care for windows specifically you might want to add a windows tag. I also dont know if isolation will be as possible for a game engine; in my case it's for HPC so we control any all software on our hardware. You obviously won't always have that ability as easily. — UpAndAdam, Oct 14 '14 at 21:46
@UpAndAdam: Because arrays contain their elements. Now, if you mean an array of pointers to other arrays, that would be non-contiguous like the vector case. But Lee said "plain 3D array", which means he's talking about `T[m][n][k]` -- no pointers in sight, just contiguously placed elements. Remember sizeof an array is length * sizeof element... so the 3D array has sizeof m*n*k*sizeof element, which is contiguous storage for all the data. — Ben Voigt, Oct 14 '14 at 21:49
This might help : http://superuser.com/questions/349591/how-can-i-set-the-default-affinity-for-any-process-in-windows — UpAndAdam, Oct 14 '14 at 21:51
@Zadirion: You can increase thread priority, and you can 'pin' to one core using affinity, but that just keeps your thread from running on other cores, it doesn't reserve that core for you. — Ben Voigt, Oct 14 '14 at 21:51
@UpAndAdam: SU solution is likely less than ideal, affinity can be set with a single function call, since he's writing the code. The SuperUser answer has to deal with setting affinity of some other process without source code, which is much harder. — Ben Voigt, Oct 14 '14 at 21:52
@BenVoigt I understand what you said and now that you have provided a definition of a 'plain 3D array' I removed my comment. If you read what I said it shuold be clear from what I said that I had no idea what he meant by 'plain 3D array'. http://stackoverflow.com/questions/17584215/how-does-c-allocate-space-for-a-2d-3d-array-when-using-malloc In the end you are arguing about english and not the actual code — UpAndAdam, Oct 14 '14 at 22:01
Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/63064/discussion-between-upandadam-and-ben-voigt). — UpAndAdam, Oct 14 '14 at 22:02
Hit count for the first two for loops is one. That means the matrix is 1x1xN right? So I don't think being a 3d matrix is the issue here. — Neil Kirk, Oct 14 '14 at 22:19
@NeilKirk It's 1xMxN. You looked at Main_Draw which is outside the for loop. This makes it a 2D matrix. — Ed Rowlett-Barbu, Oct 14 '14 at 23:13
We need a *complete* compilable but *minimal* example to reproduce this. — Flexo, Oct 15 '14 at 07:53
I strongly recommend reading this: http://www.akkadia.org/drepper/cpumemory.pdf Not only because it has very good chance to help in this situation... — pasztorpisti, Oct 15 '14 at 09:59
I have added a full minimal reproducible case, including a profiler. — Ed Rowlett-Barbu, Oct 15 '14 at 12:49

score 0 · Answer 1 · answered Oct 14 '14 at 21:31

0

I think there might be two issues:

Are you compiling with optimizations on? What are the flags?
Maybe you could increase the sample size (by doing for instance ten (or hundred, or thousand etc) runs of this code in one profiling run). The reason is that if the sample size is small, the standard deviation is very high

answered Oct 14 '14 at 21:31

lisu

2,213
14
22

Compiled in VS 2013 with `Maximize Speed (/O2)`. The same sample was done on larger sets as well (about 2000 elements) and the results are the same. Majority of iterations are fast, but once in a while one of them spikes at around 15,000 microseconds – Ed Rowlett-Barbu Oct 14 '14 at 21:33
2000 elements is not a lot of elements. Try doing at least a million, and repeat the data collection multiple times. :-p – UpAndAdam Oct 14 '14 at 21:36
2

@Zadirion: "around 15,000 microseconds" is the default quantum of the system scheduler. A context switch caused your thread to stop running so another piece of code (that's been waiting its turn patiently) could have the processor for 15ms. – Ben Voigt Oct 14 '14 at 21:38
@BenVoigt you were thinking the same thing I just was – UpAndAdam Oct 14 '14 at 21:39
You are 100% sure Draw() also doesn't trigger anything in different thread? – lisu Oct 14 '14 at 21:47
I'd like to try to reproduce it, but I need the whole code, or at least Draw() function implementation. – lisu Oct 15 '14 at 07:53

Odd performance issue with nested for loops

1 Answers1