
I have implemented a dynamic array, and there is a hot loop that spends a lot of time pushing elements onto it.

The dynamic array works by storing a header at the start of the array of values:

typedef struct
{
    unsigned char* begin; // Pointer to first element (should just point past the end of this struct, since the header is stored immediately before the data)
    unsigned char* end; // Pointer to one past the last element
    size_t typeSize; // Size of the type that the array stores
    size_t capacity; // Capacity of the array (when size ((end - begin) / typeSize) reaches capacity, realloc)
} dynamicArr;

For some reason the loop runs faster when I compare sizeof item to typeSize, and it's by a large margin (a 30% increase) compared to when I don't make the comparison.

Note: the comparison is only there to safeguard against adding an item of a different type and causing misalignment in the array. This should never happen if the dynamic array is used properly to store a single type, so in practice the condition should always evaluate to true.
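For example (purely to illustrate what the guard protects against, not something my code actually does), pushing a double into an int array would just be skipped instead of corrupting the layout:

dynamicArr_Create(int, intArr);
dynamicArr_Push(intArr, 10.0); // sizeof(double) != sizeof(int), so the guard makes this a no-op
dynamicArr_Push(intArr, 10);   // sizeof(int) matches typeSize, so this push goes through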

Here's the code for adding an element to the list:

if (arr)
{
    dynamicArr* header = dynamicArr_Header(arr);
    if (header->typeSize && header->typeSize == sizeof item) //If I remove "header->typeSize == sizeof item" performance decreases.
    {
        size_t arrSize = dynamicArr_Size(header);
        if (arrSize == header->capacity)
        {
            size_t newCapacity = (size_t)(header->capacity * 1.5f);
            if (newCapacity == header->capacity) ++newCapacity;
            void* tmp = realloc(header, sizeof(dynamicArr) + header->typeSize * newCapacity);
            if (tmp)
            {
                dynamicArr_Construct(header, tmp, newCapacity, arrSize, header->typeSize);
                *((void**)&(arr)) = header->begin;
                arr[arrSize] = item;
                header->end += header->typeSize;
            }
            else 
            { 
                free(header); 
                arr = NULL; 
            }
        }
        else
        {
            arr[arrSize] = item;
            header->end += header->typeSize;
        }
    }
}

I don't understand why this is the case. I'm not very good at reading assembly either; from what I can see the two versions are very different, so if someone could help me out here, that would be much appreciated!

(Compiled in MSVC with /O2 and /Tc)

Link to the assembly and the rest of the relevant code.

Edit 1: A lot of people seem to think that the reason is that sizeof item is simply evaluated at compile time. I didn't think that was the case, because if I remove the condition and replace all instances of header->typeSize with sizeof item, the performance is still worse than with the if condition in place. => It turns out I had missed changing the use of header->typeSize inside the dynamicArr_Size helper, which caused this confusion; refer to the marked answer.

Here is the full code:

typedef struct
{
    unsigned char* begin; // Pointer to data
    unsigned char* end; // Pointer to one past the last element
    size_t typeSize; // Size of type
    size_t capacity; // Capacity of array (not number of elements in array)
} dynamicArr;

#define dynamicArr_ConstructBase(dest, src, newCapacity) dest = src; dest->capacity = newCapacity; dest->begin = (unsigned char*)dest + sizeof *dest
#define dynamicArr_Construct(dest, src, newCapacity, currSize, typeSize) dynamicArr_ConstructBase(dest, src, newCapacity); dest->end = dest->begin + typeSize * (currSize)

#define dynamicArr_Header(arr) ((dynamicArr*)((unsigned char*)(arr) - sizeof(dynamicArr)))
static inline size_t dynamicArr_Size(dynamicArr* arr)
{
    return (arr->end - arr->begin) / arr->typeSize;
}

#define dynamicArr_Create(typename, arr) typename* arr = (typename*)dynamicArr_Create_(sizeof(typename))
static inline unsigned char* dynamicArr_Create_(size_t typeSize)
{
    dynamicArr* dynArr;
    void* tmp = malloc(sizeof * dynArr + typeSize * 10);
    if (!tmp) return NULL;

    dynArr = tmp;
    dynArr->begin = (unsigned char*)dynArr + sizeof * dynArr;
    dynArr->end = dynArr->begin;
    dynArr->capacity = 10;
    dynArr->typeSize = typeSize;

    return dynArr->begin;
}

#define dynamicArr_Free(arr) free(dynamicArr_Header(arr))

#define dynamicArr_Push(arr, item) \
do {\
if (arr) \
{ \
    dynamicArr* header = dynamicArr_Header(arr); \
    if (header->typeSize && header->typeSize == sizeof item) \
    { \
        size_t arrSize = dynamicArr_Size(header); \
        if (arrSize == header->capacity) \
        { \
            size_t newCapacity = (size_t)(header->capacity * 1.5f); \
            if (newCapacity == header->capacity) ++newCapacity; \
            void* tmp = realloc(header, sizeof(dynamicArr) + header->typeSize * newCapacity); \
            if (tmp) \
            { \
                dynamicArr_Construct(header, tmp, newCapacity, arrSize, header->typeSize); \
                *((void**)&(arr)) = header->begin; \
                arr[arrSize] = item; \
                header->end += header->typeSize; \
            } \
            else  \
            {  \
                free(header);  \
                arr = NULL;  \
            } \
        } \
        else \
        { \
            arr[arrSize] = item; \
            header->end += header->typeSize; \
        } \
    } \
} \
} while(0)

And example use:

void Func()
{
    dynamicArr_Create(int, intArr);
    dynamicArr_Push(intArr, 10);
    printf("%i\n", intArr[0]);
    dynamicArr_Free(intArr);
}

As for a simple test for profiling:

int main()
{
    dynamicArr_Create(int, intArr);

    clock_t begin = clock();

    for (int i = 0; i < 1000000000; ++i)
    {
        dynamicArr_Push(intArr, 10);
    }

    clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("%f\n", time_spent);

    dynamicArr_Free(intArr);
}

Compiling in release mode in Visual Studio 2019 on Windows with /Tc, I get these results:

  • With header->typeSize == sizeof item => 3.65 seconds
  • Without header->typeSize == sizeof item => 9.374 seconds
  • Replacing header->typeSize with sizeof item and removing header->typeSize == sizeof item => 9.302 seconds

I repeated the test 10 times and the results were consistent with the above.

  • With the test, if `header->typesize` is sometimes different from `sizeof item`, then you get to skip a big chunk of code and save the time that it would take to execute it. – Nate Eldredge Jul 10 '21 at 02:43
  • And even if you don't skip it, every occurrence of `header->typeSize` inside the `if` body can be optimized into the constant `sizeof item`. – Nate Eldredge Jul 10 '21 at 02:44
  • @NateEldredge In my use case the `header->typesize` will always be equal to `sizeof item`. It's purely there to prevent misalignment when adding items (since the dynamic array should only store one type). I'll edit the question to clarify this. – name Jul 10 '21 at 02:44
  • In particular, multiplying by a constant is usually a lot faster than multiplying by a variable. Especially if that constant is a power of 2. – Nate Eldredge Jul 10 '21 at 02:45
  • You can reason this out. When does the `sizeof` comparison make a difference? Well, if the comparison is true, then it will function the same as if it weren't there. But if the comparison is false, then the entire block of code will be skipped when it otherwise would have been executed. So it's running less code. Less code = improved speed. The more of your loop you can skip, the faster your code will run. – Tom Karzes Jul 10 '21 at 02:46
  • Since C doesn't have any kind of generics, I'm a little curious about `item`... What is `item`? How is it declared? How is it even used (besides getting its size)? – Some programmer dude Jul 10 '21 at 02:48
  • @Someprogrammerdude item is just the item being added to the dynamic array. The link to the assembly should also show the code if you're curious about how it would work. – name Jul 10 '21 at 02:49
  • Dereferencing `header` after it's been realloced looks really wrong. Unless `dynamicArr_construct` is somehow a macro that assigns `tmp` to `header`. – Nate Eldredge Jul 10 '21 at 02:51
  • I was afraid it might have been a pointer. Since you use macros it isn't. By the way, that link was impossible to use on a phone. – Some programmer dude Jul 10 '21 at 02:53
  • @NateEldredge I'm not really convinced since if I change all the `header->typesize` to `sizeof item` or even to a constant (hard coding the size of item) I still get the same performance as without the comparison. With the comparison it is still faster. – name Jul 10 '21 at 02:54
  • @NateEldredge Yes, dynamicArr_Construct is a macro that assigns tmp to header. – name Jul 10 '21 at 02:54
  • @Someprogrammerdude That's fine haha. – name Jul 10 '21 at 02:55
  • You can see in the codegen that with the test against `sizeof item`, the compiler can assume inside the branch that `sizeof item == 4`, so it can replace an expensive `div r8` with a very cheap `shr rdi, 2`. – Raymond Chen Jul 10 '21 at 05:30
  • @RaymondChen Could you explain what those instructions do? I'm not very good with assembly. Also, surely in the case that I replace all the instances of `header->typeSize` with `sizeof item` and remove the comparison it should also make that same assumption? But when I do that the performance is worse unless I add back the if condition. Why is the if condition so important? – name Jul 10 '21 at 05:48
  • Can you provide the (complete) code of the functions used, like `dynamicArr_getHeader`, `dynamicArr_getSize`, `dynamicArr_construct`, and the type of `arr` and `item` (or the prototype of the encompassing function)? This is important as they can strongly impact performance too (regarding the conditional), especially if they are inlined by the compiler. – Jérôme Richard Jul 10 '21 at 07:20
  • If performance is a concern, you should probably replace `size_t newCapacity = (size_t)(header->capacity * 1.5f);` with something like `size_t newCapacity = header->capacity + header->capacity / 2;` or `size_t newCapacity = 2 * header->capacity;`. The floating-point calculation is likely a lot more expensive than a simple bit shift, or a bit shift and add, which is what the integer calculations will likely be optimized into. The floating-point calculation probably can't be optimized into a bit shift and add because that could produce a different value. – Andrew Henle Jul 10 '21 at 12:03
  • @JérômeRichard I have now provided the rest of the relevant code. It should also be on the link provided to the assembly. – name Jul 10 '21 at 14:12
  • @AndrewHenle That is true, I only used 1.5 as the growth rate since it allows for a higher chance `realloc` is able to reuse memory. Refer to [this](https://stackoverflow.com/questions/1100311/what-is-the-ideal-growth-rate-for-a-dynamically-allocated-array) question. – name Jul 10 '21 at 14:13
  • @name That falls apart with `glibc` because once you reach the `mmap()` threshold the memory is directly allocated with `mmap()`'d blocks and there's no "hole" left behind to be reused. Never mind that it assumes the size of your allocations is random. If the bulk of your memory allocations are powers of two, then using powers of two for size reduces the chances of heap memory being wasted. The entire theory of the "best growth factor" also always starts with, "Assume we have a stupid memory allocator...". TL;DR: There's waaay too much overthinking of performance corner cases in computer science... – Andrew Henle Jul 10 '21 at 14:40
  • @AndrewHenle I see, thanks for the extra info haha. Profiling it again, the difference between the 1.5 and 2 is not a large margin (as expected tbh) compared with the comparison vs no comparison. – name Jul 10 '21 at 14:43
  • @name Well, once you hit the `mmap()` threshold in `glibc`, I strongly suspect the bulk of your time will be spent page-faulting when you have to do a `realloc()`. The difference between calculating `1.5f * size` instead of `size + size/2` isn't even noise when you have to page-fault a few pages of virtual memory. – Andrew Henle Jul 10 '21 at 14:47
  • @AndrewHenle That's very true, still doesn't help explain why having a comparison doubles the speed though xD. – name Jul 10 '21 at 14:50

3 Answers


This is pretty straightforward constant propagation, which enables the compiler to use more efficient instructions.

Since the value of sizeof is a constant known at compile time, in the version with the comparison the compiler knows the value of typeSize inside the branch and can therefore use more efficient instructions. Here, for example, is the computation of dynamicArr_Size(header), which is to say (header->end - header->begin) / header->typeSize:

--- without sizeof
+++ with sizeof
-        mov     rax, QWORD PTR [rbx+8]
-        xor     edx, edx
-        sub     rax, QWORD PTR [rbx]
-        div     r8
+        mov     rdi, QWORD PTR [rbx+8]
+        sub     rdi, QWORD PTR [rbx]
+        shr     rdi, 2

In the version without sizeof, the compiler has to treat the element size as an unknown and use the actual divide instruction (which also requires zeroing the rdx register, as the instruction takes a 128-bit dividend in the rdx:rax register pair). When the size is known, the compiler can substitute a much faster bit shift instead and avoid touching rdx. This is probably the most impactful difference, as the divide instruction tends to be incredibly slow relative to other arithmetic; most compilers, whenever they have a chance (when the divisor is constant), will instead use bit shifts or wrap-around multiplication tricks just to avoid using the division instruction. (Matt Godbolt has a whole section about division in his talk about compiler optimisations.) More efficient use of registers also frees up registers to be used in other places and may prevent spilling values into memory (though in this case it doesn’t seem to make much of a difference).
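To see this in isolation, here is a minimal sketch (the struct is copied from the question; the two function names are made up for this example) of the two situations the compiler faces:

#include <stddef.h>

typedef struct
{
    unsigned char* begin;
    unsigned char* end;
    size_t typeSize;
    size_t capacity;
} dynamicArr;

// Divisor is only known at run time: the compiler has to emit a real div.
static size_t size_runtime(const dynamicArr* a)
{
    return (size_t)(a->end - a->begin) / a->typeSize;
}

// Divisor is the compile-time constant 4: the compiler can emit shr rdi, 2 instead.
static size_t size_constant(const dynamicArr* a)
{
    return (size_t)(a->end - a->begin) / sizeof(int);
}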

Another example, here’s how sizeof(dynamicArr) + header->typeSize * newCapacity is computed:

--- without sizeof
+++ with sizeof
-        mov     rdx, rsi
-        imul    rdx, r8
-        add     rdx, 32
+        lea     rdx, QWORD PTR [rsi*4+32]

In the version without the comparison against sizeof, the compiler has to treat header->typeSize as an unknown and use a generic multiplication instruction. When the size is known, it can instead use a lea instruction with an addressing mode that computes the whole value in a single instruction. Though generic multiplication is not as slow as division, a bit shift will still beat it. And even if lea were not necessarily faster in itself, the higher code density lets more code fit into the instruction cache, avoiding slowdowns due to cache misses.
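The same idea, reduced to a sketch that continues the one above (again, the function names are made up): with a constant element size the whole expression collapses into one lea.

// Element size unknown at compile time: imul followed by add.
static size_t allocSize_runtime(size_t typeSize, size_t newCapacity)
{
    return sizeof(dynamicArr) + typeSize * newCapacity;
}

// Element size is the constant 4: a single lea rdx, [newCapacity*4 + 32].
static size_t allocSize_constant(size_t newCapacity)
{
    return sizeof(dynamicArr) + sizeof(int) * newCapacity;
}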

Finally, you claimed that

A lot of people seem to think that the reason is because sizeof item is simply evaluated at compile time. I don't think that this is the case because if I remove the condition and replace all instances of header->typeSize with sizeof item the performance is still worse than if the if condition was there.

I assume you have only replaced direct uses of the field inside the macro and not the indirect use inside the dynamicArr_Size utility function. When you replace that use as well, the generated code is nearly identical, modulo label names and the immediate value in the if condition check. With comparison against sizeof, the compiler simply does that automatically while inlining that function.
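For reference, this is roughly the change that experiment needs: the indirect use inside the size helper has to become a constant too (the _WithConstSize name is made up for this sketch).

// Once the divisor here is a compile-time constant as well,
// the inlined code matches the version guarded by the sizeof comparison.
static inline size_t dynamicArr_Size_WithConstSize(dynamicArr* arr)
{
    return (size_t)(arr->end - arr->begin) / sizeof(int);
}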

user3840170
  • Thanks for explaining everything so clearly, I profiled it again to check and this does indeed seem to be the case. `I assume you have only replaced direct uses of the field inside the macro and not the indirect use inside the dynamicArr_Size utility function.` - Yup this was indeed what had happened haha, I ended up confusing myself and possibly others with this. Didn't expect such a large difference in performance from using different instructions! – name Jul 14 '21 at 19:34

The main thing to observe here is that dynamicArr_Push is a macro, not a function.

if (header->typeSize && header->typeSize == sizeof item) \

This line hints to the compiler that only items of size typeSize will ever be inserted, so it can pre-calculate the maximum size you will ever need inside the for-loop (1000000000 * sizeof(item)) and eliminate the repeated memory allocations by allocating one big chunk on the first call.

for (int i = 0; i < 1000000000; ++i)
{
    dynamicArr_Push(intArr, 10);
}

To test this, change the initial capacity inside dynamicArr_Create_ (the dynArr->capacity = 10; line) to 1000000000 and run the test again with the check header->typeSize == sizeof item removed. You should observe a similar performance improvement as with the size-check version. Alternatively, use an strace-like tool to count the calls to realloc with and without the size check; they should be almost completely eliminated in the size-check version.
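A minimal way to run that experiment is a constructor variant that takes the initial capacity (the function name here is made up; also note that 1000000000 ints is roughly 4 GB, so the malloc may simply fail):

static inline unsigned char* dynamicArr_CreateWithCapacity_(size_t typeSize, size_t capacity)
{
    dynamicArr* dynArr;
    void* tmp = malloc(sizeof *dynArr + typeSize * capacity);
    if (!tmp) return NULL;

    dynArr = tmp;
    dynArr->begin = (unsigned char*)dynArr + sizeof *dynArr;
    dynArr->end = dynArr->begin;
    dynArr->capacity = capacity;
    dynArr->typeSize = typeSize;

    return dynArr->begin;
}

// Example: int* intArr = (int*)dynamicArr_CreateWithCapacity_(sizeof(int), 1000000000u);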

Fractal

sizeof is an operator that is almost always evaluated at compile time (VLAs being the exception).

vtorri
  • I don't think this is the reason, considering that if I remove the `if` condition and replace `header->typeSize` with `sizeof item` the performance is still worse unless the `if` condition is there. – name Jul 10 '21 at 13:57