I have a dynamic array and there is a hot loop that spends a lot of time adding a lot of elements to the dynamic array I have implemented.
The dynamic array works by storing a header at the start of the array of values:
typedef struct
{
unsigned char* begin; // Pointer to first element (should just point to end of struct, the header is stored prior data)
unsigned char* end; // Pointer to last element
size_t typeSize; // Size of type that the array stores
size_t capacity; // capacity of array (when size ((end - begin) / typeSize) exceeds capacity, realloc)
} dynamicArr;
For some reason the loop runs faster when I compare the sizeof item
to the typeSize
, and its by a large margin (30% increase) as to when I don't make the comparison.
Note, the comparison is only there to safe keep against adding an item of different type and causing misalignment in the array. This should not happen anyway if the dynamic array is used properly to store 1 type and thus in practice should always evaluate to true.
Here's the code for adding an element to the list:
if (arr)
{
dynamicArr* header = dynamicArr_Header(arr);
if (header->typeSize && header->typeSize == sizeof item) //If I remove "header->typeSize == sizeof item" performance decreases.
{
size_t arrSize = dynamicArr_Size(header);
if (arrSize == header->capacity)
{
size_t newCapacity = (size_t)(header->capacity * 1.5f);
if (newCapacity == header->capacity) ++newCapacity;
void* tmp = realloc(header, sizeof(dynamicArr) + header->typeSize * newCapacity);
if (tmp)
{
dynamicArr_Construct(header, tmp, newCapacity, arrSize, header->typeSize);
*((void**)&(arr)) = header->begin;
arr[arrSize] = item;
header->end += header->typeSize;
}
else
{
free(header);
arr = NULL;
}
}
else
{
arr[arrSize] = item;
header->end += header->typeSize;
}
}
}
I don't understand why this was the case. I'm not very good at reading assembly either, from what I can see though they are very different, so if someone could help me out here that would be much appreciated!
(Compiled in MSVC with /O2 and /Tc)
Link to the assembly and the rest of the relevant code.
Edit 1:
A lot of people seem to think that the reason is because => I seem to have missed changing the use of sizeof item
is simply evaluated at compile time. I don't think that this is the case because if I remove the condition and replace all instances of header->typeSize
with sizeof item
the performance is still worse than if the if
condition was there.header->typeSize
in the macro dynamicArr_Size
which caused this confusion, refer to the marked answer.
Here is the full code:
typedef struct
{
unsigned char* begin; // Pointer to data
unsigned char* end; // Pointer to last element
size_t typeSize; // Size of type
size_t capacity; // Capacity of array (not number of elements in array)
} dynamicArr;
#define dynamicArr_ConstructBase(dest, src, newCapacity) dest = src; dest->capacity = newCapacity; dest->begin = (unsigned char*)dest + sizeof *dest
#define dynamicArr_Construct(dest, src, newCapacity, currSize, typeSize) dynamicArr_ConstructBase(dest, src, newCapacity); dest->end = dest->begin + typeSize * (currSize)
#define dynamicArr_Header(arr) ((dynamicArr*)((unsigned char*)(arr) - sizeof(dynamicArr)))
static inline size_t dynamicArr_Size(dynamicArr* arr)
{
return (arr->end - arr->begin) / arr->typeSize;
}
#define dynamicArr_Create(typename, arr) typename* arr = (typename*)dynamicArr_Create_(sizeof(typename))
static inline unsigned char* dynamicArr_Create_(size_t typeSize)
{
dynamicArr* dynArr;
void* tmp = malloc(sizeof * dynArr + typeSize * 10);
if (!tmp) return NULL;
dynArr = tmp;
dynArr->begin = (unsigned char*)dynArr + sizeof * dynArr;
dynArr->end = dynArr->begin;
dynArr->capacity = 10;
dynArr->typeSize = typeSize;
return dynArr->begin;
}
#define dynamicArr_Free(arr) free(dynamicArr_Header(arr))
#define dynamicArr_Push(arr, item) \
do {\
if (arr) \
{ \
dynamicArr* header = dynamicArr_Header(arr); \
if (header->typeSize && header->typeSize == sizeof item) \
{ \
size_t arrSize = dynamicArr_Size(header); \
if (arrSize == header->capacity) \
{ \
size_t newCapacity = (size_t)(header->capacity * 1.5f); \
if (newCapacity == header->capacity) ++newCapacity; \
void* tmp = realloc(header, sizeof(dynamicArr) + header->typeSize * newCapacity); \
if (tmp) \
{ \
dynamicArr_Construct(header, tmp, newCapacity, arrSize, header->typeSize); \
*((void**)&(arr)) = header->begin; \
arr[arrSize] = item; \
header->end += header->typeSize; \
} \
else \
{ \
free(header); \
arr = NULL; \
} \
} \
else \
{ \
arr[arrSize] = item; \
header->end += header->typeSize; \
} \
} \
} \
} while(0)
And example use:
void Func()
{
dynamicArr_Create(int, intArr);
dynamicArr_Push(intArr, 10);
printf("%i\n", intArr[0]);
dynamicArr_Free(intArr);
}
As for a simple test for profiling:
int main()
{
dynamicArr_Create(int, intArr);
clock_t begin = clock();
for (int i = 0; i < 1000000000; ++i)
{
dynamicArr_Push(intArr, 10);
}
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("%f\n", time_spent);
dynamicArr_Free(intArr);
}
Compiling in release mode in Visual Studio 2019 on windows using /Tc I get the results:
- With
header->typeSize == sizeof item
=> 3.65 seconds - Without
header->typeSize == sizeof item
=> 9.374 seconds - Replacing
header->typeSize
withsizeof item
and removingheader->typeSize == sizeof item
=> 9.302 seconds
I repeated the test 10 times and it was consistent to the results above.