Here's some Win32 code to profile various versions of the algorithm (compiled using VS2010 Express using default release build):-
#include <windows.h>
#include <stdlib.h>
#include <stdio.h>
// Number of ints in the benchmark buffer (0x1D4C00 = 1,920,000).
const size_t
size = 0x1D4C00;
// Buffer that Test() fills and hands to each variant. It is 16-byte aligned
// because the inline-assembly variants access it with movdqa.
_declspec(align(16)) int
g_array [size];
// Four copies of 123456, 16-byte aligned so the inline-assembly variants can
// load it as a single 128-bit vector with movdqa.
_declspec(align(16)) int
_vec4_123456 [] = { 123456, 123456, 123456, 123456 };
/*
 * Benchmark harness for one variant.
 *
 * Fills g_array with random 0/1 values, runs fn over the whole array and
 * prints the elapsed QueryPerformanceCounter ticks (raw ticks, not converted
 * to seconds).
 *
 * fn   - variant to time; called as fn(size, g_array).
 * test - label printed in front of the result; never modified.
 */
void Test (void (*fn) (size_t, int *), const char *test)
{
    LARGE_INTEGER start;
    LARGE_INTEGER end;

    printf ("Executing test: %s\t", test);

    /* Fresh 0/1 input for every run so each variant sees the same kind of data. */
    for (size_t i = 0; i < size; ++i) {
        g_array[i] = rand() & 1;
    }

    QueryPerformanceCounter (&start);
    fn (size, g_array);
    QueryPerformanceCounter (&end);

    /*
     * Arguments are cast to types that match the conversion specifiers:
     * size_t is not guaranteed to be unsigned int, and the tick delta is a
     * 64-bit value that must not be truncated to int before printing.
     */
    printf ("size: %lu\t count: %09lld\n",
            (unsigned long) size,
            (long long) (end.QuadPart - start.QuadPart));
}
/*
 * Multiply variant: scales every element of the array in place by 123456.
 *
 * size  - number of elements in array.
 * array - buffer to update.
 */
void Test1 (size_t size, int *array)
{
    int *const end = array + size;

    for (int *p = array; p != end; ++p) {
        *p = *p * 123456;
    }
}
/*
 * Branch variant: overwrites every non-zero element with 123456 and leaves
 * zero elements alone.
 *
 * size  - number of elements in array.
 * array - buffer to update.
 */
void Test2 (size_t size, int *array)
{
    int *cursor = array;

    for (size_t remaining = size; remaining != 0; --remaining, ++cursor) {
        if (*cursor != 0) {
            *cursor = 123456;
        }
    }
}
// SIMD variant: processes four ints per iteration without branching on the
// data. Each non-zero element becomes 123456; zero elements stay zero.
//
// Nothing here validates the input. The loop only terminates correctly when
// array_size is a non-zero multiple of 4, and movdqa requires array to be
// 16-byte aligned.
void Test3 (size_t array_size, int *array)
{
__asm
{
mov edi,array
mov ecx, array_size
// esi = one past the last element. ecx is then negated so that
// [esi + ecx * 4] starts at the first element and ecx counts up towards 0.
lea esi, [edi + ecx * 4]
neg ecx
// xmm0 = all zero bits, the comparand for pcmpeqd below.
pxor xmm0, xmm0
movdqa xmm1, [_vec4_123456] ; value of { 123456, 123456, 123456, 123456 }
_replaceloop:
movdqa xmm2, [esi + ecx * 4] ; assumes the array is 16 aligned, make that true
// Advance the index early. The SSE instructions that follow do not modify
// EFLAGS, so the jnz at the bottom still tests the result of this add.
add ecx, 4
// Each 32-bit lane of xmm2 becomes all ones where the element was 0, else 0.
pcmpeqd xmm2, xmm0
// xmm2 = ~xmm2 & xmm1: 123456 where the element was non-zero, 0 otherwise.
pandn xmm2, xmm1
// The -16 undoes the early add so the store hits the lanes just loaded.
movdqa [esi + ecx * 4 - 16], xmm2
jnz _replaceloop
}
}
// Mixed variant: each loop iteration handles 8 ints with SSE compare/mask
// (from the front of the array) and 4 ints with scalar imul (from the
// two-thirds point onwards), interleaving the two instruction streams.
//
// The scalar lanes multiply by 123456 rather than compare-and-replace, so the
// two halves only agree when the input holds 0/1 values.
// NOTE(review): the arithmetic appears to require the element count to be a
// non-zero multiple of 12 so that both streams finish exactly at their
// boundaries - confirm before reusing with other sizes. array must be 16-byte
// aligned for movdqa.
void Test4 (size_t array_size, int *array)
{
// From here on array_size is the number of elements covered by the SSE path
// (two thirds of the total); the remaining third is covered by the imul path.
array_size = array_size * 8 / 12;
__asm
{
mov edi,array
mov ecx,array_size
// esi and edi both point at the two-thirds mark. esi is the fixed base for
// the negatively indexed SSE accesses; edi walks forward over the last third.
lea esi,[edi+ecx*4]
lea edi,[edi+ecx*4]
neg ecx
// edx = 123456 (first element of the vector constant), the scalar multiplier.
mov edx,[_vec4_123456]
// xmm0 = zero comparand, xmm1 = four copies of 123456.
pxor xmm0,xmm0
movdqa xmm1,[_vec4_123456]
replaceloop:
movdqa xmm2,[esi+ecx*4]
mov eax,[edi]
mov ebx,[edi+4]
movdqa xmm3,[esi+ecx*4+16]
add edi,16
// +9 here and -1 from the loop instruction below gives a net step of 8 ints.
add ecx,9
imul eax,edx
// Lanes become all ones where the element was 0.
pcmpeqd xmm2,xmm0
imul ebx,edx
pcmpeqd xmm3,xmm0
mov [edi-16],eax
mov [edi-12],ebx
// ~mask & 123456: 123456 where the element was non-zero, 0 otherwise.
pandn xmm2,xmm1
mov eax,[edi-8]
mov ebx,[edi-4]
pandn xmm3,xmm1
imul eax,edx
// ecx is already 9 ahead, so -36 (9 * 4 bytes) addresses the first vector
// loaded this iteration and -20 the second one, 16 bytes later.
movdqa [esi+ecx*4-36],xmm2
imul ebx,edx
movdqa [esi+ecx*4-20],xmm3
mov [edi-8],eax
mov [edi-4],ebx
// loop decrements ecx and branches while it is non-zero.
loop replaceloop
}
}
/* Entry point: times each variant once, in a fixed order, via Test(). */
int main()
{
    static const struct {
        void (*fn) (size_t, int *);
        char *label;
    } variants [] = {
        { Test1, "Test1 - mul" },
        { Test2, "Test2 - branch" },
        { Test3, "Test3 - simd" },
        { Test4, "Test4 - simdv2" },
    };

    for (size_t v = 0; v < sizeof variants / sizeof variants[0]; ++v) {
        Test (variants[v].fn, variants[v].label);
    }
    return 0;
}
It has four tests: C using an if(), C using a multiply, harold's SIMD version and my SIMD version.
Running it many times (remember, when profiling you should average the results over several runs) there's little difference between all the versions except the branching one which is significantly slower.
This is not very surprising, as the algorithm does very little work for each memory item. What this means is that the real limiting factor is the bandwidth between the CPU and the memory: the CPU is constantly waiting for the memory to catch up, even with the CPU helping by prefetching the data (IA-32 processors detect linear access patterns and prefetch the data ahead of use).